Browse source code

Switch token_infos_ to a ValueStore (#5633)

Split out `TokenInfo` to be able to easily write `using ValueType =
TokenInfo;` on `TokenIndex`. Also fixes a small type issue on
`ValueStore` that affected `mapped_iterator` behavior when writing
`old_tokens_it->first < next_offset`.
Jon Ross-Perkins, 11 months ago
parent
commit
6683cf3b1c

+ 4 - 1
toolchain/base/value_store.h

@@ -156,7 +156,10 @@ class ValueStore
   // for (auto [id, value] : store.enumerate()) { ... }
   // ```
   auto enumerate() const [[clang::lifetimebound]] -> auto {
-    auto index_to_id = [&](int32_t i) -> std::pair<IdT, ConstRefType> {
+    // For `it->val`, writing `const std::pair` is required; otherwise
+    // `mapped_iterator` incorrectly infers the pointer type for `PointerProxy`.
+    // NOLINTNEXTLINE(readability-const-return-type)
+    auto index_to_id = [&](int32_t i) -> const std::pair<IdT, ConstRefType> {
       return std::pair<IdT, ConstRefType>(IdT(i), Get(IdT(i)));
     };
     // Because indices into `ValueStore` are all sequential values from 0, we

+ 13 - 0
toolchain/lex/BUILD

@@ -222,6 +222,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "token_info",
+    hdrs = ["token_info.h"],
+    deps = [
+        ":token_index",
+        ":token_kind",
+        "//common:check",
+        "//toolchain/base:int",
+        "//toolchain/base:value_ids",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
@@ -232,6 +244,7 @@ cc_library(
         ":numeric_literal",
         ":string_literal",
         ":token_index",
+        ":token_info",
         ":token_kind",
         "//common:check",
         "//common:ostream",

+ 27 - 26
toolchain/lex/lex.cpp

@@ -53,8 +53,6 @@ namespace Carbon::Lex {
 // `TokenizedBuffer` or undermining the performance constraints of the lexer.
 class [[clang::internal_linkage]] Lexer {
  public:
-  using TokenInfo = TokenizedBuffer::TokenInfo;
-
   // Symbolic result of a lexing action. This indicates whether we successfully
   // lexed a token, or whether other lexing actions should be attempted.
   //
@@ -1225,10 +1223,10 @@ auto Lexer::LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
   TokenIndex token =
       LexTokenWithPayload(kind, opening_token.index, byte_offset);
 
-  auto& opening_token_info = buffer_.GetTokenInfo(opening_token);
+  auto& opening_token_info = buffer_.token_infos_.Get(opening_token);
   if (LLVM_UNLIKELY(opening_token_info.kind() != kind.opening_symbol())) {
     has_mismatched_brackets_ = true;
-    buffer_.GetTokenInfo(token).set_opening_token_index(TokenIndex::None);
+    buffer_.token_infos_.Get(token).set_opening_token_index(TokenIndex::None);
     return token;
   }
 
@@ -1376,7 +1374,8 @@ auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
 
   // Look for the `r` token. Note that this is always in bounds because we
   // create a start of file token.
-  auto& prev_token_info = buffer_.token_infos_.back();
+  auto& prev_token_info =
+      buffer_.token_infos_.Get(TokenIndex(buffer_.token_infos_.size() - 1));
 
   // If the previous token isn't the identifier `r`, or the character after `#`
   // isn't the start of an identifier, this is not a raw identifier.
@@ -1534,7 +1533,7 @@ class Lexer::ErrorRecoveryBuffer {
     // Find the end of the token before the target token, and add the new token
     // there.
     TokenIndex insert_after(insert_before.index - 1);
-    const auto& prev_info = buffer_->GetTokenInfo(insert_after);
+    const auto& prev_info = buffer_->token_infos_.Get(insert_after);
     int32_t byte_offset =
         prev_info.byte_offset() + buffer_->GetTokenText(insert_after).size();
     new_tokens_.push_back(
@@ -1544,7 +1543,7 @@ class Lexer::ErrorRecoveryBuffer {
   // Replace the given token with an error token. We do this immediately,
   // because we don't benefit from buffering it.
   auto ReplaceWithError(TokenIndex token) -> void {
-    auto& token_info = buffer_->GetTokenInfo(token);
+    auto& token_info = buffer_->token_infos_.Get(token);
     int error_length = buffer_->GetTokenText(token).size();
     token_info.ResetAsError(error_length);
     any_error_tokens_ = true;
@@ -1552,22 +1551,24 @@ class Lexer::ErrorRecoveryBuffer {
 
   // Merge the recovery tokens into the token list of the tokenized buffer.
   auto Apply() -> void {
-    auto old_tokens = std::move(buffer_->token_infos_);
-    buffer_->token_infos_.clear();
+    ValueStore<TokenIndex> old_tokens =
+        std::exchange(buffer_->token_infos_, {});
     int new_size = old_tokens.size() + new_tokens_.size();
-    buffer_->token_infos_.reserve(new_size);
+    buffer_->token_infos_.Reserve(new_size);
     buffer_->recovery_tokens_.resize(new_size);
 
-    int old_tokens_offset = 0;
+    auto old_tokens_range = old_tokens.enumerate();
+    auto old_tokens_it = old_tokens_range.begin();
     for (auto [next_offset, info] : new_tokens_) {
-      buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                   old_tokens.begin() + next_offset.index);
+      for (; old_tokens_it->first < next_offset; ++old_tokens_it) {
+        buffer_->token_infos_.Add(old_tokens_it->second);
+      }
       buffer_->AddToken(info);
       buffer_->recovery_tokens_.set(next_offset.index);
-      old_tokens_offset = next_offset.index;
     }
-    buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                 old_tokens.end());
+    for (; old_tokens_it != old_tokens_range.end(); ++old_tokens_it) {
+      buffer_->token_infos_.Add(old_tokens_it->second);
+    }
   }
 
   // Perform bracket matching to fix cross-references between tokens. This must
@@ -1583,12 +1584,12 @@ class Lexer::ErrorRecoveryBuffer {
         CARBON_CHECK(!open_groups.empty(), "Failed to balance brackets");
         auto opening_token = open_groups.pop_back_val();
 
-        CARBON_CHECK(
-            kind ==
-                buffer_->GetTokenInfo(opening_token).kind().closing_symbol(),
-            "Failed to balance brackets");
-        auto& opening_token_info = buffer_->GetTokenInfo(opening_token);
-        auto& closing_token_info = buffer_->GetTokenInfo(token);
+        CARBON_CHECK(kind == buffer_->token_infos_.Get(opening_token)
+                                 .kind()
+                                 .closing_symbol(),
+                     "Failed to balance brackets");
+        auto& opening_token_info = buffer_->token_infos_.Get(opening_token);
+        auto& closing_token_info = buffer_->token_infos_.Get(token);
         opening_token_info.set_closing_token_index(token);
         closing_token_info.set_opening_token_index(opening_token);
       }
@@ -1601,8 +1602,7 @@ class Lexer::ErrorRecoveryBuffer {
   // A list of tokens to insert into the token stream to fix mismatched
   // brackets. The first element in each pair is the original token index to
   // insert the new token before.
-  llvm::SmallVector<std::pair<TokenIndex, TokenizedBuffer::TokenInfo>>
-      new_tokens_;
+  llvm::SmallVector<std::pair<TokenIndex, TokenInfo>> new_tokens_;
 
   // Whether we have changed any tokens into error tokens.
   bool any_error_tokens_ = false;
@@ -1652,8 +1652,9 @@ auto Lexer::DiagnoseAndFixMismatchedBrackets() -> void {
     // Find the innermost matching opening symbol.
     auto opening_it = llvm::find_if(
         llvm::reverse(open_groups_), [&](TokenIndex opening_token) {
-          return buffer_.GetTokenInfo(opening_token).kind().closing_symbol() ==
-                 kind;
+          return buffer_.token_infos_.Get(opening_token)
+                     .kind()
+                     .closing_symbol() == kind;
         });
     if (opening_it == open_groups_.rend()) {
       CARBON_DIAGNOSTIC(

+ 4 - 0
toolchain/lex/token_index.h

@@ -10,6 +10,8 @@
 
 namespace Carbon::Lex {
 
+class TokenInfo;
+
 // A lightweight handle to a lexed token in a `TokenizedBuffer`.
 //
 // `TokenIndex` objects are designed to be passed by value, not reference or
@@ -24,6 +26,8 @@ namespace Carbon::Lex {
 //
 // All other APIs to query a `TokenIndex` are on the `TokenizedBuffer`.
 struct TokenIndex : public IndexBase<TokenIndex> {
+  using ValueType = TokenInfo;
+
   // The number of bits which must be allotted for `TokenIndex`.
   static constexpr int Bits = 23;
   // The maximum number of tokens that can be stored, including the FileStart

+ 181 - 0
toolchain/lex/token_info.h

@@ -0,0 +1,181 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
+#define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
+
+#include "common/check.h"
+#include "toolchain/base/int.h"
+#include "toolchain/base/value_ids.h"
+#include "toolchain/lex/token_index.h"
+#include "toolchain/lex/token_kind.h"
+
+namespace Carbon::Lex {
+
+// Storage for the information about a specific token, as an implementation
+// detail of `TokenizedBuffer`.
+//
+// This provides a friendly accessor API to the carefully space-optimized
+// storage model of the information we associated with each token.
+//
+// There are four pieces of information stored here:
+// - The kind of the token.
+// - Whether that token has leading whitespace before it.
+// - A kind-specific payload that can be compressed into a small integer.
+//   - This class provides dedicated accessors for each different form of
+//     payload that check the kind and payload correspond correctly.
+// - A 32-bit byte offset of the token within the source text.
+//
+// These are compressed and stored in 8-bytes for each token.
+//
+// Note that while the class provides some limited setters for payloads and
+// mutating methods, setters on this type may be unexpectedly expensive due to
+// the bit-packed representation and should be avoided. As such, only the
+// minimal necessary setters are provided.
+//
+// TODO: It might be worth considering a struct-of-arrays data layout in order
+// to move the byte offset to a separate array from the rest as it is only hot
+// during lexing, and then cold during parsing and semantic analysis. However,
+// a trivial approach to that adds more overhead than it saves due to tracking
+// two separate vectors and their growth. Making this profitable would likely
+// at least require a highly specialized single vector that manages the growth
+// once and then provides separate storage areas for the two arrays.
+class TokenInfo {
+ public:
+  // The kind for this token.
+  auto kind() const -> TokenKind { return kind_; }
+
+  // Whether this token is preceded by whitespace. We only store the preceding
+  // state, and look at the next token to check for trailing whitespace.
+  auto has_leading_space() const -> bool { return has_leading_space_; }
+
+  // A collection of methods to access the specific payload included with
+  // particular kinds of tokens. Only the specific payload accessor below may
+  // be used for an info entry of a token with a particular kind, and these
+  // check that the kind is valid. Some tokens do not include a payload at all
+  // and none of these methods may be called.
+  auto ident_id() const -> IdentifierId {
+    CARBON_DCHECK(kind() == TokenKind::Identifier);
+    return IdentifierId(token_payload_);
+  }
+  auto set_ident_id(IdentifierId ident_id) -> void {
+    CARBON_DCHECK(kind() == TokenKind::Identifier);
+    token_payload_ = ident_id.index;
+  }
+
+  auto string_literal_id() const -> StringLiteralValueId {
+    CARBON_DCHECK(kind() == TokenKind::StringLiteral);
+    return StringLiteralValueId(token_payload_);
+  }
+
+  auto int_id() const -> IntId {
+    CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
+                  kind() == TokenKind::IntTypeLiteral ||
+                  kind() == TokenKind::UnsignedIntTypeLiteral ||
+                  kind() == TokenKind::FloatTypeLiteral);
+    return IntId::MakeFromTokenPayload(token_payload_);
+  }
+
+  auto real_id() const -> RealId {
+    CARBON_DCHECK(kind() == TokenKind::RealLiteral);
+    return RealId(token_payload_);
+  }
+
+  auto closing_token_index() const -> TokenIndex {
+    CARBON_DCHECK(kind().is_opening_symbol());
+    return TokenIndex(token_payload_);
+  }
+  auto set_closing_token_index(TokenIndex closing_index) -> void {
+    CARBON_DCHECK(kind().is_opening_symbol());
+    token_payload_ = closing_index.index;
+  }
+
+  auto opening_token_index() const -> TokenIndex {
+    CARBON_DCHECK(kind().is_closing_symbol());
+    return TokenIndex(token_payload_);
+  }
+  auto set_opening_token_index(TokenIndex opening_index) -> void {
+    CARBON_DCHECK(kind().is_closing_symbol());
+    token_payload_ = opening_index.index;
+  }
+
+  auto error_length() const -> int {
+    CARBON_DCHECK(kind() == TokenKind::Error);
+    return token_payload_;
+  }
+
+  // Zero-based byte offset of the token within the file. This can be combined
+  // with the buffer's line information to locate the line and column of the
+  // token as well.
+  auto byte_offset() const -> int32_t { return byte_offset_; }
+
+  // Transforms the token into an error token of the given length but at its
+  // original position and with the same whitespace adjacency.
+  auto ResetAsError(int error_length) -> void {
+    // Construct a fresh token to establish any needed invariants and replace
+    // this token with it.
+    TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
+                    byte_offset());
+    *this = error;
+  }
+
+ private:
+  friend class Lexer;
+
+  static constexpr int PayloadBits = 23;
+
+  // Make sure we have enough payload bits to represent token-associated IDs.
+  static_assert(PayloadBits >= IntId::TokenIdBits);
+  static_assert(PayloadBits >= TokenIndex::Bits);
+
+  // Constructor for a TokenKind that carries no payload, or where the payload
+  // will be set later.
+  //
+  // Only used by the lexer which enforces only the correct kinds are used.
+  //
+  // When the payload is not being set, we leave it uninitialized. At least in
+  // some cases, this will allow MSan to correctly detect erroneous attempts
+  // to access the payload, as it works to track uninitialized memory
+  // bit-for-bit specifically to handle complex cases like bitfields.
+  TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
+      : kind_(kind),
+        has_leading_space_(has_leading_space),
+        byte_offset_(byte_offset) {}
+
+  // Constructor for a TokenKind that carries a payload.
+  //
+  // Only used by the lexer which enforces the correct kind and payload types.
+  TokenInfo(TokenKind kind, bool has_leading_space, int payload,
+            int32_t byte_offset)
+      : kind_(kind),
+        has_leading_space_(has_leading_space),
+        token_payload_(payload),
+        byte_offset_(byte_offset) {}
+
+  // A bitfield that encodes the token's kind, the leading space flag, and the
+  // remaining bits in a payload. These are encoded together as a bitfield for
+  // density and because these are the hottest fields of tokens for consumers
+  // after lexing.
+  //
+  // Payload values are typically ID types for which we create at most one per
+  // token, so we ensure that `token_payload_` is large enough to fit any
+  // token index. Stores to this field may overflow, but we produce an error
+  // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
+  // so this value never overflows if lexing succeeds.
+  TokenKind kind_;
+  static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
+  bool has_leading_space_ : 1;
+  unsigned token_payload_ : PayloadBits;
+
+  // Separate storage for the byte offset, this is hot while lexing but then
+  // generally cold.
+  int32_t byte_offset_;
+};
+
+static_assert(sizeof(TokenInfo) == 8,
+              "Expected `TokenInfo` to pack to an 8-byte structure.");
+
+}  // namespace Carbon::Lex
+
+#endif  // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_

+ 13 - 13
toolchain/lex/tokenized_buffer.cpp

@@ -24,7 +24,7 @@
 namespace Carbon::Lex {
 
 auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
-  return FindLineIndex(GetTokenInfo(token).byte_offset());
+  return FindLineIndex(token_infos_.Get(token).byte_offset());
 }
 
 auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
@@ -32,7 +32,7 @@ auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
 }
 
 auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   const auto& line_info =
       line_infos_.Get(FindLineIndex(token_info.byte_offset()));
   return token_info.byte_offset() - line_info.start + 1;
@@ -58,7 +58,7 @@ auto TokenizedBuffer::GetEndLoc(TokenIndex token) const
 }
 
 auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   llvm::StringRef fixed_spelling = token_info.kind().fixed_spelling();
   if (!fixed_spelling.empty()) {
     return fixed_spelling;
@@ -109,21 +109,21 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
 }
 
 auto TokenizedBuffer::GetIdentifier(TokenIndex token) const -> IdentifierId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::Identifier, "{0}",
                token_info.kind());
   return token_info.ident_id();
 }
 
 auto TokenizedBuffer::GetIntLiteral(TokenIndex token) const -> IntId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::IntLiteral, "{0}",
                token_info.kind());
   return token_info.int_id();
 }
 
 auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::RealLiteral, "{0}",
                token_info.kind());
   return token_info.real_id();
@@ -131,14 +131,14 @@ auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
 
 auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
     -> StringLiteralValueId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::StringLiteral, "{0}",
                token_info.kind());
   return token_info.string_literal_id();
 }
 
 auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind().is_sized_type_literal(), "{0}",
                token_info.kind());
   return token_info.int_id();
@@ -146,7 +146,7 @@ auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
 
 auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
     -> TokenIndex {
-  const auto& opening_token_info = GetTokenInfo(opening_token);
+  const auto& opening_token_info = token_infos_.Get(opening_token);
   CARBON_CHECK(opening_token_info.kind().is_opening_symbol(), "{0}",
                opening_token_info.kind());
   return opening_token_info.closing_token_index();
@@ -154,7 +154,7 @@ auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
 
 auto TokenizedBuffer::GetMatchedOpeningToken(TokenIndex closing_token) const
     -> TokenIndex {
-  const auto& closing_token_info = GetTokenInfo(closing_token);
+  const auto& closing_token_info = token_infos_.Get(closing_token);
   CARBON_CHECK(closing_token_info.kind().is_closing_symbol(), "{0}",
                closing_token_info.kind());
   return closing_token_info.opening_token_index();
@@ -246,7 +246,7 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
     -> void {
   widths.Widen(GetTokenPrintWidths(token));
   int token_index = token.index;
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   LineIndex line_index = FindLineIndex(token_info.byte_offset());
   llvm::StringRef token_text = GetTokenText(token);
 
@@ -338,7 +338,7 @@ auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
 auto TokenizedBuffer::IsAfterComment(TokenIndex token,
                                      CommentIndex comment_index) const -> bool {
   const auto& comment_data = comments_.Get(comment_index);
-  return GetTokenInfo(token).byte_offset() > comment_data.start;
+  return token_infos_.Get(token).byte_offset() > comment_data.start;
 }
 
 auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
@@ -411,7 +411,7 @@ auto TokenizedBuffer::TokenToDiagnosticLoc(TokenIndex token) const
     -> Diagnostics::ConvertedLoc {
   // Map the token location into a position within the source buffer.
   const char* token_start =
-      source_->text().begin() + GetTokenInfo(token).byte_offset();
+      source_->text().begin() + token_infos_.Get(token).byte_offset();
 
   // Find the corresponding file location.
   // TODO: Should we somehow indicate in the diagnostic location if this token

+ 10 - 180
toolchain/lex/tokenized_buffer.h

@@ -19,6 +19,7 @@
 #include "toolchain/base/shared_value_stores.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/lex/token_index.h"
+#include "toolchain/lex/token_info.h"
 #include "toolchain/lex/token_kind.h"
 #include "toolchain/source/source_buffer.h"
 
@@ -177,7 +178,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto GetIndentColumnNumber(LineIndex line) const -> int;
 
   auto GetByteOffset(TokenIndex token) const -> int32_t {
-    return GetTokenInfo(token).byte_offset();
+    return token_infos_.Get(token).byte_offset();
   }
 
   // Returns true if the token comes after the comment.
@@ -299,167 +300,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     int indent;
   };
 
-  // Storage for the information about a specific token in the buffer.
-  //
-  // This provides a friendly accessor API to the carefully space-optimized
-  // storage model of the information we associated with each token.
-  //
-  // There are four pieces of information stored here:
-  // - The kind of the token.
-  // - Whether that token has leading whitespace before it.
-  // - A kind-specific payload that can be compressed into a small integer.
-  //   - This class provides dedicated accessors for each different form of
-  //     payload that check the kind and payload correspond correctly.
-  // - A 32-bit byte offset of the token within the source text.
-  //
-  // These are compressed and stored in 8-bytes for each token.
-  //
-  // Note that while the class provides some limited setters for payloads and
-  // mutating methods, setters on this type may be unexpectedly expensive due to
-  // the bit-packed representation and should be avoided. As such, only the
-  // minimal necessary setters are provided.
-  //
-  // TODO: It might be worth considering a struct-of-arrays data layout in order
-  // to move the byte offset to a separate array from the rest as it is only hot
-  // during lexing, and then cold during parsing and semantic analysis. However,
-  // a trivial approach to that adds more overhead than it saves due to tracking
-  // two separate vectors and their growth. Making this profitable would likely
-  // at least require a highly specialized single vector that manages the growth
-  // once and then provides separate storage areas for the two arrays.
-  class TokenInfo {
-   public:
-    // The kind for this token.
-    auto kind() const -> TokenKind { return kind_; }
-
-    // Whether this token is preceded by whitespace. We only store the preceding
-    // state, and look at the next token to check for trailing whitespace.
-    auto has_leading_space() const -> bool { return has_leading_space_; }
-
-    // A collection of methods to access the specific payload included with
-    // particular kinds of tokens. Only the specific payload accessor below may
-    // be used for an info entry of a token with a particular kind, and these
-    // check that the kind is valid. Some tokens do not include a payload at all
-    // and none of these methods may be called.
-    auto ident_id() const -> IdentifierId {
-      CARBON_DCHECK(kind() == TokenKind::Identifier);
-      return IdentifierId(token_payload_);
-    }
-    auto set_ident_id(IdentifierId ident_id) -> void {
-      CARBON_DCHECK(kind() == TokenKind::Identifier);
-      token_payload_ = ident_id.index;
-    }
-
-    auto string_literal_id() const -> StringLiteralValueId {
-      CARBON_DCHECK(kind() == TokenKind::StringLiteral);
-      return StringLiteralValueId(token_payload_);
-    }
-
-    auto int_id() const -> IntId {
-      CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
-                    kind() == TokenKind::IntTypeLiteral ||
-                    kind() == TokenKind::UnsignedIntTypeLiteral ||
-                    kind() == TokenKind::FloatTypeLiteral);
-      return IntId::MakeFromTokenPayload(token_payload_);
-    }
-
-    auto real_id() const -> RealId {
-      CARBON_DCHECK(kind() == TokenKind::RealLiteral);
-      return RealId(token_payload_);
-    }
-
-    auto closing_token_index() const -> TokenIndex {
-      CARBON_DCHECK(kind().is_opening_symbol());
-      return TokenIndex(token_payload_);
-    }
-    auto set_closing_token_index(TokenIndex closing_index) -> void {
-      CARBON_DCHECK(kind().is_opening_symbol());
-      token_payload_ = closing_index.index;
-    }
-
-    auto opening_token_index() const -> TokenIndex {
-      CARBON_DCHECK(kind().is_closing_symbol());
-      return TokenIndex(token_payload_);
-    }
-    auto set_opening_token_index(TokenIndex opening_index) -> void {
-      CARBON_DCHECK(kind().is_closing_symbol());
-      token_payload_ = opening_index.index;
-    }
-
-    auto error_length() const -> int {
-      CARBON_DCHECK(kind() == TokenKind::Error);
-      return token_payload_;
-    }
-
-    // Zero-based byte offset of the token within the file. This can be combined
-    // with the buffer's line information to locate the line and column of the
-    // token as well.
-    auto byte_offset() const -> int32_t { return byte_offset_; }
-
-    // Transforms the token into an error token of the given length but at its
-    // original position and with the same whitespace adjacency.
-    auto ResetAsError(int error_length) -> void {
-      // Construct a fresh token to establish any needed invariants and replace
-      // this token with it.
-      TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
-                      byte_offset());
-      *this = error;
-    }
-
-   private:
-    friend class Lexer;
-
-    static constexpr int PayloadBits = 23;
-
-    // Make sure we have enough payload bits to represent token-associated IDs.
-    static_assert(PayloadBits >= IntId::TokenIdBits);
-    static_assert(PayloadBits >= TokenIndex::Bits);
-
-    // Constructor for a TokenKind that carries no payload, or where the payload
-    // will be set later.
-    //
-    // Only used by the lexer which enforces only the correct kinds are used.
-    //
-    // When the payload is not being set, we leave it uninitialized. At least in
-    // some cases, this will allow MSan to correctly detect erroneous attempts
-    // to access the payload, as it works to track uninitialized memory
-    // bit-for-bit specifically to handle complex cases like bitfields.
-    TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
-        : kind_(kind),
-          has_leading_space_(has_leading_space),
-          byte_offset_(byte_offset) {}
-
-    // Constructor for a TokenKind that carries a payload.
-    //
-    // Only used by the lexer which enforces the correct kind and payload types.
-    TokenInfo(TokenKind kind, bool has_leading_space, int payload,
-              int32_t byte_offset)
-        : kind_(kind),
-          has_leading_space_(has_leading_space),
-          token_payload_(payload),
-          byte_offset_(byte_offset) {}
-
-    // A bitfield that encodes the token's kind, the leading space flag, and the
-    // remaining bits in a payload. These are encoded together as a bitfield for
-    // density and because these are the hottest fields of tokens for consumers
-    // after lexing.
-    //
-    // Payload values are typically ID types for which we create at most one per
-    // token, so we ensure that `token_payload_` is large enough to fit any
-    // token index. Stores to this field may overflow, but we produce an error
-    // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
-    // so this value never overflows if lexing succeeds.
-    TokenKind kind_;
-    static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
-    bool has_leading_space_ : 1;
-    unsigned token_payload_ : PayloadBits;
-
-    // Separate storage for the byte offset, this is hot while lexing but then
-    // generally cold.
-    int32_t byte_offset_;
-  };
-  static_assert(sizeof(TokenInfo) == 8,
-                "Expected `TokenInfo` to pack to an 8-byte structure.");
-
   // The constructor is merely responsible for trivial initialization of
   // members. A working object of this type is built with `Lex::Lex` so that its
   // return can indicate if an error was encountered while lexing.
@@ -469,9 +309,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
       : value_stores_(&value_stores), source_(&source) {}
 
   auto FindLineIndex(int32_t byte_offset) const -> LineIndex;
-  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
-  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
+
+  // Adds the token and adjusts the expected tree size.
   auto AddToken(TokenInfo info) -> TokenIndex;
+
   auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
   auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                   PrintWidths widths) const -> void;
@@ -486,7 +327,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   SharedValueStores* value_stores_;
   SourceBuffer* source_;
 
-  llvm::SmallVector<TokenInfo> token_infos_;
+  ValueStore<TokenIndex> token_infos_;
 
   ValueStore<LineIndex> line_infos_;
 
@@ -514,35 +355,24 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 };
 
 inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
-  return GetTokenInfo(token).kind();
+  return token_infos_.Get(token).kind();
 }
 
 inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
     -> bool {
-  return GetTokenInfo(token).has_leading_space();
+  return token_infos_.Get(token).has_leading_space();
 }
 
 inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
     -> bool {
   TokenIterator it(token);
   ++it;
-  return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
-}
-
-inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
-  return token_infos_[token.index];
-}
-
-inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const
-    -> const TokenInfo& {
-  return token_infos_[token.index];
+  return it != tokens().end() && token_infos_.Get(*it).has_leading_space();
 }
 
 inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
-  TokenIndex index(token_infos_.size());
-  token_infos_.push_back(info);
   expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
-  return index;
+  return token_infos_.Add(info);
 }
 
 }  // namespace Carbon::Lex