Browse source code

Switch token_infos_ to a ValueStore (#5633)

Split out `TokenInfo` to be able to easily write `using ValueType =
TokenInfo;` on `TokenIndex`. Also fixes a small type issue on
`ValueStore` that affected `mapped_iterator` behavior when writing
`old_tokens_it->first < next_offset`.
Jon Ross-Perkins, 11 months ago
parent
commit
6683cf3b1c

+ 4 - 1
toolchain/base/value_store.h

@@ -156,7 +156,10 @@ class ValueStore
   // for (auto [id, value] : store.enumerate()) { ... }
   // ```
   auto enumerate() const [[clang::lifetimebound]] -> auto {
-    auto index_to_id = [&](int32_t i) -> std::pair<IdT, ConstRefType> {
+    // For `it->val`, writing `const std::pair` is required; otherwise
+    // `mapped_iterator` incorrectly infers the pointer type for `PointerProxy`.
+    // NOLINTNEXTLINE(readability-const-return-type)
+    auto index_to_id = [&](int32_t i) -> const std::pair<IdT, ConstRefType> {
       return std::pair<IdT, ConstRefType>(IdT(i), Get(IdT(i)));
     };
     // Because indices into `ValueStore` are all sequential values from 0, we

+ 13 - 0
toolchain/lex/BUILD

@@ -222,6 +222,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "token_info",
+    hdrs = ["token_info.h"],
+    deps = [
+        ":token_index",
+        ":token_kind",
+        "//common:check",
+        "//toolchain/base:int",
+        "//toolchain/base:value_ids",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
@@ -232,6 +244,7 @@ cc_library(
         ":numeric_literal",
         ":string_literal",
         ":token_index",
+        ":token_info",
         ":token_kind",
         "//common:check",
         "//common:ostream",

+ 27 - 26
toolchain/lex/lex.cpp

@@ -53,8 +53,6 @@ namespace Carbon::Lex {
 // `TokenizedBuffer` or undermining the performance constraints of the lexer.
 class [[clang::internal_linkage]] Lexer {
  public:
-  using TokenInfo = TokenizedBuffer::TokenInfo;
-
   // Symbolic result of a lexing action. This indicates whether we successfully
   // lexed a token, or whether other lexing actions should be attempted.
   //
@@ -1225,10 +1223,10 @@ auto Lexer::LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
   TokenIndex token =
       LexTokenWithPayload(kind, opening_token.index, byte_offset);
 
-  auto& opening_token_info = buffer_.GetTokenInfo(opening_token);
+  auto& opening_token_info = buffer_.token_infos_.Get(opening_token);
   if (LLVM_UNLIKELY(opening_token_info.kind() != kind.opening_symbol())) {
     has_mismatched_brackets_ = true;
-    buffer_.GetTokenInfo(token).set_opening_token_index(TokenIndex::None);
+    buffer_.token_infos_.Get(token).set_opening_token_index(TokenIndex::None);
     return token;
   }
 
@@ -1376,7 +1374,8 @@ auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
 
   // Look for the `r` token. Note that this is always in bounds because we
   // create a start of file token.
-  auto& prev_token_info = buffer_.token_infos_.back();
+  auto& prev_token_info =
+      buffer_.token_infos_.Get(TokenIndex(buffer_.token_infos_.size() - 1));
 
   // If the previous token isn't the identifier `r`, or the character after `#`
   // isn't the start of an identifier, this is not a raw identifier.
@@ -1534,7 +1533,7 @@ class Lexer::ErrorRecoveryBuffer {
     // Find the end of the token before the target token, and add the new token
     // there.
     TokenIndex insert_after(insert_before.index - 1);
-    const auto& prev_info = buffer_->GetTokenInfo(insert_after);
+    const auto& prev_info = buffer_->token_infos_.Get(insert_after);
     int32_t byte_offset =
         prev_info.byte_offset() + buffer_->GetTokenText(insert_after).size();
     new_tokens_.push_back(
@@ -1544,7 +1543,7 @@ class Lexer::ErrorRecoveryBuffer {
   // Replace the given token with an error token. We do this immediately,
   // because we don't benefit from buffering it.
   auto ReplaceWithError(TokenIndex token) -> void {
-    auto& token_info = buffer_->GetTokenInfo(token);
+    auto& token_info = buffer_->token_infos_.Get(token);
     int error_length = buffer_->GetTokenText(token).size();
     token_info.ResetAsError(error_length);
     any_error_tokens_ = true;
@@ -1552,22 +1551,24 @@ class Lexer::ErrorRecoveryBuffer {
 
   // Merge the recovery tokens into the token list of the tokenized buffer.
   auto Apply() -> void {
-    auto old_tokens = std::move(buffer_->token_infos_);
-    buffer_->token_infos_.clear();
+    ValueStore<TokenIndex> old_tokens =
+        std::exchange(buffer_->token_infos_, {});
     int new_size = old_tokens.size() + new_tokens_.size();
-    buffer_->token_infos_.reserve(new_size);
+    buffer_->token_infos_.Reserve(new_size);
     buffer_->recovery_tokens_.resize(new_size);
 
-    int old_tokens_offset = 0;
+    auto old_tokens_range = old_tokens.enumerate();
+    auto old_tokens_it = old_tokens_range.begin();
     for (auto [next_offset, info] : new_tokens_) {
-      buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                   old_tokens.begin() + next_offset.index);
+      for (; old_tokens_it->first < next_offset; ++old_tokens_it) {
+        buffer_->token_infos_.Add(old_tokens_it->second);
+      }
       buffer_->AddToken(info);
       buffer_->recovery_tokens_.set(next_offset.index);
-      old_tokens_offset = next_offset.index;
     }
-    buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                 old_tokens.end());
+    for (; old_tokens_it != old_tokens_range.end(); ++old_tokens_it) {
+      buffer_->token_infos_.Add(old_tokens_it->second);
+    }
   }
 
   // Perform bracket matching to fix cross-references between tokens. This must
@@ -1583,12 +1584,12 @@ class Lexer::ErrorRecoveryBuffer {
         CARBON_CHECK(!open_groups.empty(), "Failed to balance brackets");
         auto opening_token = open_groups.pop_back_val();
 
-        CARBON_CHECK(
-            kind ==
-                buffer_->GetTokenInfo(opening_token).kind().closing_symbol(),
-            "Failed to balance brackets");
-        auto& opening_token_info = buffer_->GetTokenInfo(opening_token);
-        auto& closing_token_info = buffer_->GetTokenInfo(token);
+        CARBON_CHECK(kind == buffer_->token_infos_.Get(opening_token)
+                                 .kind()
+                                 .closing_symbol(),
+                     "Failed to balance brackets");
+        auto& opening_token_info = buffer_->token_infos_.Get(opening_token);
+        auto& closing_token_info = buffer_->token_infos_.Get(token);
         opening_token_info.set_closing_token_index(token);
         closing_token_info.set_opening_token_index(opening_token);
       }
@@ -1601,8 +1602,7 @@ class Lexer::ErrorRecoveryBuffer {
   // A list of tokens to insert into the token stream to fix mismatched
   // brackets. The first element in each pair is the original token index to
   // insert the new token before.
-  llvm::SmallVector<std::pair<TokenIndex, TokenizedBuffer::TokenInfo>>
-      new_tokens_;
+  llvm::SmallVector<std::pair<TokenIndex, TokenInfo>> new_tokens_;
 
   // Whether we have changed any tokens into error tokens.
   bool any_error_tokens_ = false;
@@ -1652,8 +1652,9 @@ auto Lexer::DiagnoseAndFixMismatchedBrackets() -> void {
     // Find the innermost matching opening symbol.
     auto opening_it = llvm::find_if(
         llvm::reverse(open_groups_), [&](TokenIndex opening_token) {
-          return buffer_.GetTokenInfo(opening_token).kind().closing_symbol() ==
-                 kind;
+          return buffer_.token_infos_.Get(opening_token)
+                     .kind()
+                     .closing_symbol() == kind;
         });
     if (opening_it == open_groups_.rend()) {
       CARBON_DIAGNOSTIC(

+ 4 - 0
toolchain/lex/token_index.h

@@ -10,6 +10,8 @@
 
 namespace Carbon::Lex {
 
+class TokenInfo;
+
 // A lightweight handle to a lexed token in a `TokenizedBuffer`.
 //
 // `TokenIndex` objects are designed to be passed by value, not reference or
@@ -24,6 +26,8 @@ namespace Carbon::Lex {
 //
 // All other APIs to query a `TokenIndex` are on the `TokenizedBuffer`.
 struct TokenIndex : public IndexBase<TokenIndex> {
+  using ValueType = TokenInfo;
+
   // The number of bits which must be allotted for `TokenIndex`.
   static constexpr int Bits = 23;
   // The maximum number of tokens that can be stored, including the FileStart

+ 181 - 0
toolchain/lex/token_info.h

@@ -0,0 +1,181 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
+#define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
+
+#include "common/check.h"
+#include "toolchain/base/int.h"
+#include "toolchain/base/value_ids.h"
+#include "toolchain/lex/token_index.h"
+#include "toolchain/lex/token_kind.h"
+
+namespace Carbon::Lex {
+
+// Storage for the information about a specific token, as an implementation
+// detail of `TokenizedBuffer`.
+//
+// This provides a friendly accessor API to the carefully space-optimized
+// storage model of the information we associated with each token.
+//
+// There are four pieces of information stored here:
+// - The kind of the token.
+// - Whether that token has leading whitespace before it.
+// - A kind-specific payload that can be compressed into a small integer.
+//   - This class provides dedicated accessors for each different form of
+//     payload that check the kind and payload correspond correctly.
+// - A 32-bit byte offset of the token within the source text.
+//
+// These are compressed and stored in 8-bytes for each token.
+//
+// Note that while the class provides some limited setters for payloads and
+// mutating methods, setters on this type may be unexpectedly expensive due to
+// the bit-packed representation and should be avoided. As such, only the
+// minimal necessary setters are provided.
+//
+// TODO: It might be worth considering a struct-of-arrays data layout in order
+// to move the byte offset to a separate array from the rest as it is only hot
+// during lexing, and then cold during parsing and semantic analysis. However,
+// a trivial approach to that adds more overhead than it saves due to tracking
+// two separate vectors and their growth. Making this profitable would likely
+// at least require a highly specialized single vector that manages the growth
+// once and then provides separate storage areas for the two arrays.
+class TokenInfo {
+ public:
+  // The kind for this token.
+  auto kind() const -> TokenKind { return kind_; }
+
+  // Whether this token is preceded by whitespace. We only store the preceding
+  // state, and look at the next token to check for trailing whitespace.
+  auto has_leading_space() const -> bool { return has_leading_space_; }
+
+  // A collection of methods to access the specific payload included with
+  // particular kinds of tokens. Only the specific payload accessor below may
+  // be used for an info entry of a token with a particular kind, and these
+  // check that the kind is valid. Some tokens do not include a payload at all
+  // and none of these methods may be called.
+  auto ident_id() const -> IdentifierId {
+    CARBON_DCHECK(kind() == TokenKind::Identifier);
+    return IdentifierId(token_payload_);
+  }
+  auto set_ident_id(IdentifierId ident_id) -> void {
+    CARBON_DCHECK(kind() == TokenKind::Identifier);
+    token_payload_ = ident_id.index;
+  }
+
+  auto string_literal_id() const -> StringLiteralValueId {
+    CARBON_DCHECK(kind() == TokenKind::StringLiteral);
+    return StringLiteralValueId(token_payload_);
+  }
+
+  auto int_id() const -> IntId {
+    CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
+                  kind() == TokenKind::IntTypeLiteral ||
+                  kind() == TokenKind::UnsignedIntTypeLiteral ||
+                  kind() == TokenKind::FloatTypeLiteral);
+    return IntId::MakeFromTokenPayload(token_payload_);
+  }
+
+  auto real_id() const -> RealId {
+    CARBON_DCHECK(kind() == TokenKind::RealLiteral);
+    return RealId(token_payload_);
+  }
+
+  auto closing_token_index() const -> TokenIndex {
+    CARBON_DCHECK(kind().is_opening_symbol());
+    return TokenIndex(token_payload_);
+  }
+  auto set_closing_token_index(TokenIndex closing_index) -> void {
+    CARBON_DCHECK(kind().is_opening_symbol());
+    token_payload_ = closing_index.index;
+  }
+
+  auto opening_token_index() const -> TokenIndex {
+    CARBON_DCHECK(kind().is_closing_symbol());
+    return TokenIndex(token_payload_);
+  }
+  auto set_opening_token_index(TokenIndex opening_index) -> void {
+    CARBON_DCHECK(kind().is_closing_symbol());
+    token_payload_ = opening_index.index;
+  }
+
+  auto error_length() const -> int {
+    CARBON_DCHECK(kind() == TokenKind::Error);
+    return token_payload_;
+  }
+
+  // Zero-based byte offset of the token within the file. This can be combined
+  // with the buffer's line information to locate the line and column of the
+  // token as well.
+  auto byte_offset() const -> int32_t { return byte_offset_; }
+
+  // Transforms the token into an error token of the given length but at its
+  // original position and with the same whitespace adjacency.
+  auto ResetAsError(int error_length) -> void {
+    // Construct a fresh token to establish any needed invariants and replace
+    // this token with it.
+    TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
+                    byte_offset());
+    *this = error;
+  }
+
+ private:
+  friend class Lexer;
+
+  static constexpr int PayloadBits = 23;
+
+  // Make sure we have enough payload bits to represent token-associated IDs.
+  static_assert(PayloadBits >= IntId::TokenIdBits);
+  static_assert(PayloadBits >= TokenIndex::Bits);
+
+  // Constructor for a TokenKind that carries no payload, or where the payload
+  // will be set later.
+  //
+  // Only used by the lexer which enforces only the correct kinds are used.
+  //
+  // When the payload is not being set, we leave it uninitialized. At least in
+  // some cases, this will allow MSan to correctly detect erroneous attempts
+  // to access the payload, as it works to track uninitialized memory
+  // bit-for-bit specifically to handle complex cases like bitfields.
+  TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
+      : kind_(kind),
+        has_leading_space_(has_leading_space),
+        byte_offset_(byte_offset) {}
+
+  // Constructor for a TokenKind that carries a payload.
+  //
+  // Only used by the lexer which enforces the correct kind and payload types.
+  TokenInfo(TokenKind kind, bool has_leading_space, int payload,
+            int32_t byte_offset)
+      : kind_(kind),
+        has_leading_space_(has_leading_space),
+        token_payload_(payload),
+        byte_offset_(byte_offset) {}
+
+  // A bitfield that encodes the token's kind, the leading space flag, and the
+  // remaining bits in a payload. These are encoded together as a bitfield for
+  // density and because these are the hottest fields of tokens for consumers
+  // after lexing.
+  //
+  // Payload values are typically ID types for which we create at most one per
+  // token, so we ensure that `token_payload_` is large enough to fit any
+  // token index. Stores to this field may overflow, but we produce an error
+  // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
+  // so this value never overflows if lexing succeeds.
+  TokenKind kind_;
+  static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
+  bool has_leading_space_ : 1;
+  unsigned token_payload_ : PayloadBits;
+
+  // Separate storage for the byte offset, this is hot while lexing but then
+  // generally cold.
+  int32_t byte_offset_;
+};
+
+static_assert(sizeof(TokenInfo) == 8,
+              "Expected `TokenInfo` to pack to an 8-byte structure.");
+
+}  // namespace Carbon::Lex
+
+#endif  // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_

+ 13 - 13
toolchain/lex/tokenized_buffer.cpp

@@ -24,7 +24,7 @@
 namespace Carbon::Lex {
 
 auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
-  return FindLineIndex(GetTokenInfo(token).byte_offset());
+  return FindLineIndex(token_infos_.Get(token).byte_offset());
 }
 
 auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
@@ -32,7 +32,7 @@ auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
 }
 
 auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   const auto& line_info =
       line_infos_.Get(FindLineIndex(token_info.byte_offset()));
   return token_info.byte_offset() - line_info.start + 1;
@@ -58,7 +58,7 @@ auto TokenizedBuffer::GetEndLoc(TokenIndex token) const
 }
 
 auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   llvm::StringRef fixed_spelling = token_info.kind().fixed_spelling();
   if (!fixed_spelling.empty()) {
     return fixed_spelling;
@@ -109,21 +109,21 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
 }
 
 auto TokenizedBuffer::GetIdentifier(TokenIndex token) const -> IdentifierId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::Identifier, "{0}",
                token_info.kind());
   return token_info.ident_id();
 }
 
 auto TokenizedBuffer::GetIntLiteral(TokenIndex token) const -> IntId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::IntLiteral, "{0}",
                token_info.kind());
   return token_info.int_id();
 }
 
 auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::RealLiteral, "{0}",
                token_info.kind());
   return token_info.real_id();
@@ -131,14 +131,14 @@ auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
 
 auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
     -> StringLiteralValueId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind() == TokenKind::StringLiteral, "{0}",
                token_info.kind());
   return token_info.string_literal_id();
 }
 
 auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind().is_sized_type_literal(), "{0}",
                token_info.kind());
   return token_info.int_id();
@@ -146,7 +146,7 @@ auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
 
 auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
     -> TokenIndex {
-  const auto& opening_token_info = GetTokenInfo(opening_token);
+  const auto& opening_token_info = token_infos_.Get(opening_token);
   CARBON_CHECK(opening_token_info.kind().is_opening_symbol(), "{0}",
                opening_token_info.kind());
   return opening_token_info.closing_token_index();
@@ -154,7 +154,7 @@ auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
 
 auto TokenizedBuffer::GetMatchedOpeningToken(TokenIndex closing_token) const
     -> TokenIndex {
-  const auto& closing_token_info = GetTokenInfo(closing_token);
+  const auto& closing_token_info = token_infos_.Get(closing_token);
   CARBON_CHECK(closing_token_info.kind().is_closing_symbol(), "{0}",
                closing_token_info.kind());
   return closing_token_info.opening_token_index();
@@ -246,7 +246,7 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
     -> void {
   widths.Widen(GetTokenPrintWidths(token));
   int token_index = token.index;
-  const auto& token_info = GetTokenInfo(token);
+  const auto& token_info = token_infos_.Get(token);
   LineIndex line_index = FindLineIndex(token_info.byte_offset());
   llvm::StringRef token_text = GetTokenText(token);
 
@@ -338,7 +338,7 @@ auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
 auto TokenizedBuffer::IsAfterComment(TokenIndex token,
                                      CommentIndex comment_index) const -> bool {
   const auto& comment_data = comments_.Get(comment_index);
-  return GetTokenInfo(token).byte_offset() > comment_data.start;
+  return token_infos_.Get(token).byte_offset() > comment_data.start;
 }
 
 auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
@@ -411,7 +411,7 @@ auto TokenizedBuffer::TokenToDiagnosticLoc(TokenIndex token) const
     -> Diagnostics::ConvertedLoc {
   // Map the token location into a position within the source buffer.
   const char* token_start =
-      source_->text().begin() + GetTokenInfo(token).byte_offset();
+      source_->text().begin() + token_infos_.Get(token).byte_offset();
 
   // Find the corresponding file location.
   // TODO: Should we somehow indicate in the diagnostic location if this token

+ 10 - 180
toolchain/lex/tokenized_buffer.h

@@ -19,6 +19,7 @@
 #include "toolchain/base/shared_value_stores.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/lex/token_index.h"
+#include "toolchain/lex/token_info.h"
 #include "toolchain/lex/token_kind.h"
 #include "toolchain/source/source_buffer.h"
 
@@ -177,7 +178,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto GetIndentColumnNumber(LineIndex line) const -> int;
 
   auto GetByteOffset(TokenIndex token) const -> int32_t {
-    return GetTokenInfo(token).byte_offset();
+    return token_infos_.Get(token).byte_offset();
   }
 
   // Returns true if the token comes after the comment.
@@ -299,167 +300,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     int indent;
   };
 
-  // Storage for the information about a specific token in the buffer.
-  //
-  // This provides a friendly accessor API to the carefully space-optimized
-  // storage model of the information we associated with each token.
-  //
-  // There are four pieces of information stored here:
-  // - The kind of the token.
-  // - Whether that token has leading whitespace before it.
-  // - A kind-specific payload that can be compressed into a small integer.
-  //   - This class provides dedicated accessors for each different form of
-  //     payload that check the kind and payload correspond correctly.
-  // - A 32-bit byte offset of the token within the source text.
-  //
-  // These are compressed and stored in 8-bytes for each token.
-  //
-  // Note that while the class provides some limited setters for payloads and
-  // mutating methods, setters on this type may be unexpectedly expensive due to
-  // the bit-packed representation and should be avoided. As such, only the
-  // minimal necessary setters are provided.
-  //
-  // TODO: It might be worth considering a struct-of-arrays data layout in order
-  // to move the byte offset to a separate array from the rest as it is only hot
-  // during lexing, and then cold during parsing and semantic analysis. However,
-  // a trivial approach to that adds more overhead than it saves due to tracking
-  // two separate vectors and their growth. Making this profitable would likely
-  // at least require a highly specialized single vector that manages the growth
-  // once and then provides separate storage areas for the two arrays.
-  class TokenInfo {
-   public:
-    // The kind for this token.
-    auto kind() const -> TokenKind { return kind_; }
-
-    // Whether this token is preceded by whitespace. We only store the preceding
-    // state, and look at the next token to check for trailing whitespace.
-    auto has_leading_space() const -> bool { return has_leading_space_; }
-
-    // A collection of methods to access the specific payload included with
-    // particular kinds of tokens. Only the specific payload accessor below may
-    // be used for an info entry of a token with a particular kind, and these
-    // check that the kind is valid. Some tokens do not include a payload at all
-    // and none of these methods may be called.
-    auto ident_id() const -> IdentifierId {
-      CARBON_DCHECK(kind() == TokenKind::Identifier);
-      return IdentifierId(token_payload_);
-    }
-    auto set_ident_id(IdentifierId ident_id) -> void {
-      CARBON_DCHECK(kind() == TokenKind::Identifier);
-      token_payload_ = ident_id.index;
-    }
-
-    auto string_literal_id() const -> StringLiteralValueId {
-      CARBON_DCHECK(kind() == TokenKind::StringLiteral);
-      return StringLiteralValueId(token_payload_);
-    }
-
-    auto int_id() const -> IntId {
-      CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
-                    kind() == TokenKind::IntTypeLiteral ||
-                    kind() == TokenKind::UnsignedIntTypeLiteral ||
-                    kind() == TokenKind::FloatTypeLiteral);
-      return IntId::MakeFromTokenPayload(token_payload_);
-    }
-
-    auto real_id() const -> RealId {
-      CARBON_DCHECK(kind() == TokenKind::RealLiteral);
-      return RealId(token_payload_);
-    }
-
-    auto closing_token_index() const -> TokenIndex {
-      CARBON_DCHECK(kind().is_opening_symbol());
-      return TokenIndex(token_payload_);
-    }
-    auto set_closing_token_index(TokenIndex closing_index) -> void {
-      CARBON_DCHECK(kind().is_opening_symbol());
-      token_payload_ = closing_index.index;
-    }
-
-    auto opening_token_index() const -> TokenIndex {
-      CARBON_DCHECK(kind().is_closing_symbol());
-      return TokenIndex(token_payload_);
-    }
-    auto set_opening_token_index(TokenIndex opening_index) -> void {
-      CARBON_DCHECK(kind().is_closing_symbol());
-      token_payload_ = opening_index.index;
-    }
-
-    auto error_length() const -> int {
-      CARBON_DCHECK(kind() == TokenKind::Error);
-      return token_payload_;
-    }
-
-    // Zero-based byte offset of the token within the file. This can be combined
-    // with the buffer's line information to locate the line and column of the
-    // token as well.
-    auto byte_offset() const -> int32_t { return byte_offset_; }
-
-    // Transforms the token into an error token of the given length but at its
-    // original position and with the same whitespace adjacency.
-    auto ResetAsError(int error_length) -> void {
-      // Construct a fresh token to establish any needed invariants and replace
-      // this token with it.
-      TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
-                      byte_offset());
-      *this = error;
-    }
-
-   private:
-    friend class Lexer;
-
-    static constexpr int PayloadBits = 23;
-
-    // Make sure we have enough payload bits to represent token-associated IDs.
-    static_assert(PayloadBits >= IntId::TokenIdBits);
-    static_assert(PayloadBits >= TokenIndex::Bits);
-
-    // Constructor for a TokenKind that carries no payload, or where the payload
-    // will be set later.
-    //
-    // Only used by the lexer which enforces only the correct kinds are used.
-    //
-    // When the payload is not being set, we leave it uninitialized. At least in
-    // some cases, this will allow MSan to correctly detect erroneous attempts
-    // to access the payload, as it works to track uninitialized memory
-    // bit-for-bit specifically to handle complex cases like bitfields.
-    TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
-        : kind_(kind),
-          has_leading_space_(has_leading_space),
-          byte_offset_(byte_offset) {}
-
-    // Constructor for a TokenKind that carries a payload.
-    //
-    // Only used by the lexer which enforces the correct kind and payload types.
-    TokenInfo(TokenKind kind, bool has_leading_space, int payload,
-              int32_t byte_offset)
-        : kind_(kind),
-          has_leading_space_(has_leading_space),
-          token_payload_(payload),
-          byte_offset_(byte_offset) {}
-
-    // A bitfield that encodes the token's kind, the leading space flag, and the
-    // remaining bits in a payload. These are encoded together as a bitfield for
-    // density and because these are the hottest fields of tokens for consumers
-    // after lexing.
-    //
-    // Payload values are typically ID types for which we create at most one per
-    // token, so we ensure that `token_payload_` is large enough to fit any
-    // token index. Stores to this field may overflow, but we produce an error
-    // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
-    // so this value never overflows if lexing succeeds.
-    TokenKind kind_;
-    static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
-    bool has_leading_space_ : 1;
-    unsigned token_payload_ : PayloadBits;
-
-    // Separate storage for the byte offset, this is hot while lexing but then
-    // generally cold.
-    int32_t byte_offset_;
-  };
-  static_assert(sizeof(TokenInfo) == 8,
-                "Expected `TokenInfo` to pack to an 8-byte structure.");
-
   // The constructor is merely responsible for trivial initialization of
   // members. A working object of this type is built with `Lex::Lex` so that its
   // return can indicate if an error was encountered while lexing.
@@ -469,9 +309,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
       : value_stores_(&value_stores), source_(&source) {}
 
   auto FindLineIndex(int32_t byte_offset) const -> LineIndex;
-  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
-  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
+
+  // Adds the token and adjusts the expected tree size.
   auto AddToken(TokenInfo info) -> TokenIndex;
+
   auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
   auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                   PrintWidths widths) const -> void;
@@ -486,7 +327,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   SharedValueStores* value_stores_;
   SourceBuffer* source_;
 
-  llvm::SmallVector<TokenInfo> token_infos_;
+  ValueStore<TokenIndex> token_infos_;
 
   ValueStore<LineIndex> line_infos_;
 
@@ -514,35 +355,24 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 };
 
 inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
-  return GetTokenInfo(token).kind();
+  return token_infos_.Get(token).kind();
 }
 
 inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
     -> bool {
-  return GetTokenInfo(token).has_leading_space();
+  return token_infos_.Get(token).has_leading_space();
 }
 
 inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
     -> bool {
   TokenIterator it(token);
   ++it;
-  return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
-}
-
-inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
-  return token_infos_[token.index];
-}
-
-inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const
-    -> const TokenInfo& {
-  return token_infos_[token.index];
+  return it != tokens().end() && token_infos_.Get(*it).has_leading_space();
 }
 
 inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
-  TokenIndex index(token_infos_.size());
-  token_infos_.push_back(info);
   expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
-  return index;
+  return token_infos_.Add(info);
 }
 
 }  // namespace Carbon::Lex