فهرست منبع

Add value store to be shared across compile stages. (#3311)

This updates lexing to use the data. I'll do checking separately, just
to split changes.

Note the ValueStore structure is also set up such that SemIR::File can
use it for other fields.

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Jon Ross-Perkins 2 سال پیش
والد
کامیت
d13f76e001

+ 3 - 1
language_server/language_server.cpp

@@ -89,13 +89,15 @@ static auto getName(Parse::Tree& p, Parse::Node node)
 void LanguageServer::OnDocumentSymbol(
     clang::clangd::DocumentSymbolParams const& params,
     clang::clangd::Callback<std::vector<clang::clangd::DocumentSymbol>> cb) {
+  SharedValueStores value_stores;
   llvm::vfs::InMemoryFileSystem vfs;
   auto file = params.textDocument.uri.file().str();
   vfs.addFile(file, /*mtime=*/0,
               llvm::MemoryBuffer::getMemBufferCopy(files_.at(file)));
 
   auto buf = SourceBuffer::CreateFromFile(vfs, file, NullDiagnosticConsumer());
-  auto lexed = Lex::TokenizedBuffer::Lex(*buf, NullDiagnosticConsumer());
+  auto lexed =
+      Lex::TokenizedBuffer::Lex(value_stores, *buf, NullDiagnosticConsumer());
   auto parsed = Parse::Tree::Parse(lexed, NullDiagnosticConsumer(), nullptr);
   std::vector<clang::clangd::DocumentSymbol> result;
   for (const auto& node : parsed.postorder()) {

+ 23 - 1
toolchain/base/BUILD

@@ -2,7 +2,7 @@
 # Exceptions. See /LICENSE for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -22,3 +22,25 @@ cc_library(
         "@llvm-project//llvm:Support",
     ],
 )
+
+cc_library(
+    name = "value_store",
+    hdrs = ["value_store.h"],
+    deps = [
+        ":index_base",
+        "//common:check",
+        "//common:ostream",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "value_store_test",
+    size = "small",
+    srcs = ["value_store_test.cpp"],
+    deps = [
+        ":value_store",
+        "//testing/base:gtest_main",
+        "@com_google_googletest//:gtest",
+    ],
+)

+ 141 - 0
toolchain/base/value_store.h

@@ -0,0 +1,141 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_
+#define CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_
+
+#include "common/check.h"
+#include "common/ostream.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "toolchain/base/index_base.h"
+
+namespace Carbon {
+
+// The value of a real literal.
+//
+// This is either a dyadic fraction (mantissa * 2^exponent) or a decadic
+// fraction (mantissa * 10^exponent).
+class Real : public Printable<Real> {
+ public:
+  auto Print(llvm::raw_ostream& output_stream) const -> void {
+    mantissa.print(output_stream, /*isSigned=*/false);
+    output_stream << "*" << (is_decimal ? "10" : "2") << "^" << exponent;
+  }
+
+  // The mantissa, represented as an unsigned integer.
+  llvm::APInt mantissa;
+
+  // The exponent, represented as a signed integer.
+  llvm::APInt exponent;
+
+  // If false, the value is mantissa * 2^exponent.
+  // If true, the value is mantissa * 10^exponent.
+  // TODO: This field increases Real from 32 bytes to 40 bytes. Consider
+  // changing how it's tracked for space savings.
+  bool is_decimal;
+};
+
+// Corresponds to an integer value represented by an APInt.
+struct IntegerId : public IndexBase, public Printable<IntegerId> {
+  using IndexedType = llvm::APInt;
+  static const IntegerId Invalid;
+  using IndexBase::IndexBase;
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << "int";
+    IndexBase::Print(out);
+  }
+};
+constexpr IntegerId IntegerId::Invalid(IntegerId::InvalidIndex);
+
+// Corresponds to a Real value.
+struct RealId : public IndexBase, public Printable<RealId> {
+  using IndexedType = Real;
+  static const RealId Invalid;
+  using IndexBase::IndexBase;
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << "real";
+    IndexBase::Print(out);
+  }
+};
+constexpr RealId RealId::Invalid(RealId::InvalidIndex);
+
+// Corresponds to a StringRef.
+struct StringId : public IndexBase, public Printable<StringId> {
+  using IndexedType = std::string;
+  static const StringId Invalid;
+  using IndexBase::IndexBase;
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << "str";
+    IndexBase::Print(out);
+  }
+};
+constexpr StringId StringId::Invalid(StringId::InvalidIndex);
+
+// A simple wrapper for accumulating values, providing IDs to later retrieve the
+// value. This does not do deduplication.
+template <typename IdT>
+class ValueStore {
+ public:
+  // Stores the value and returns an ID to reference it.
+  auto Add(typename IdT::IndexedType value) -> IdT {
+    auto id = IdT(values_.size());
+    values_.push_back(std::move(value));
+    return id;
+  }
+
+  // Returns the value for an ID.
+  auto Get(IdT id) const -> const typename IdT::IndexedType& {
+    CARBON_CHECK(id.index >= 0) << id.index;
+    return values_[id.index];
+  }
+
+ private:
+  llvm::SmallVector<typename IdT::IndexedType> values_;
+};
+
+// Storage for StringRefs. The caller is responsible for ensuring storage is
+// allocated.
+template <>
+class ValueStore<StringId> {
+ public:
+  // Returns an ID to reference the value. May return an existing ID if the
+  // string was previously added.
+  auto Add(llvm::StringRef value) -> StringId {
+    auto [it, inserted] = map_.insert({value, StringId(values_.size())});
+    if (inserted) {
+      values_.push_back(value);
+    }
+    return it->second;
+  }
+
+  // Returns the value for an ID.
+  auto Get(StringId id) const -> llvm::StringRef {
+    CARBON_CHECK(id.is_valid());
+    return values_[id.index];
+  }
+
+ private:
+  llvm::DenseMap<llvm::StringRef, StringId> map_;
+  llvm::SmallVector<llvm::StringRef> values_;
+};
+
+// Stores that will be used across compiler steps. This is provided mainly so
+// that they don't need to be passed separately.
+class SharedValueStores {
+ public:
+  auto integers() -> ValueStore<IntegerId>& { return integers_; }
+  auto reals() -> ValueStore<RealId>& { return reals_; }
+  auto strings() -> ValueStore<StringId>& { return strings_; }
+
+ private:
+  ValueStore<IntegerId> integers_;
+  ValueStore<RealId> reals_;
+  ValueStore<StringId> strings_;
+};
+
+}  // namespace Carbon
+
+#endif  // CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_

+ 72 - 0
toolchain/base/value_store_test.cpp

@@ -0,0 +1,72 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "toolchain/base/value_store.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace Carbon {
+namespace {
+
+using ::testing::Eq;
+using ::testing::Not;
+
+TEST(ValueStore, Integer) {
+  SharedValueStores value_stores;
+  IntegerId id1 = value_stores.integers().Add(llvm::APInt(64, 1));
+  IntegerId id2 = value_stores.integers().Add(llvm::APInt(64, 2));
+
+  ASSERT_TRUE(id1.is_valid());
+  ASSERT_TRUE(id2.is_valid());
+  EXPECT_THAT(id1, Not(Eq(id2)));
+
+  EXPECT_THAT(value_stores.integers().Get(id1), Eq(1));
+  EXPECT_THAT(value_stores.integers().Get(id2), Eq(2));
+}
+
+TEST(ValueStore, Real) {
+  Real real1{.mantissa = llvm::APInt(64, 1),
+             .exponent = llvm::APInt(64, 11),
+             .is_decimal = true};
+  Real real2{.mantissa = llvm::APInt(64, 2),
+             .exponent = llvm::APInt(64, 22),
+             .is_decimal = false};
+
+  SharedValueStores value_stores;
+  RealId id1 = value_stores.reals().Add(real1);
+  RealId id2 = value_stores.reals().Add(real2);
+
+  ASSERT_TRUE(id1.is_valid());
+  ASSERT_TRUE(id2.is_valid());
+  EXPECT_THAT(id1, Not(Eq(id2)));
+
+  const auto& real1_copy = value_stores.reals().Get(id1);
+  EXPECT_THAT(real1.mantissa, Eq(real1_copy.mantissa));
+  EXPECT_THAT(real1.exponent, Eq(real1_copy.exponent));
+  EXPECT_THAT(real1.is_decimal, Eq(real1_copy.is_decimal));
+
+  const auto& real2_copy = value_stores.reals().Get(id2);
+  EXPECT_THAT(real2.mantissa, Eq(real2_copy.mantissa));
+  EXPECT_THAT(real2.exponent, Eq(real2_copy.exponent));
+  EXPECT_THAT(real2.is_decimal, Eq(real2_copy.is_decimal));
+}
+
+TEST(ValueStore, String) {
+  std::string a = "a";
+  std::string b = "b";
+  SharedValueStores value_stores;
+  StringId a_id = value_stores.strings().Add(a);
+  StringId b_id = value_stores.strings().Add(b);
+
+  ASSERT_TRUE(a_id.is_valid());
+  ASSERT_TRUE(b_id.is_valid());
+
+  EXPECT_THAT(a_id, Not(Eq(b_id)));
+  EXPECT_THAT(value_stores.strings().Get(a_id), Eq(a));
+  EXPECT_THAT(value_stores.strings().Get(b_id), Eq(b));
+}
+
+}  // namespace
+}  // namespace Carbon

+ 4 - 4
toolchain/check/context.cpp

@@ -82,8 +82,8 @@ auto Context::DiagnoseDuplicateName(Parse::Node parse_node,
       .Emit();
 }
 
-auto Context::DiagnoseNameNotFound(Parse::Node parse_node,
-                                   SemIR::StringId name_id) -> void {
+auto Context::DiagnoseNameNotFound(Parse::Node parse_node, StringId name_id)
+    -> void {
   CARBON_DIAGNOSTIC(NameNotFound, Error, "Name `{0}` not found.",
                     llvm::StringRef);
   emitter_->Emit(parse_node, NameNotFound, semantics_ir_->GetString(name_id));
@@ -96,7 +96,7 @@ auto Context::NoteIncompleteClass(SemIR::ClassDeclaration class_decl,
   builder.Note(class_decl.parse_node, ClassForwardDeclaredHere);
 }
 
-auto Context::AddNameToLookup(Parse::Node name_node, SemIR::StringId name_id,
+auto Context::AddNameToLookup(Parse::Node name_node, StringId name_id,
                               SemIR::NodeId target_id) -> void {
   if (current_scope().names.insert(name_id).second) {
     name_lookup_[name_id].push_back(target_id);
@@ -105,7 +105,7 @@ auto Context::AddNameToLookup(Parse::Node name_node, SemIR::StringId name_id,
   }
 }
 
-auto Context::LookupName(Parse::Node parse_node, SemIR::StringId name_id,
+auto Context::LookupName(Parse::Node parse_node, StringId name_id,
                          SemIR::NameScopeId scope_id, bool print_diagnostics)
     -> SemIR::NodeId {
   if (scope_id == SemIR::NameScopeId::Invalid) {

+ 5 - 7
toolchain/check/context.h

@@ -49,12 +49,12 @@ class Context {
   auto AddNodeAndPush(Parse::Node parse_node, SemIR::Node node) -> void;
 
   // Adds a name to name lookup. Prints a diagnostic for name conflicts.
-  auto AddNameToLookup(Parse::Node name_node, SemIR::StringId name_id,
+  auto AddNameToLookup(Parse::Node name_node, StringId name_id,
                        SemIR::NodeId target_id) -> void;
 
   // Performs name lookup in a specified scope, returning the referenced node.
   // If scope_id is invalid, uses the current contextual scope.
-  auto LookupName(Parse::Node parse_node, SemIR::StringId name_id,
+  auto LookupName(Parse::Node parse_node, StringId name_id,
                   SemIR::NameScopeId scope_id, bool print_diagnostics)
       -> SemIR::NodeId;
 
@@ -63,8 +63,7 @@ class Context {
       -> void;
 
   // Prints a diagnostic for a missing name.
-  auto DiagnoseNameNotFound(Parse::Node parse_node, SemIR::StringId name_id)
-      -> void;
+  auto DiagnoseNameNotFound(Parse::Node parse_node, StringId name_id) -> void;
 
   // Adds a note to a diagnostic explaining that a class is incomplete.
   auto NoteIncompleteClass(SemIR::ClassDeclaration class_decl,
@@ -246,7 +245,7 @@ class Context {
 
     // Names which are registered with name_lookup_, and will need to be
     // deregistered when the scope ends.
-    llvm::DenseSet<SemIR::StringId> names;
+    llvm::DenseSet<StringId> names;
 
     // TODO: This likely needs to track things which need to be destructed.
   };
@@ -326,8 +325,7 @@ class Context {
   // reference.
   //
   // Names which no longer have lookup results are erased.
-  llvm::DenseMap<SemIR::StringId, llvm::SmallVector<SemIR::NodeId>>
-      name_lookup_;
+  llvm::DenseMap<StringId, llvm::SmallVector<SemIR::NodeId>> name_lookup_;
 
   // Cache of the mapping from nodes to types, to avoid recomputing the folding
   // set ID.

+ 3 - 4
toolchain/check/declaration_name_stack.cpp

@@ -13,7 +13,7 @@ auto DeclarationNameStack::MakeEmptyNameContext() -> NameContext {
 }
 
 auto DeclarationNameStack::MakeUnqualifiedName(Parse::Node parse_node,
-                                               SemIR::StringId name_id)
+                                               StringId name_id)
     -> NameContext {
   NameContext context = MakeEmptyNameContext();
   ApplyNameQualifierTo(context, parse_node, name_id);
@@ -106,14 +106,13 @@ auto DeclarationNameStack::ApplyExpressionQualifier(Parse::Node parse_node,
 }
 
 auto DeclarationNameStack::ApplyNameQualifier(Parse::Node parse_node,
-                                              SemIR::StringId name_id) -> void {
+                                              StringId name_id) -> void {
   ApplyNameQualifierTo(declaration_name_stack_.back(), parse_node, name_id);
 }
 
 auto DeclarationNameStack::ApplyNameQualifierTo(NameContext& name_context,
                                                 Parse::Node parse_node,
-                                                SemIR::StringId name_id)
-    -> void {
+                                                StringId name_id) -> void {
   if (CanResolveQualifier(name_context, parse_node)) {
     // For identifier nodes, we need to perform a lookup on the identifier.
     // This means the input node_id is actually a string ID.

+ 4 - 5
toolchain/check/declaration_name_stack.h

@@ -89,7 +89,7 @@ class DeclarationNameStack {
       SemIR::NodeId resolved_node_id = SemIR::NodeId::Invalid;
 
       // The ID of an unresolved identifier.
-      SemIR::StringId unresolved_name_id;
+      StringId unresolved_name_id;
     };
   };
 
@@ -108,7 +108,7 @@ class DeclarationNameStack {
   // unqualified name in the current context. This is suitable for adding to
   // name lookup in situations where a qualified name is not permitted, such as
   // a pattern binding.
-  auto MakeUnqualifiedName(Parse::Node parse_node, SemIR::StringId name_id)
+  auto MakeUnqualifiedName(Parse::Node parse_node, StringId name_id)
       -> NameContext;
 
   // Applies an expression from the node stack to the top of the declaration
@@ -118,8 +118,7 @@ class DeclarationNameStack {
 
   // Applies a Name from the node stack to the top of the declaration name
   // stack.
-  auto ApplyNameQualifier(Parse::Node parse_node, SemIR::StringId name_id)
-      -> void;
+  auto ApplyNameQualifier(Parse::Node parse_node, StringId name_id) -> void;
 
   // Adds a name to name lookup. Prints a diagnostic for name conflicts.
   auto AddNameToLookup(NameContext name_context, SemIR::NodeId target_id)
@@ -136,7 +135,7 @@ class DeclarationNameStack {
 
   // Applies a Name from the node stack to given name context.
   auto ApplyNameQualifierTo(NameContext& name_context, Parse::Node parse_node,
-                            SemIR::StringId name_id) -> void;
+                            StringId name_id) -> void;
 
   // Returns true if the context is in a state where it can resolve qualifiers.
   // Updates name_context as needed.

+ 1 - 1
toolchain/check/handle_class.cpp

@@ -55,7 +55,7 @@ static auto BuildClassDeclaration(Context& context)
         {.name_id = name_context.state ==
                             DeclarationNameStack::NameContext::State::Unresolved
                         ? name_context.unresolved_name_id
-                        : SemIR::StringId(SemIR::StringId::InvalidIndex)});
+                        : StringId::Invalid});
   }
 
   // Write the class ID into the ClassDeclaration.

+ 1 - 1
toolchain/check/handle_function.cpp

@@ -97,7 +97,7 @@ static auto BuildFunctionDeclaration(Context& context, bool is_definition)
         {.name_id = name_context.state ==
                             DeclarationNameStack::NameContext::State::Unresolved
                         ? name_context.unresolved_name_id
-                        : SemIR::StringId(SemIR::StringId::InvalidIndex),
+                        : StringId::Invalid,
          .param_refs_id = param_refs_id,
          .return_type_id = return_type_id,
          .return_slot_id = return_slot_id});

+ 2 - 3
toolchain/check/handle_name.cpp

@@ -38,7 +38,7 @@ static auto GetAsNameScope(Context& context, SemIR::NodeId base_id)
 
 auto HandleMemberAccessExpression(Context& context, Parse::Node parse_node)
     -> bool {
-  SemIR::StringId name_id = context.node_stack().Pop<Parse::NodeKind::Name>();
+  StringId name_id = context.node_stack().Pop<Parse::NodeKind::Name>();
   auto base_id = context.node_stack().PopExpression();
 
   // If the base is a name scope, such as a class or namespace, perform lookup
@@ -152,8 +152,7 @@ auto HandleQualifiedDeclaration(Context& context, Parse::Node parse_node)
 
   Parse::Node parse_node2 = context.node_stack().PeekParseNode();
   if (context.parse_tree().node_kind(parse_node2) == Parse::NodeKind::Name) {
-    SemIR::StringId name_id2 =
-        context.node_stack().Pop<Parse::NodeKind::Name>();
+    StringId name_id2 = context.node_stack().Pop<Parse::NodeKind::Name>();
     pop_and_apply_first_child();
     context.declaration_name_stack().ApplyNameQualifier(parse_node2, name_id2);
   } else {

+ 1 - 1
toolchain/check/handle_struct.cpp

@@ -41,7 +41,7 @@ auto HandleStructFieldUnknown(Context& context, Parse::Node parse_node)
 auto HandleStructFieldValue(Context& context, Parse::Node parse_node) -> bool {
   auto [value_parse_node, value_node_id] =
       context.node_stack().PopExpressionWithParseNode();
-  SemIR::StringId name_id = context.node_stack().Pop<Parse::NodeKind::Name>();
+  StringId name_id = context.node_stack().Pop<Parse::NodeKind::Name>();
 
   // Store the name for the type.
   context.args_type_info_stack().AddNode(SemIR::StructTypeField(

+ 6 - 6
toolchain/check/node_stack.h

@@ -114,7 +114,7 @@ class NodeStack {
       return back;
     }
     if constexpr (RequiredIdKind == IdKind::StringId) {
-      auto back = PopWithParseNode<SemIR::StringId>();
+      auto back = PopWithParseNode<StringId>();
       RequireParseKind<RequiredParseKind>(back.first);
       return back;
     }
@@ -163,7 +163,7 @@ class NodeStack {
       return back.id<SemIR::ClassId>();
     }
     if constexpr (RequiredIdKind == IdKind::StringId) {
-      return back.id<SemIR::StringId>();
+      return back.id<StringId>();
     }
     if constexpr (RequiredIdKind == IdKind::TypeId) {
       return back.id<SemIR::TypeId>();
@@ -202,7 +202,7 @@ class NodeStack {
         : parse_node(parse_node), node_block_id(node_block_id) {}
     explicit Entry(Parse::Node parse_node, SemIR::FunctionId function_id)
         : parse_node(parse_node), function_id(function_id) {}
-    explicit Entry(Parse::Node parse_node, SemIR::StringId name_id)
+    explicit Entry(Parse::Node parse_node, StringId name_id)
         : parse_node(parse_node), name_id(name_id) {}
     explicit Entry(Parse::Node parse_node, SemIR::TypeId type_id)
         : parse_node(parse_node), type_id(type_id) {}
@@ -219,7 +219,7 @@ class NodeStack {
       if constexpr (std::is_same<T, SemIR::FunctionId>()) {
         return function_id;
       }
-      if constexpr (std::is_same<T, SemIR::StringId>()) {
+      if constexpr (std::is_same<T, StringId>()) {
         return name_id;
       }
       if constexpr (std::is_same<T, SemIR::TypeId>()) {
@@ -239,7 +239,7 @@ class NodeStack {
       SemIR::NodeId node_id;
       SemIR::NodeBlockId node_block_id;
       SemIR::FunctionId function_id;
-      SemIR::StringId name_id;
+      StringId name_id;
       SemIR::TypeId type_id;
     };
   };
@@ -315,7 +315,7 @@ class NodeStack {
     if constexpr (std::is_same_v<IdT, SemIR::ClassId>) {
       return IdKind::ClassId;
     }
-    if constexpr (std::is_same_v<IdT, SemIR::StringId>) {
+    if constexpr (std::is_same_v<IdT, StringId>) {
       return IdKind::StringId;
     }
     if constexpr (std::is_same_v<IdT, SemIR::TypeId>) {

+ 10 - 5
toolchain/driver/driver.cpp

@@ -383,9 +383,11 @@ auto Driver::ValidateCompileOptions(const CompileOptions& options) const
 // Ties together information for a file being compiled.
 class Driver::CompilationUnit {
  public:
-  explicit CompilationUnit(Driver* driver, const CompileOptions& options,
+  explicit CompilationUnit(Driver* driver, SharedValueStores* value_stores,
+                           const CompileOptions& options,
                            llvm::StringRef input_file_name)
       : driver_(driver),
+        value_stores_(value_stores),
         options_(options),
         input_file_name_(input_file_name),
         vlog_stream_(driver_->vlog_stream_),
@@ -410,8 +412,9 @@ class Driver::CompilationUnit {
     CARBON_VLOG() << "*** SourceBuffer ***\n```\n"
                   << source_->text() << "\n```\n";
 
-    LogCall("Lex::TokenizedBuffer::Lex",
-            [&] { tokens_ = Lex::TokenizedBuffer::Lex(*source_, *consumer_); });
+    LogCall("Lex::TokenizedBuffer::Lex", [&] {
+      tokens_ = Lex::TokenizedBuffer::Lex(*value_stores_, *source_, *consumer_);
+    });
     if (options_.dump_tokens) {
       consumer_->Flush();
       driver_->output_stream_ << tokens_;
@@ -570,6 +573,7 @@ class Driver::CompilationUnit {
   }
 
   Driver* driver_;
+  SharedValueStores* value_stores_;
   const CompileOptions& options_;
   llvm::StringRef input_file_name_;
 
@@ -595,6 +599,7 @@ auto Driver::Compile(const CompileOptions& options) -> bool {
     return false;
   }
 
+  SharedValueStores value_stores;
   llvm::SmallVector<std::unique_ptr<CompilationUnit>> units;
   auto flush = llvm::make_scope_exit([&]() {
     // The diagnostics consumer must be flushed before compilation artifacts are
@@ -605,8 +610,8 @@ auto Driver::Compile(const CompileOptions& options) -> bool {
     }
   });
   for (const auto& input_file_name : options.input_file_names) {
-    units.push_back(
-        std::make_unique<CompilationUnit>(this, options, input_file_name));
+    units.push_back(std::make_unique<CompilationUnit>(
+        this, &value_stores, options, input_file_name));
   }
 
   // Lex.

+ 4 - 0
toolchain/lex/BUILD

@@ -188,6 +188,7 @@ cc_library(
         "//common:ostream",
         "//common:string_helpers",
         "//toolchain/base:index_base",
+        "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/source:source_buffer",
         "@llvm-project//llvm:Support",
@@ -215,6 +216,7 @@ cc_test(
         ":tokenized_buffer_test_helpers",
         "//testing/base:gtest_main",
         "//testing/base:test_raw_ostream",
+        "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:mocks",
         "//toolchain/testing:yaml_test_helpers",
@@ -231,6 +233,7 @@ cc_fuzz_test(
     deps = [
         ":tokenized_buffer",
         "//common:check",
+        "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:null_diagnostics",
         "@llvm-project//llvm:Support",
@@ -245,6 +248,7 @@ cc_binary(
         ":token_kind",
         ":tokenized_buffer",
         "//common:check",
+        "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:null_diagnostics",
         "@com_github_google_benchmark//:benchmark_main",

+ 5 - 2
toolchain/lex/testdata/multifile.carbon

@@ -19,8 +19,11 @@ a;
 // CHECK:STDOUT:   tokens: [
 // CHECK:STDOUT:     { index: 0, kind: 'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
 b;
-// CHECK:STDOUT:     { index: 1, kind:  'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'b', identifier: 0 },
+// CHECK:STDOUT:     { index: 1, kind:  'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'b', identifier: 1 },
 // CHECK:STDOUT:     { index: 2, kind:        'Semi', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: ';', has_trailing_space: true },
+a;
+// CHECK:STDOUT:     { index: 3, kind:  'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'a', identifier: 0 },
+// CHECK:STDOUT:     { index: 4, kind:        'Semi', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: ';', has_trailing_space: true },
 
-// CHECK:STDOUT:     { index: 3, kind:   'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
+// CHECK:STDOUT:     { index: 5, kind:   'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
 // CHECK:STDOUT:   ]

+ 40 - 62
toolchain/lex/tokenized_buffer.cpp

@@ -16,6 +16,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/lex/character_set.h"
 #include "toolchain/lex/helpers.h"
 #include "toolchain/lex/numeric_literal.h"
@@ -272,8 +273,9 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
     bool formed_token_;
   };
 
-  Lexer(SourceBuffer& source, DiagnosticConsumer& consumer)
-      : buffer_(source),
+  Lexer(SharedValueStores& value_stores, SourceBuffer& source,
+        DiagnosticConsumer& consumer)
+      : buffer_(value_stores, source),
         consumer_(consumer),
         translator_(&buffer_),
         emitter_(translator_, consumer_),
@@ -534,21 +536,20 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
           auto token = buffer_.AddToken({.kind = TokenKind::IntegerLiteral,
                                          .token_line = current_line(),
                                          .column = int_column});
-          buffer_.GetTokenInfo(token).literal_index =
-              buffer_.literal_int_storage_.size();
-          buffer_.literal_int_storage_.push_back(std::move(value.value));
+          buffer_.GetTokenInfo(token).integer_id =
+              buffer_.value_stores_->integers().Add(std::move(value.value));
           return token;
         },
         [&](NumericLiteral::RealValue&& value) {
           auto token = buffer_.AddToken({.kind = TokenKind::RealLiteral,
                                          .token_line = current_line(),
                                          .column = int_column});
-          buffer_.GetTokenInfo(token).literal_index =
-              buffer_.literal_int_storage_.size();
-          buffer_.literal_int_storage_.push_back(std::move(value.mantissa));
-          buffer_.literal_int_storage_.push_back(std::move(value.exponent));
-          CARBON_CHECK(buffer_.GetRealLiteral(token).is_decimal ==
-                       (value.radix == NumericLiteral::Radix::Decimal));
+          buffer_.GetTokenInfo(token).real_id =
+              buffer_.value_stores_->reals().Add(
+                  Real{.mantissa = value.mantissa,
+                       .exponent = value.exponent,
+                       .is_decimal =
+                           (value.radix == NumericLiteral::Radix::Decimal)});
           return token;
         },
         [&](NumericLiteral::UnrecoverableError) {
@@ -588,14 +589,16 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
     }
 
     if (literal->is_terminated()) {
-      auto token =
-          buffer_.AddToken({.kind = TokenKind::StringLiteral,
-                            .token_line = string_line,
-                            .column = string_column,
-                            .literal_index = static_cast<int32_t>(
-                                buffer_.literal_string_storage_.size())});
-      buffer_.literal_string_storage_.push_back(
-          literal->ComputeValue(emitter_));
+      // TODO: Refactor to reduce copies.
+      // https://github.com/carbon-language/carbon-lang/pull/3311#discussion_r1366048360
+      buffer_.computed_strings_.push_back(
+          std::make_unique<std::string>(literal->ComputeValue(emitter_)));
+      auto string_id = buffer_.value_stores_->strings().Add(
+          *buffer_.computed_strings_.back());
+      auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral,
+                                     .token_line = string_line,
+                                     .column = string_column,
+                                     .string_id = string_id});
       return token;
     } else {
       CARBON_DIAGNOSTIC(UnterminatedString, Error,
@@ -745,9 +748,8 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
     auto token = buffer_.AddToken(
         {.kind = *kind, .token_line = current_line(), .column = column});
-    buffer_.GetTokenInfo(token).literal_index =
-        buffer_.literal_int_storage_.size();
-    buffer_.literal_int_storage_.push_back(std::move(suffix_value));
+    buffer_.GetTokenInfo(token).integer_id =
+        buffer_.value_stores_->integers().Add(std::move(suffix_value));
     return token;
   }
 
@@ -792,15 +794,6 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
     } while (!open_groups_.empty());
   }
 
-  auto GetOrCreateIdentifier(llvm::StringRef text) -> Identifier {
-    auto insert_result = buffer_.identifier_map_.insert(
-        {text, Identifier(buffer_.identifier_infos_.size())});
-    if (insert_result.second) {
-      buffer_.identifier_infos_.push_back({text});
-    }
-    return insert_result.first->second;
-  }
-
   auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
       -> LexResult {
     if (static_cast<unsigned char>(source_text[position]) > 0x7F) {
@@ -835,10 +828,11 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
     }
 
     // Otherwise we have a generic identifier.
-    return buffer_.AddToken({.kind = TokenKind::Identifier,
-                             .token_line = current_line(),
-                             .column = column,
-                             .id = GetOrCreateIdentifier(identifier_text)});
+    return buffer_.AddToken(
+        {.kind = TokenKind::Identifier,
+         .token_line = current_line(),
+         .column = column,
+         .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
   }
 
   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
@@ -1155,9 +1149,9 @@ constexpr std::array<TokenKind, 256>
       return table;
     }();
 
-auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
-    -> TokenizedBuffer {
-  Lexer lexer(source, consumer);
+auto TokenizedBuffer::Lex(SharedValueStores& value_stores, SourceBuffer& source,
+                          DiagnosticConsumer& consumer) -> TokenizedBuffer {
+  Lexer lexer(value_stores, source, consumer);
   return std::move(lexer).Lex();
 }
 
@@ -1229,50 +1223,39 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
   }
 
   CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
-  return GetIdentifierText(token_info.id);
+  return value_stores_->strings().Get(token_info.string_id);
 }
 
-auto TokenizedBuffer::GetIdentifier(Token token) const -> Identifier {
+auto TokenizedBuffer::GetIdentifier(Token token) const -> StringId {
   const auto& token_info = GetTokenInfo(token);
   CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
-  return token_info.id;
+  return token_info.string_id;
 }
 
 auto TokenizedBuffer::GetIntegerLiteral(Token token) const
     -> const llvm::APInt& {
   const auto& token_info = GetTokenInfo(token);
   CARBON_CHECK(token_info.kind == TokenKind::IntegerLiteral) << token_info.kind;
-  return literal_int_storage_[token_info.literal_index];
+  return value_stores_->integers().Get(token_info.integer_id);
 }
 
-auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
+auto TokenizedBuffer::GetRealLiteral(Token token) const -> Real {
   const auto& token_info = GetTokenInfo(token);
   CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
-
-  // Note that every real literal is at least three characters long, so we can
-  // safely look at the second character to determine whether we have a
-  // decimal or hexadecimal literal.
-  const auto& line_info = GetLineInfo(token_info.token_line);
-  int64_t token_start = line_info.start + token_info.column;
-  char second_char = source_->text()[token_start + 1];
-  bool is_decimal = second_char != 'x' && second_char != 'b';
-
-  return {.mantissa = literal_int_storage_[token_info.literal_index],
-          .exponent = literal_int_storage_[token_info.literal_index + 1],
-          .is_decimal = is_decimal};
+  return value_stores_->reals().Get(token_info.real_id);
 }
 
 auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
   const auto& token_info = GetTokenInfo(token);
   CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
-  return literal_string_storage_[token_info.literal_index];
+  return value_stores_->strings().Get(token_info.string_id);
 }
 
 auto TokenizedBuffer::GetTypeLiteralSize(Token token) const
     -> const llvm::APInt& {
   const auto& token_info = GetTokenInfo(token);
   CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
-  return literal_int_storage_[token_info.literal_index];
+  return value_stores_->integers().Get(token_info.integer_id);
 }
 
 auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
@@ -1323,11 +1306,6 @@ auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
   return GetLineInfo(line).indent + 1;
 }
 
-auto TokenizedBuffer::GetIdentifierText(Identifier identifier) const
-    -> llvm::StringRef {
-  return identifier_infos_[identifier.index].text;
-}
-
 auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
   index = std::max(widths.index, index);
   kind = std::max(widths.kind, kind);

+ 19 - 68
toolchain/lex/tokenized_buffer.h

@@ -10,13 +10,13 @@
 
 #include "common/ostream.h"
 #include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/raw_ostream.h"
 #include "toolchain/base/index_base.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/lex/token_kind.h"
 #include "toolchain/source/source_buffer.h"
@@ -61,25 +61,6 @@ struct Line : public ComparableIndexBase {
 
 constexpr Line Line::Invalid(Line::InvalidIndex);
 
-// A lightweight handle to a lexed identifier in a `TokenizedBuffer`.
-//
-// `Identifier` objects are designed to be passed by value, not reference or
-// pointer. They are also designed to be small and efficient to store in data
-// structures.
-//
-// Each identifier lexed is canonicalized to a single entry in the identifier
-// table. `Identifier` objects will compare equal if they refer to the same
-// identifier spelling. Where the identifier was written is not preserved.
-//
-// All other APIs to query a `Identifier` are on the `TokenizedBuffer`.
-struct Identifier : public IndexBase {
-  using IndexBase::IndexBase;
-
-  static const Identifier Invalid;
-};
-
-constexpr Identifier Identifier::Invalid = Identifier(Identifier::InvalidIndex);
-
 // Random-access iterator over tokens within the buffer.
 class TokenIterator
     : public llvm::iterator_facade_base<
@@ -122,31 +103,6 @@ class TokenIterator
   Token token_;
 };
 
-// The value of a real literal.
-//
-// This is either a dyadic fraction (mantissa * 2^exponent) or a decadic
-// fraction (mantissa * 10^exponent).
-//
-// `RealLiteralValue` carries a reference back to `TokenizedBuffer` which can be
-// invalidated if the buffer is edited or destroyed.
-class RealLiteralValue : public Printable<RealLiteralValue> {
- public:
-  auto Print(llvm::raw_ostream& output_stream) const -> void {
-    mantissa.print(output_stream, /*isSigned=*/false);
-    output_stream << "*" << (is_decimal ? "10" : "2") << "^" << exponent;
-  }
-
-  // The mantissa, represented as an unsigned integer.
-  const llvm::APInt& mantissa;
-
-  // The exponent, represented as a signed integer.
-  const llvm::APInt& exponent;
-
-  // If false, the value is mantissa * 2^exponent.
-  // If true, the value is mantissa * 10^exponent.
-  bool is_decimal;
-};
-
 // A diagnostic location translator that maps token locations into source
 // buffer locations.
 class TokenLocationTranslator : public DiagnosticLocationTranslator<Token> {
@@ -175,8 +131,8 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   //
   // The provided source buffer must outlive any returned `TokenizedBuffer`
   // which will refer into the source.
-  static auto Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
-      -> TokenizedBuffer;
+  static auto Lex(SharedValueStores& value_stores, SourceBuffer& source,
+                  DiagnosticConsumer& consumer) -> TokenizedBuffer;
 
   [[nodiscard]] auto GetKind(Token token) const -> TokenKind;
   [[nodiscard]] auto GetLine(Token token) const -> Line;
@@ -192,13 +148,13 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
   // Returns the identifier associated with this token. The token kind must be
   // an `Identifier`.
-  [[nodiscard]] auto GetIdentifier(Token token) const -> Identifier;
+  [[nodiscard]] auto GetIdentifier(Token token) const -> StringId;
 
   // Returns the value of an `IntegerLiteral()` token.
   [[nodiscard]] auto GetIntegerLiteral(Token token) const -> const llvm::APInt&;
 
   // Returns the value of a `RealLiteral()` token.
-  [[nodiscard]] auto GetRealLiteral(Token token) const -> RealLiteralValue;
+  [[nodiscard]] auto GetRealLiteral(Token token) const -> Real;
 
   // Returns the value of a `StringLiteral()` token.
   [[nodiscard]] auto GetStringLiteral(Token token) const -> llvm::StringRef;
@@ -239,9 +195,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   // Returns the previous line handle.
   [[nodiscard]] auto GetPrevLine(Line line) const -> Line;
 
-  // Returns the text for an identifier.
-  [[nodiscard]] auto GetIdentifierText(Identifier id) const -> llvm::StringRef;
-
   // Prints a description of the tokenized stream to the provided `raw_ostream`.
   //
   // It prints one line of information for each token in the buffer, including
@@ -320,6 +273,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     int indent;
   };
 
+  struct LiteralStringId : public IndexBase {
+    using IndexBase::IndexBase;
+  };
+
   struct TokenInfo {
     TokenKind kind;
 
@@ -341,8 +298,9 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
           sizeof(Token) <= sizeof(int32_t),
           "Unable to pack token and identifier index into the same space!");
 
-      Identifier id = Identifier::Invalid;
-      int32_t literal_index;
+      StringId string_id = StringId::Invalid;
+      IntegerId integer_id;
+      RealId real_id;
       Token closing_token;
       Token opening_token;
       int32_t error_length;
@@ -373,15 +331,13 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     int32_t indent;
   };
 
-  struct IdentifierInfo {
-    llvm::StringRef text;
-  };
-
   // The constructor is merely responsible for trivial initialization of
   // members. A working object of this type is built with the `lex` function
   // above so that its return can indicate if an error was encountered while
   // lexing.
-  explicit TokenizedBuffer(SourceBuffer& source) : source_(&source) {}
+  explicit TokenizedBuffer(SharedValueStores& value_stores,
+                           SourceBuffer& source)
+      : value_stores_(&value_stores), source_(&source) {}
 
   auto GetLineInfo(Line line) -> LineInfo&;
   [[nodiscard]] auto GetLineInfo(Line line) const -> const LineInfo&;
@@ -393,21 +349,16 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto PrintToken(llvm::raw_ostream& output_stream, Token token,
                   PrintWidths widths) const -> void;
 
+  SharedValueStores* value_stores_;
   SourceBuffer* source_;
 
   llvm::SmallVector<TokenInfo> token_infos_;
 
   llvm::SmallVector<LineInfo> line_infos_;
 
-  llvm::SmallVector<IdentifierInfo> identifier_infos_;
-
-  // Storage for integers that form part of the value of a numeric or type
-  // literal.
-  llvm::SmallVector<llvm::APInt> literal_int_storage_;
-
-  llvm::SmallVector<std::string> literal_string_storage_;
-
-  llvm::DenseMap<llvm::StringRef, Identifier> identifier_map_;
+  // Stores the computed value of string literals so that StringRefs are
+  // durable.
+  llvm::SmallVector<std::unique_ptr<std::string>> computed_strings_;
 
   // The number of parse tree nodes that we expect to be created for the tokens
   // in this buffer.

+ 4 - 2
toolchain/lex/tokenized_buffer_benchmark.cpp

@@ -11,6 +11,7 @@
 #include "common/check.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/StringExtras.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
 #include "toolchain/lex/token_kind.h"
@@ -374,14 +375,14 @@ class LexerBenchHelper {
 
   auto Lex() -> TokenizedBuffer {
     DiagnosticConsumer& consumer = NullDiagnosticConsumer();
-    return TokenizedBuffer::Lex(source_, consumer);
+    return TokenizedBuffer::Lex(value_stores_, source_, consumer);
   }
 
   auto DiagnoseErrors() -> std::string {
     std::string result;
     llvm::raw_string_ostream out(result);
     StreamDiagnosticConsumer consumer(out);
-    auto buffer = TokenizedBuffer::Lex(source_, consumer);
+    auto buffer = TokenizedBuffer::Lex(value_stores_, source_, consumer);
     consumer.Flush();
     CARBON_CHECK(buffer.has_errors())
         << "Asked to diagnose errors but none found!";
@@ -398,6 +399,7 @@ class LexerBenchHelper {
         fs_, filename_, ConsoleDiagnosticConsumer()));
   }
 
+  SharedValueStores value_stores_;
   llvm::vfs::InMemoryFileSystem fs_;
   std::string filename_ = "test.carbon";
   SourceBuffer source_;

+ 4 - 1
toolchain/lex/tokenized_buffer_fuzzer.cpp

@@ -6,6 +6,7 @@
 
 #include "common/check.h"
 #include "llvm/ADT/StringRef.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
 #include "toolchain/lex/tokenized_buffer.h"
 
@@ -33,7 +34,9 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
   auto source =
       SourceBuffer::CreateFromFile(fs, TestFileName, NullDiagnosticConsumer());
 
-  auto buffer = Lex::TokenizedBuffer::Lex(*source, NullDiagnosticConsumer());
+  SharedValueStores value_stores;
+  auto buffer = Lex::TokenizedBuffer::Lex(value_stores, *source,
+                                          NullDiagnosticConsumer());
   if (buffer.has_errors()) {
     return 0;
   }

+ 7 - 4
toolchain/lex/tokenized_buffer_test.cpp

@@ -12,6 +12,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "testing/base/test_raw_ostream.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/mocks.h"
 #include "toolchain/lex/tokenized_buffer_test_helpers.h"
@@ -45,9 +46,10 @@ class LexerTest : public ::testing::Test {
   auto Lex(llvm::StringRef text,
            DiagnosticConsumer& consumer = ConsoleDiagnosticConsumer())
       -> TokenizedBuffer {
-    return TokenizedBuffer::Lex(GetSourceBuffer(text), consumer);
+    return TokenizedBuffer::Lex(value_stores_, GetSourceBuffer(text), consumer);
   }
 
+  SharedValueStores value_stores_;
   llvm::vfs::InMemoryFileSystem fs_;
   int file_index_ = 0;
   std::forward_list<SourceBuffer> source_storage_;
@@ -439,7 +441,8 @@ TEST_F(LexerTest, MatchingGroups) {
     auto it = ++buffer.tokens().begin();
     auto open_paren_token = *it++;
     auto open_curly_token = *it++;
-    ASSERT_EQ("x", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
+
+    ASSERT_EQ("x", value_stores_.strings().Get(buffer.GetIdentifier(*it++)));
     auto close_curly_token = *it++;
     auto close_paren_token = *it++;
     EXPECT_EQ(close_paren_token,
@@ -453,7 +456,7 @@ TEST_F(LexerTest, MatchingGroups) {
 
     open_curly_token = *it++;
     open_paren_token = *it++;
-    ASSERT_EQ("y", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
+    ASSERT_EQ("y", value_stores_.strings().Get(buffer.GetIdentifier(*it++)));
     close_paren_token = *it++;
     close_curly_token = *it++;
     EXPECT_EQ(close_curly_token,
@@ -469,7 +472,7 @@ TEST_F(LexerTest, MatchingGroups) {
     auto inner_open_curly_token = *it++;
     open_paren_token = *it++;
     auto inner_open_paren_token = *it++;
-    ASSERT_EQ("z", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
+    ASSERT_EQ("z", value_stores_.strings().Get(buffer.GetIdentifier(*it++)));
     auto inner_close_paren_token = *it++;
     close_paren_token = *it++;
     auto inner_close_curly_token = *it++;

+ 2 - 0
toolchain/parse/BUILD

@@ -71,6 +71,7 @@ cc_test(
         "//common:ostream",
         "//testing/base:gtest_main",
         "//testing/base:test_raw_ostream",
+        "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:mocks",
         "//toolchain/lex:tokenized_buffer",
@@ -88,6 +89,7 @@ cc_fuzz_test(
     deps = [
         ":tree",
         "//common:check",
+        "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:null_diagnostics",
         "//toolchain/lex:tokenized_buffer",

+ 4 - 1
toolchain/parse/parse_fuzzer.cpp

@@ -6,6 +6,7 @@
 #include <cstring>
 
 #include "llvm/ADT/StringRef.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
 #include "toolchain/lex/tokenized_buffer.h"
 #include "toolchain/parse/tree.h"
@@ -31,7 +32,9 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
       SourceBuffer::CreateFromFile(fs, TestFileName, NullDiagnosticConsumer());
 
   // Lex the input.
-  auto tokens = Lex::TokenizedBuffer::Lex(*source, NullDiagnosticConsumer());
+  SharedValueStores value_stores;
+  auto tokens = Lex::TokenizedBuffer::Lex(value_stores, *source,
+                                          NullDiagnosticConsumer());
   if (tokens.has_errors()) {
     return 0;
   }

+ 21 - 19
toolchain/parse/tree_test.cpp

@@ -10,6 +10,7 @@
 #include <forward_list>
 
 #include "testing/base/test_raw_ostream.h"
+#include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/mocks.h"
 #include "toolchain/lex/tokenized_buffer.h"
@@ -27,34 +28,35 @@ namespace Yaml = ::Carbon::Testing::Yaml;
 class TreeTest : public ::testing::Test {
  protected:
   auto GetSourceBuffer(llvm::StringRef t) -> SourceBuffer& {
-    CARBON_CHECK(fs.addFile("test.carbon", /*ModificationTime=*/0,
-                            llvm::MemoryBuffer::getMemBuffer(t)));
-    source_storage.push_front(
-        std::move(*SourceBuffer::CreateFromFile(fs, "test.carbon", consumer)));
-    return source_storage.front();
+    CARBON_CHECK(fs_.addFile("test.carbon", /*ModificationTime=*/0,
+                             llvm::MemoryBuffer::getMemBuffer(t)));
+    source_storage_.push_front(std::move(
+        *SourceBuffer::CreateFromFile(fs_, "test.carbon", consumer_)));
+    return source_storage_.front();
   }
 
   auto GetTokenizedBuffer(llvm::StringRef t) -> Lex::TokenizedBuffer& {
-    token_storage.push_front(
-        Lex::TokenizedBuffer::Lex(GetSourceBuffer(t), consumer));
-    return token_storage.front();
+    token_storage_.push_front(Lex::TokenizedBuffer::Lex(
+        value_stores_, GetSourceBuffer(t), consumer_));
+    return token_storage_.front();
   }
 
-  llvm::vfs::InMemoryFileSystem fs;
-  std::forward_list<SourceBuffer> source_storage;
-  std::forward_list<Lex::TokenizedBuffer> token_storage;
-  DiagnosticConsumer& consumer = ConsoleDiagnosticConsumer();
+  SharedValueStores value_stores_;
+  llvm::vfs::InMemoryFileSystem fs_;
+  std::forward_list<SourceBuffer> source_storage_;
+  std::forward_list<Lex::TokenizedBuffer> token_storage_;
+  DiagnosticConsumer& consumer_ = ConsoleDiagnosticConsumer();
 };
 
 TEST_F(TreeTest, IsValid) {
-  Lex::TokenizedBuffer tokens = GetTokenizedBuffer("");
-  Tree tree = Tree::Parse(tokens, consumer, /*vlog_stream=*/nullptr);
+  Lex::TokenizedBuffer& tokens = GetTokenizedBuffer("");
+  Tree tree = Tree::Parse(tokens, consumer_, /*vlog_stream=*/nullptr);
   EXPECT_TRUE((*tree.postorder().begin()).is_valid());
 }
 
 TEST_F(TreeTest, PrintPostorderAsYAML) {
-  Lex::TokenizedBuffer tokens = GetTokenizedBuffer("fn F();");
-  Tree tree = Tree::Parse(tokens, consumer, /*vlog_stream=*/nullptr);
+  Lex::TokenizedBuffer& tokens = GetTokenizedBuffer("fn F();");
+  Tree tree = Tree::Parse(tokens, consumer_, /*vlog_stream=*/nullptr);
   EXPECT_FALSE(tree.has_errors());
   TestRawOstream print_stream;
   tree.Print(print_stream);
@@ -80,8 +82,8 @@ TEST_F(TreeTest, PrintPostorderAsYAML) {
 }
 
 TEST_F(TreeTest, PrintPreorderAsYAML) {
-  Lex::TokenizedBuffer tokens = GetTokenizedBuffer("fn F();");
-  Tree tree = Tree::Parse(tokens, consumer, /*vlog_stream=*/nullptr);
+  Lex::TokenizedBuffer& tokens = GetTokenizedBuffer("fn F();");
+  Tree tree = Tree::Parse(tokens, consumer_, /*vlog_stream=*/nullptr);
   EXPECT_FALSE(tree.has_errors());
   TestRawOstream print_stream;
   tree.Print(print_stream, /*preorder=*/true);
@@ -123,7 +125,7 @@ TEST_F(TreeTest, HighRecursion) {
   code.append(10000, '(');
   code.append(10000, ')');
   code += "; }";
-  Lex::TokenizedBuffer tokens = GetTokenizedBuffer(code);
+  Lex::TokenizedBuffer& tokens = GetTokenizedBuffer(code);
   ASSERT_FALSE(tokens.has_errors());
   Testing::MockDiagnosticConsumer consumer;
   Tree tree = Tree::Parse(tokens, consumer, /*vlog_stream=*/nullptr);

+ 1 - 0
toolchain/sem_ir/file.h

@@ -5,6 +5,7 @@
 #ifndef CARBON_TOOLCHAIN_SEM_IR_FILE_H_
 #define CARBON_TOOLCHAIN_SEM_IR_FILE_H_
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/iterator_range.h"

+ 2 - 29
toolchain/sem_ir/node.h

@@ -115,15 +115,6 @@ struct BoolValue : public IndexBase, public Printable<BoolValue> {
 constexpr BoolValue BoolValue::False = BoolValue(0);
 constexpr BoolValue BoolValue::True = BoolValue(1);
 
-// The ID of an integer value.
-struct IntegerId : public IndexBase, public Printable<IntegerId> {
-  using IndexBase::IndexBase;
-  auto Print(llvm::raw_ostream& out) const -> void {
-    out << "int";
-    IndexBase::Print(out);
-  }
-};
-
 // The ID of a name scope.
 struct NameScopeId : public IndexBase, public Printable<NameScopeId> {
   // An explicitly invalid ID.
@@ -167,24 +158,6 @@ constexpr NodeBlockId NodeBlockId::Invalid =
 constexpr NodeBlockId NodeBlockId::Unreachable =
     NodeBlockId(NodeBlockId::InvalidIndex - 1);
 
-// The ID of a real number value.
-struct RealId : public IndexBase, public Printable<RealId> {
-  using IndexBase::IndexBase;
-  auto Print(llvm::raw_ostream& out) const -> void {
-    out << "real";
-    IndexBase::Print(out);
-  }
-};
-
-// The ID of a string.
-struct StringId : public IndexBase, public Printable<StringId> {
-  using IndexBase::IndexBase;
-  auto Print(llvm::raw_ostream& out) const -> void {
-    out << "str";
-    IndexBase::Print(out);
-  }
-};
-
 // The ID of a node block.
 struct TypeId : public IndexBase, public Printable<TypeId> {
   // The builtin TypeType.
@@ -850,7 +823,7 @@ template <>
 struct llvm::DenseMapInfo<Carbon::SemIR::NodeId>
     : public Carbon::SemIR::IdMapInfo<Carbon::SemIR::NodeId> {};
 template <>
-struct llvm::DenseMapInfo<Carbon::SemIR::StringId>
-    : public Carbon::SemIR::IdMapInfo<Carbon::SemIR::StringId> {};
+struct llvm::DenseMapInfo<Carbon::StringId>
+    : public Carbon::SemIR::IdMapInfo<Carbon::StringId> {};
 
 #endif  // CARBON_TOOLCHAIN_SEM_IR_NODE_H_