2 anni fa · a79ea4b28d
--- a/toolchain/lex/token_kind.def
+++ b/toolchain/lex/token_kind.def
@@ -32,6 +32,10 @@
 
				 #define CARBON_TOKEN(Name)
			
 
				 #endif
			
 
				 
			
 
				+// The error token comes first because we want it to get the zero value, which
			
 
				+// will also be used in default initialization.
			
 
				+CARBON_TOKEN(Error)
			
 
				+
			
 
				 #ifndef CARBON_SYMBOL_TOKEN
			
 
				 #define CARBON_SYMBOL_TOKEN(Name, Spelling) CARBON_TOKEN(Name)
			
 
				 #endif
			
@@ -209,7 +213,6 @@ CARBON_TOKEN(StringLiteral)
 
				 CARBON_TOKEN(IntegerTypeLiteral)
			
 
				 CARBON_TOKEN(UnsignedIntegerTypeLiteral)
			
 
				 CARBON_TOKEN(FloatingPointTypeLiteral)
			
 
				-CARBON_TOKEN(Error)
			
 
				 CARBON_TOKEN(StartOfFile)
			
 
				 CARBON_TOKEN(EndOfFile)
			
 
				 
			
--- a/toolchain/lex/token_kind.h
+++ b/toolchain/lex/token_kind.h
@@ -28,6 +28,8 @@ class TokenKind : public CARBON_ENUM_BASE(TokenKind) {
 
				   // An array of all the keyword tokens.
			
 
				   static const llvm::ArrayRef<TokenKind> KeywordTokens;
			
 
				 
			
 
				+  using EnumBase::EnumBase;
			
 
				+
			
 
				   // Test whether this kind of token is a simple symbol sequence (punctuation,
			
 
				   // not letters) that appears directly in the source text and can be
			
 
				   // unambiguously lexed with `starts_with` logic. While these may appear
			
--- a/toolchain/lex/tokenized_buffer.cpp
+++ b/toolchain/lex/tokenized_buffer.cpp
@@ -464,77 +464,106 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				     }
			
 
				   }
			
 
				 
			
 
				-  auto LexSymbolToken(llvm::StringRef& source_text,
			
 
				-                      TokenKind kind = TokenKind::Error) -> LexResult {
			
 
				-    auto compute_symbol_kind = [](llvm::StringRef source_text) {
			
 
				-      return llvm::StringSwitch<TokenKind>(source_text)
			
 
				-#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
			
 
				-  .StartsWith(Spelling, TokenKind::Name)
			
 
				-#include "toolchain/lex/token_kind.def"
			
 
				-          .Default(TokenKind::Error);
			
 
				-    };
			
 
				-
			
 
				-    // We use the `error` token as a place-holder for cases where one character
			
 
				-    // isn't enough to pick a definitive symbol token. Recompute the kind using
			
 
				-    // the full symbol set.
			
 
				-    if (LLVM_UNLIKELY(kind == TokenKind::Error)) {
			
 
				-      kind = compute_symbol_kind(source_text);
			
 
				-      if (kind == TokenKind::Error) {
			
 
				-        return LexError(source_text);
			
 
				-      }
			
 
				-    } else {
			
 
				-      // Verify in a debug build that the incoming token kind is correct.
			
 
				-      CARBON_DCHECK(kind == compute_symbol_kind(source_text))
			
 
				-          << "Incoming token kind '" << kind
			
 
				-          << "' does not match computed kind '"
			
 
				-          << compute_symbol_kind(source_text) << "'!";
			
 
				-    }
			
 
				+  auto LexOneCharSymbolToken(llvm::StringRef& source_text, TokenKind kind)
			
 
				+      -> Token {
			
 
				+    // Verify in a debug build that the incoming token kind is correct.
			
 
				+    CARBON_DCHECK(kind != TokenKind::Error);
			
 
				+    CARBON_DCHECK(kind.fixed_spelling().size() == 1);
			
 
				+    CARBON_DCHECK(source_text.front() == kind.fixed_spelling().front())
			
 
				+        << "Source text starts with '" << source_text.front()
			
 
				+        << "' instead of the spelling '" << kind.fixed_spelling()
			
 
				+        << "' of the incoming token kind '" << kind << "'";
			
 
				 
			
 
				     if (!set_indent_) {
			
 
				       current_line_info_->indent = current_column_;
			
 
				       set_indent_ = true;
			
 
				     }
			
 
				 
			
 
				-    CloseInvalidOpenGroups(kind);
			
 
				-
			
 
				-    const char* location = source_text.begin();
			
 
				     Token token = buffer_->AddToken(
			
 
				         {.kind = kind, .token_line = current_line_, .column = current_column_});
			
 
				-    current_column_ += kind.fixed_spelling().size();
			
 
				-    source_text = source_text.drop_front(kind.fixed_spelling().size());
			
 
				+    ++current_column_;
			
 
				+    source_text = source_text.drop_front();
			
 
				+    return token;
			
 
				+  }
			
 
				+
			
 
				+  auto LexOpeningSymbolToken(llvm::StringRef& source_text, TokenKind kind)
			
 
				+      -> LexResult {
			
 
				+    Token token = LexOneCharSymbolToken(source_text, kind);
			
 
				+    open_groups_.push_back(token);
			
 
				+    return token;
			
 
				+  }
			
 
				 
			
 
				-    // Opening symbols just need to be pushed onto our queue of opening groups.
			
 
				-    if (kind.is_opening_symbol()) {
			
 
				-      open_groups_.push_back(token);
			
 
				+  auto LexClosingSymbolToken(llvm::StringRef& source_text, TokenKind kind)
			
 
				+      -> LexResult {
			
 
				+    auto unmatched_error = [&] {
			
 
				+      CARBON_DIAGNOSTIC(
			
 
				+          UnmatchedClosing, Error,
			
 
				+          "Closing symbol without a corresponding opening symbol.");
			
 
				+      emitter_.Emit(source_text.begin(), UnmatchedClosing);
			
 
				+      Token token = buffer_->AddToken({.kind = TokenKind::Error,
			
 
				+                                       .token_line = current_line_,
			
 
				+                                       .column = current_column_,
			
 
				+                                       .error_length = 1});
			
 
				+      ++current_column_;
			
 
				+      source_text = source_text.drop_front();
			
 
				       return token;
			
 
				+    };
			
 
				+
			
 
				+    // If we have no open groups, this is an error.
			
 
				+    if (LLVM_UNLIKELY(open_groups_.empty())) {
			
 
				+      return unmatched_error();
			
 
				     }
			
 
				 
			
 
				-    // Only closing symbols need further special handling.
			
 
				-    if (!kind.is_closing_symbol()) {
			
 
				-      return token;
			
 
				+    Token opening_token = open_groups_.back();
			
 
				+    // Close any invalid open groups first.
			
 
				+    if (LLVM_UNLIKELY(buffer_->GetTokenInfo(opening_token).kind !=
			
 
				+                      kind.opening_symbol())) {
			
 
				+      CloseInvalidOpenGroups(kind);
			
 
				+      // This may exhaust the open groups so re-check and re-error if needed.
			
 
				+      if (open_groups_.empty()) {
			
 
				+        return unmatched_error();
			
 
				+      }
			
 
				+      opening_token = open_groups_.back();
			
 
				+      CARBON_DCHECK(buffer_->GetTokenInfo(opening_token).kind ==
			
 
				+                    kind.opening_symbol());
			
 
				     }
			
 
				+    open_groups_.pop_back();
			
 
				 
			
 
				-    TokenInfo& closing_token_info = buffer_->GetTokenInfo(token);
			
 
				+    // Now that the groups are all matched up, lex the actual token.
			
 
				+    Token token = LexOneCharSymbolToken(source_text, kind);
			
 
				 
			
 
				-    // Check that there is a matching opening symbol before we consume this as
			
 
				-    // a closing symbol.
			
 
				-    if (open_groups_.empty()) {
			
 
				-      closing_token_info.kind = TokenKind::Error;
			
 
				-      closing_token_info.error_length = kind.fixed_spelling().size();
			
 
				+    // Note that it is important to get fresh token infos here as lexing the
			
 
				+    // closing token above may have invalidated any earlier pointers.
			
 
				+    buffer_->GetTokenInfo(opening_token).closing_token = token;
			
 
				+    buffer_->GetTokenInfo(token).opening_token = opening_token;
			
 
				 
			
 
				-      CARBON_DIAGNOSTIC(
			
 
				-          UnmatchedClosing, Error,
			
 
				-          "Closing symbol without a corresponding opening symbol.");
			
 
				-      emitter_.Emit(location, UnmatchedClosing);
			
 
				-      // Note that this still returns true as we do consume a symbol.
			
 
				-      return token;
			
 
				+    return token;
			
 
				+  }
			
 
				+
			
 
				+  auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
			
 
				+    // One-character symbols and grouping symbols are handled with dedicated
			
 
				+    // dispatch. We only lex the multi-character tokens here.
			
 
				+    TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
			
 
				+#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
			
 
				+  .StartsWith(Spelling, TokenKind::Name)
			
 
				+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)
			
 
				+#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName)
			
 
				+#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName)
			
 
				+#include "toolchain/lex/token_kind.def"
			
 
				+                         .Default(TokenKind::Error);
			
 
				+    if (kind == TokenKind::Error) {
			
 
				+      return LexError(source_text);
			
 
				     }
			
 
				 
			
 
				-    // Finally can handle a normal closing symbol.
			
 
				-    Token opening_token = open_groups_.pop_back_val();
			
 
				-    TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
			
 
				-    opening_token_info.closing_token = token;
			
 
				-    closing_token_info.opening_token = opening_token;
			
 
				+    if (!set_indent_) {
			
 
				+      current_line_info_->indent = current_column_;
			
 
				+      set_indent_ = true;
			
 
				+    }
			
 
				+
			
 
				+    Token token = buffer_->AddToken(
			
 
				+        {.kind = kind, .token_line = current_line_, .column = current_column_});
			
 
				+    current_column_ += kind.fixed_spelling().size();
			
 
				+    source_text = source_text.drop_front(kind.fixed_spelling().size());
			
 
				     return token;
			
 
				   }
			
 
				 
			
@@ -587,30 +616,9 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				     return token;
			
 
				   }
			
 
				 
			
 
				-  // Closes all open groups that cannot remain open across the symbol `K`.
			
 
				+  // Closes all open groups that cannot remain open across a closing symbol.
			
 
				   // Users may pass `Error` to close all open groups.
			
 
				-  auto CloseInvalidOpenGroups(TokenKind kind) -> void {
			
 
				-    // There are two common cases that result in nothing to close. Short circuit
			
 
				-    // those here.
			
 
				-    if ((!kind.is_closing_symbol() && kind != TokenKind::Error) ||
			
 
				-        open_groups_.empty()) {
			
 
				-      return;
			
 
				-    }
			
 
				-
			
 
				-    // Also check the first open group token to see if it matches this closing
			
 
				-    // token, in which case there is nothing to do. This is redundant with the
			
 
				-    // work inside the main loop, but we peel it out to allow inlining.
			
 
				-    Token opening_token = open_groups_.back();
			
 
				-    TokenKind opening_kind = buffer_->GetTokenInfo(opening_token).kind;
			
 
				-    if (kind == opening_kind.closing_symbol()) {
			
 
				-      return;
			
 
				-    }
			
 
				-
			
 
				-    // Otherwise, delegate to a separate function to help with inlining.
			
 
				-    CloseInvalidOpenGroupsSlow(kind);
			
 
				-  }
			
 
				-
			
 
				-  [[gnu::noinline]] auto CloseInvalidOpenGroupsSlow(TokenKind kind) -> void {
			
 
				+  [[gnu::noinline]] auto CloseInvalidOpenGroups(TokenKind kind) -> void {
			
 
				     CARBON_CHECK(kind.is_closing_symbol() || kind == TokenKind::Error);
			
 
				     CARBON_CHECK(!open_groups_.empty());
			
 
				 
			
@@ -770,7 +778,9 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				 
			
 
				     // Close any open groups. We do this after marking whitespace, it will
			
 
				     // preserve that.
			
 
				-    CloseInvalidOpenGroups(TokenKind::Error);
			
 
				+    if (!open_groups_.empty()) {
			
 
				+      CloseInvalidOpenGroups(TokenKind::Error);
			
 
				+    }
			
 
				 
			
 
				     buffer_->AddToken({.kind = TokenKind::EndOfFile,
			
 
				                        .token_line = current_line_,
			
@@ -818,14 +828,19 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				   CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
			
 
				   CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
			
 
				 
			
 
				-  // A custom dispatch function that pre-selects a symbol token to lex.
			
 
				-  template <const TokenKind& Token>
			
 
				-  static auto DispatchLexOneCharSymbol(Lexer& lexer,
			
 
				-                                       llvm::StringRef& source_text) -> void {
			
 
				-    LexResult result = lexer.LexSymbolToken(source_text, Token);
			
 
				-    CARBON_CHECK(result) << "Failed to form a token!";
			
 
				-    [[clang::musttail]] return DispatchNext(lexer, source_text);
			
 
				+  // Custom dispatch functions that pre-select the symbol token to lex.
			
 
				+#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                          \
			
 
				+  static auto Dispatch##LexMethod##SymbolToken(Lexer& lexer,                 \
			
 
				+                                               llvm::StringRef& source_text) \
			
 
				+      ->void {                                                               \
			
 
				+    LexResult result = lexer.LexMethod##SymbolToken(                         \
			
 
				+        source_text, OneCharTokenKindTable[source_text.front()]);            \
			
 
				+    CARBON_CHECK(result) << "Failed to form a token!";                       \
			
 
				+    [[clang::musttail]] return DispatchNext(lexer, source_text);             \
			
 
				   }
			
 
				+  CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOneChar)
			
 
				+  CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOpening)
			
 
				+  CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
			
 
				 
			
 
				   // Define a set of non-token dispatch functions that handle things like
			
 
				   // whitespace and comments.
			
@@ -915,7 +930,11 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				     // needs to override some of the generic handling above, and provide a
			
 
				     // custom token.
			
 
				 #define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
			
 
				-  table[(Spelling)[0]] = &DispatchLexOneCharSymbol<TokenKind::TokenName>;
			
 
				+  table[(Spelling)[0]] = &DispatchLexOneCharSymbolToken;
			
 
				+#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
			
 
				+  table[(Spelling)[0]] = &DispatchLexOpeningSymbolToken;
			
 
				+#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
			
 
				+  table[(Spelling)[0]] = &DispatchLexClosingSymbolToken;
			
 
				 #include "toolchain/lex/token_kind.def"
			
 
				 
			
 
				     // Override the handling for `/` to consider comments as well as a `/`
			
@@ -956,6 +975,8 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				 
			
 
				   static const DispatchTableT DispatchTable;
			
 
				 
			
 
				+  static const std::array<TokenKind, 256> OneCharTokenKindTable;
			
 
				+
			
 
				   TokenizedBuffer* buffer_;
			
 
				 
			
 
				   SourceBufferLocationTranslator translator_;
			
@@ -976,6 +997,19 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				 constexpr TokenizedBuffer::Lexer::DispatchTableT
			
 
				     TokenizedBuffer::Lexer::DispatchTable = MakeDispatchTable();
			
 
				 
			
 
				+constexpr std::array<TokenKind, 256>
			
 
				+    TokenizedBuffer::Lexer::OneCharTokenKindTable = [] {
			
 
				+      std::array<TokenKind, 256> table = {};
			
 
				+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
			
 
				+  table[(Spelling)[0]] = TokenKind::TokenName;
			
 
				+#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
			
 
				+  table[(Spelling)[0]] = TokenKind::TokenName;
			
 
				+#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
			
 
				+  table[(Spelling)[0]] = TokenKind::TokenName;
			
 
				+#include "toolchain/lex/token_kind.def"
			
 
				+      return table;
			
 
				+    }();
			
 
				+
			
 
				 auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
			
 
				     -> TokenizedBuffer {
			
 
				   TokenizedBuffer buffer(source);