
Predetermine all the line splits in the lexer. (#3278)

## Summary ##

Restructures the lexer to first scan the entire source text for newlines
and create all the line structures needed. Doing this up-front makes it
easy to produce an optimized version with minimal complexity. Currently,
it leverages the system `memchr`, but even when expanded to handle more
complex cases like CR+LF line endings, being isolated in this way will
result in a significantly simpler implementation. This change also
improves the lexing of comment lines substantially by skipping their
contents immediately. The overhead of the pre-scan is unmeasurable in
all realistic benchmarks, and 10-30% in benchmarks consisting almost
entirely of blank lines or comments. Comment lexing with average-length
comment lines mixed with code improves by 20% and up. Trading the
regression on blank lines and empty comments for the improvement on
non-empty comments seems like the right tradeoff (by far).

## Background and details ##

One weak point in the lexer implementation was large runs of comments.
While those aren't terribly common, they shouldn't present a hazard to
the lexer's performance.

A bit more common is a pattern of comments like the following:

```carbon
  // Some method comment here.
  fn SomeMethodName(...) -> ...;

  // Some other method comment here.
  fn SomeOtherMethodName(...) -> ...;
```

Here, the lexer spends an inordinate amount of time getting from the
`\n` after the first semicolon to the `fn` token. It has to skip a blank
line, scan a line, find the `//` comment start, then scan to find the
next `\n`, and then scan horizontal whitespace, etc.
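
Concretely, each of those steps is a byte-at-a-time loop through the
dispatch machinery. The comment skip, for example, is essentially the
loop the diff below removes:

```cpp
// Old approach: consume the comment one byte at a time until the newline,
// updating the column as we go. Every comment byte pays the loop overhead.
while (!source_text.empty() && source_text.front() != '\n') {
  ++current_column_;
  source_text = source_text.drop_front();
}
```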

It is tempting to build a scanner *exactly* for this. In fact, I built
one, and I can publish it in a PR if folks are interested in what it
looks like. For x86-64, the PSHUFB trick used for scanning identifiers
technically works. But it is *complicated*. Amazingly so: 150 lines of
very subtle code with performance pitfalls at every turn. I felt very
uncomfortable submitting it, but we can always go back to it.
Nothing I've come up with quite matches it for sheer speed.

However, most of the complexity and time is spent walking from a `//` to
the end of the line. And *that* is something we can do very simply. In
fact, there is a tuned function for that in libc: `memchr`. Using this
we can build a very fast and much simpler scanner to split lines
up-front. This PR uses that and a carefully crafted fast loop to first
build up all the line info we need. Getting this to be as fast as
possible required some other subtle changes, for example always creating
a line structure that goes from the last `\n` to the end of the file.
We then back up the EOF token to avoid surfacing this line to users. The
nice thing is that the EOF token isn't part of any hot loop, so this
removes branches everywhere else at a modest complexity cost.
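
As a minimal, self-contained sketch of the up-front splitting (the names
`LineRange` and `SplitLines` are illustrative only; the real
implementation is in the diff below):

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

struct LineRange {
  size_t start;
  size_t length;
};

// Split the source into lines up-front with `memchr`. The final line always
// runs from the last `\n` to the end of the file, even when the file ends
// with a newline; the lexer later backs the EOF token up onto the prior line
// so that synthetic empty line is never surfaced.
auto SplitLines(const char* text, size_t size) -> std::vector<LineRange> {
  std::vector<LineRange> lines;
  size_t start = 0;
  while (const void* nl = std::memchr(text + start, '\n', size - start)) {
    size_t nl_index = static_cast<const char*>(nl) - text;
    lines.push_back({start, nl_index - start});
    start = nl_index + 1;
  }
  lines.push_back({start, size - start});
  return lines;
}
```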

Once we have that, the rest of the lexer just needs to keep track of its
current line in order to record column offsets. I've taken some care to
optimize the lexer's usage of the line structures, but I suspect there
are more opportunities here.

Combined, this gets much, but not all, of the performance of a huge SIMD
scanner for newline-through-to-next-token. For extreme cases (100s of
blank lines or empty comment lines between tokens) the holistic scanner
is of course still much faster, but those cases don't seem nearly worth
the cost.

I was initially worried about the overhead of taking two passes over the
source text, but in practice I've not been able to measure any
appreciable cost to this with realistic source files. In some cases,
benchmarks with no newlines even get *faster*, because as a happenstance
we use a much more efficient approach to fetching the source text into
cache, and that in turn makes the byte-wise dispatched loop run faster
as it stalls less.

I'm particularly happy with this approach because it seems very clear
how to extend this to support CR+LF, bare CR, and even complex mixtures
without any significant speed cost. That wasn't at all true for the
other approaches explored.
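
For instance, here is a hedged sketch, not part of this PR, of how CR+LF
might fold into the same loop while staying on the `memchr` fast path
(reusing `LineRange` from the sketch above):

```cpp
// Sketch: still find lines via memchr for '\n', but trim a preceding '\r'
// from the recorded line length so "\r\n" counts as a single terminator.
// Bare '\r' endings would still need separate detection and diagnostics.
auto SplitLinesCrLf(const char* text, size_t size) -> std::vector<LineRange> {
  std::vector<LineRange> lines;
  size_t start = 0;
  while (const void* nl = std::memchr(text + start, '\n', size - start)) {
    size_t nl_index = static_cast<const char*>(nl) - text;
    size_t length = nl_index - start;
    if (length > 0 && text[nl_index - 1] == '\r') {
      --length;  // Fold the '\r' into the "\r\n" terminator.
    }
    lines.push_back({start, length});
    start = nl_index + 1;
  }
  lines.push_back({start, size - start});
  return lines;
}
```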

I may try some further PRs to smooth out the last bits of slowness here,
but this is already working excellently for me in practice. My 10mloc
test case is down to 2.3s to lex.

## Raw benchmark data ##

Using a tool that runs the benchmarks before and after this change and
analyzes the results, the following summarizes the CPU-time impact; each
benchmark lexes 100k tokens:

```
BM_ValidKeywords                               2.57ms ± 1%  2.58ms ± 0%     ~     (p=0.190 n=5+4)
BM_ValidIdentifiers<1, 64, false>              9.24ms ± 4%  9.31ms ± 4%     ~     (p=0.421 n=5+5)
BM_ValidIdentifiers<1, 1, true>                3.05ms ± 4%  3.11ms ± 4%     ~     (p=0.222 n=5+5)
BM_ValidIdentifiers<3, 5, true>                10.9ms ± 0%  11.1ms ± 1%   +1.76%  (p=0.016 n=4+5)
BM_ValidIdentifiers<3, 16, true>               11.1ms ± 7%  11.0ms ± 1%     ~     (p=0.310 n=5+5)
BM_ValidIdentifiers<12, 64, true>              12.2ms ± 1%  12.3ms ± 2%     ~     (p=0.111 n=4+5)
BM_HorizontalWhitespace/1                      11.2ms ± 6%  11.1ms ± 2%     ~     (p=0.841 n=5+5)
BM_HorizontalWhitespace/4                      12.0ms ± 3%  12.0ms ± 2%     ~     (p=0.548 n=5+5)
BM_HorizontalWhitespace/16                     16.2ms ± 6%  15.9ms ± 8%     ~     (p=0.690 n=5+5)
BM_HorizontalWhitespace/64                     27.7ms ± 3%  28.4ms ± 3%     ~     (p=0.151 n=5+5)
BM_HorizontalWhitespace/128                    44.3ms ± 1%  45.6ms ± 6%   +3.15%  (p=0.032 n=5+5)
BM_RandomSource                                7.75ms ± 2%  7.72ms ± 1%     ~     (p=1.000 n=5+5)
BM_BlankLines/1                                11.7ms ± 1%  12.1ms ± 1%   +3.46%  (p=0.008 n=5+5)
BM_BlankLines/4                                14.0ms ± 2%  15.2ms ± 3%   +8.12%  (p=0.008 n=5+5)
BM_BlankLines/16                               23.5ms ± 2%  31.1ms ± 4%  +32.26%  (p=0.008 n=5+5)
BM_BlankLines/64                               75.3ms ± 1%  81.2ms ± 3%   +7.83%  (p=0.008 n=5+5)
BM_BlankLines/128                               133ms ± 3%   150ms ± 2%  +12.74%  (p=0.008 n=5+5)
BM_CommentLines/1/0/0                          13.1ms ± 0%  13.7ms ± 1%   +5.11%  (p=0.008 n=5+5)
BM_CommentLines/4/0/0                          16.6ms ± 1%  18.2ms ± 4%   +9.56%  (p=0.008 n=5+5)
BM_CommentLines/128/0/0                         169ms ± 4%   182ms ± 1%   +7.24%  (p=0.008 n=5+5)
BM_CommentLines/1/30/0                         18.7ms ± 5%  14.1ms ± 0%  -24.84%  (p=0.008 n=5+5)
BM_CommentLines/4/30/0                         36.5ms ± 6%  20.6ms ± 3%  -43.59%  (p=0.008 n=5+5)
BM_CommentLines/128/30/0                        525ms ± 4%   198ms ± 1%  -62.38%  (p=0.008 n=5+5)
BM_CommentLines/1/70/0                         23.4ms ± 6%  14.7ms ± 2%  -37.15%  (p=0.008 n=5+5)
BM_CommentLines/4/70/0                         53.3ms ± 7%  22.4ms ± 4%  -57.99%  (p=0.008 n=5+5)
BM_CommentLines/128/70/0                        1.05s ± 4%   0.21s ± 2%  -80.31%  (p=0.008 n=5+5)
BM_CommentLines/1/0/2                          14.1ms ± 6%  14.3ms ± 1%     ~     (p=0.151 n=5+5)
BM_CommentLines/4/0/2                          19.4ms ± 5%  20.1ms ± 1%     ~     (p=0.151 n=5+5)
BM_CommentLines/128/0/2                         238ms ± 8%   229ms ± 0%     ~     (p=0.151 n=5+5)
BM_CommentLines/1/30/2                         19.2ms ± 7%  14.6ms ± 1%  -23.87%  (p=0.008 n=5+5)
BM_CommentLines/4/30/2                         40.3ms ±13%  22.3ms ± 4%  -44.63%  (p=0.008 n=5+5)
BM_CommentLines/128/30/2                        568ms ± 7%   254ms ± 3%  -55.28%  (p=0.008 n=5+5)
BM_CommentLines/1/70/2                         23.3ms ± 1%  15.0ms ± 3%  -35.61%  (p=0.016 n=4+5)
BM_CommentLines/4/70/2                         57.2ms ± 9%  24.1ms ± 2%  -57.81%  (p=0.008 n=5+5)
BM_CommentLines/128/70/2                        1.07s ± 0%   0.26s ± 2%  -75.51%  (p=0.016 n=4+5)
BM_CommentLines/1/0/8                          15.9ms ± 7%  16.0ms ± 1%     ~     (p=0.151 n=5+5)
BM_CommentLines/4/0/8                          24.2ms ± 6%  27.9ms ± 2%  +15.36%  (p=0.008 n=5+5)
BM_CommentLines/128/0/8                         386ms ± 4%   445ms ± 1%  +15.28%  (p=0.008 n=5+5)
BM_CommentLines/1/30/8                         20.6ms ± 5%  16.3ms ± 1%  -20.95%  (p=0.008 n=5+5)
BM_CommentLines/4/30/8                         45.3ms ± 6%  30.3ms ± 3%  -32.98%  (p=0.008 n=5+5)
BM_CommentLines/128/30/8                        699ms ± 3%   477ms ± 3%  -31.83%  (p=0.008 n=5+5)
BM_CommentLines/1/70/8                         25.7ms ± 5%  16.8ms ± 2%  -34.67%  (p=0.008 n=5+5)
BM_CommentLines/4/70/8                         62.0ms ± 4%  31.6ms ± 2%  -49.10%  (p=0.008 n=5+5)
BM_CommentLines/128/70/8                        1.20s ± 2%   0.48s ± 4%  -59.60%  (p=0.008 n=5+5)
```

The horizontal whitespace benchmarks (and all of the non-line-oriented
ones) are noisier than they appear here, but do (surprisingly) show some
improvements. My guess is that this has a lot to do with system load: we
now scan the text with a vectorized loop first and only then run the
byte-dispatched loop, so when the cache is a bit slower to populate, the
vectorized version starts to pull ahead.

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Co-authored-by: josh11b <josh11b@users.noreply.github.com>
Chandler Carruth, 2 years ago
commit 6ba8712fbd
2 files changed, with 90 additions and 25 deletions:

1. toolchain/lex/tokenized_buffer.cpp (+77, -25)
2. toolchain/lex/tokenized_buffer.h (+13, -0)

toolchain/lex/tokenized_buffer.cpp (+77, -25)

@@ -247,18 +247,48 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
         translator_(&buffer),
         emitter_(translator_, consumer),
         token_translator_(&buffer),
-        token_emitter_(token_translator_, consumer),
-        current_line_(buffer.AddLine(LineInfo(0))),
-        current_line_info_(&buffer.GetLineInfo(current_line_)) {}
+        token_emitter_(token_translator_, consumer) {}
+
+  // Find all line endings and create the line data structures. Explicitly kept
+  // out-of-line because this is a significant loop that is useful to have in
+  // the profile, and it doesn't simplify when inlined; without this attribute
+  // the compiler would flatten it into its caller anyway.
+  [[gnu::noinline]] auto CreateLines(llvm::StringRef source_text) -> void {
+    // We currently use `memchr` here, which is typically well optimized to
+    // use SIMD or other techniques significantly faster than byte-wise
+    // scanning. We also use carefully selected variables and the `ssize_t`
+    // type to improve the performance and code size of this hot loop.
+    //
+    // TODO: Eventually, we'll likely need to roll our own SIMD-optimized
+    // routine here in order to handle CR+LF line endings, as we'll want those
+    // to stay on the fast path. We'll also need to detect and diagnose Unicode
+    // vertical whitespace. Starting with `memchr` should give us a strong
+    // baseline performance target when adding those features.
+    const char* const text = source_text.data();
+    const ssize_t size = source_text.size();
+    ssize_t start = 0;
+    while (const char* nl = reinterpret_cast<const char*>(
+               memchr(&text[start], '\n', size - start))) {
+      ssize_t nl_index = nl - text;
+      buffer_->AddLine(LineInfo(start, nl_index - start));
+      start = nl_index + 1;
+    }
+    // The last line ends at the end of the file.
+    buffer_->AddLine(LineInfo(start, size - start));
+
+    // Now that all the infos are allocated, get a fresh pointer to the first
+    // info for use while lexing.
+    current_line_ = Line(0);
+    current_line_info_ = &buffer_->GetLineInfo(current_line_);
+  }
 
   // Perform the necessary bookkeeping to step past a newline at the current
   // line and column.
   auto HandleNewline() -> void {
-    current_line_info_->length = current_column_;
-
-    current_line_ = buffer_->AddLine(
-        LineInfo(current_line_info_->start + current_column_ + 1));
+    int next_start = current_line_info_->start + current_column_ + 1;
+    current_line_ = buffer_->GetNextLine(current_line_);
     current_line_info_ = &buffer_->GetLineInfo(current_line_);
+    CARBON_DCHECK(next_start == current_line_info_->start);
     current_column_ = 0;
     set_indent_ = false;
   }
@@ -278,15 +308,6 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
     CARBON_DCHECK(source_text.front() == '\n');
     NoteWhitespace();
     source_text = source_text.drop_front();
-
-    // If this is the last character in the source, directly return here
-    // to avoid creating an empty line.
-    if (LLVM_UNLIKELY(source_text.empty())) {
-      current_line_info_->length = current_column_;
-      return;
-    }
-
-    // Otherwise, add a line and set up to continue lexing.
     HandleNewline();
   }
 
@@ -325,14 +346,18 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
                     NoWhitespaceAfterCommentIntroducer);
     }
 
-    // Now just consume the text until a newline.
-    while (!source_text.empty() && source_text.front() != '\n') {
-      ++current_column_;
-      source_text = source_text.drop_front();
+    // Use the current line info to jump to the end of the line.
+    source_text =
+        source_text.drop_front(current_line_info_->length - current_column_);
+    // This may be the end of the file, in which case we immediately return.
+    if (source_text.empty()) {
+      // Finished lexing.
+      return;
     }
 
-    // We don't handle the newline, just fall back to the lex loop to handle it
-    // generically.
+    // Otherwise, lex the newline.
+    current_column_ = current_line_info_->length;
+    LexVerticalWhitespace(source_text);
   }
 
   auto LexNumericLiteral(llvm::StringRef& source_text) -> LexResult {
@@ -725,10 +750,23 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
   auto LexEndOfFile(llvm::StringRef& source_text) -> void {
     CARBON_DCHECK(source_text.empty());
 
+    // Check if the last line is empty and isn't also the first (and only)
+    // line. If so, re-pin the last line to be the prior one so that
+    // diagnostics and editors can treat newlines as terminators, even though
+    // we internally handle them as separators in case the last line is
+    // missing a newline. We do this here rather than when we see the newline
+    // to avoid extra conditions along that fast path.
+    if (current_column_ == 0 && buffer_->GetLineNumber(current_line_) != 1) {
+      current_line_ = buffer_->GetPrevLine(current_line_);
+      current_line_info_ = &buffer_->GetLineInfo(current_line_);
+      current_column_ = current_line_info_->length;
+    } else {
+      // Update the line length as this is also the end of a line.
+      current_line_info_->length = current_column_;
+    }
+
     // The end-of-file token is always considered to be whitespace.
     NoteWhitespace();
-    // Update the line length as this is also the end of a line.
-    current_line_info_->length = current_column_;
 
     // Close any open groups. We do this after marking whitespace, it will
     // preserve that.
@@ -804,6 +842,9 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
   // The main entry point for dispatching through the lexer's table. This method
   // should always fully consume the source text.
   auto Dispatch(llvm::StringRef& source_text) -> void {
+    // First build up our line data structures.
+    CreateLines(source_text);
+
     LexStartOfFile(source_text);
 
     // Manually enter the dispatch loop. This call will tail-recurse through the
@@ -923,7 +964,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
   TokenLocationTranslator token_translator_;
   TokenDiagnosticEmitter token_emitter_;
 
-  Line current_line_;
+  Line current_line_ = Line::Invalid;
   LineInfo* current_line_info_;
 
   int current_column_ = 0;
@@ -1098,6 +1139,17 @@ auto TokenizedBuffer::GetLineNumber(Line line) const -> int {
   return line.index + 1;
 }
 
+auto TokenizedBuffer::GetNextLine(Line line) const -> Line {
+  Line next(line.index + 1);
+  CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
+  return next;
+}
+
+auto TokenizedBuffer::GetPrevLine(Line line) const -> Line {
+  CARBON_CHECK(line.index > 0);
+  return Line(line.index - 1);
+}
+
 auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
   return GetLineInfo(line).indent + 1;
 }

toolchain/lex/tokenized_buffer.h (+13, -0)

@@ -55,8 +55,12 @@ struct Token : public ComparableIndexBase {
 // All other APIs to query a `Line` are on the `TokenizedBuffer`.
 struct Line : public ComparableIndexBase {
   using ComparableIndexBase::ComparableIndexBase;
+
+  static const Line Invalid;
 };
 
+constexpr Line Line::Invalid(Line::InvalidIndex);
+
 // A lightweight handle to a lexed identifier in a `TokenizedBuffer`.
 //
 // `Identifier` objects are designed to be passed by value, not reference or
@@ -229,6 +233,12 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   // Returns the 1-based indentation column number.
   [[nodiscard]] auto GetIndentColumnNumber(Line line) const -> int;
 
+  // Returns the next line handle.
+  [[nodiscard]] auto GetNextLine(Line line) const -> Line;
+
+  // Returns the previous line handle.
+  [[nodiscard]] auto GetPrevLine(Line line) const -> Line;
+
   // Returns the text for an identifier.
   [[nodiscard]] auto GetIdentifierText(Identifier id) const -> llvm::StringRef;
 
@@ -347,6 +357,9 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
           length(static_cast<int32_t>(llvm::StringRef::npos)),
           indent(0) {}
 
+    explicit LineInfo(int64_t start, int32_t length)
+        : start(start), length(length), indent(0) {}
+
     // Zero-based byte offset of the start of the line within the source buffer
     // provided.
     int64_t start;