
Introduce SIMD-optimized scanning for identifiers. (#3122)

Currently, for long identifiers, a huge (>30%) fraction of time is spent
finding the end of the identifier. We can speed this up with a fun
application of SIMD and in-register lookup tables.

With this, the BM_ValidIdentifiers/12/64 benchmark goes from around 4
million tokens/second to around 6 million tokens/second, roughly a 1.5x
improvement. However, there was a decent amount of noise in the
measurement and I didn't study it too closely as I was very happy with
the overall result. The profile shifted from >30% of the time in this
loop to <10%, so the scan itself is 3x or more faster with this.

One concern with optimizing the lexer right now is that we don't yet
implement the design's full Unicode support. This PR takes some steps to
avoid that pitfall -- the new routine classifies UTF-8 code units, and its
fallback path for non-ASCII bytes can grow the needed logic.

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Chandler Carruth, 2 years ago
commit ad42ef11ba
1 file changed, 166 insertions(+), 2 deletions(-)

toolchain/lexer/tokenized_buffer.cpp (+166 −2)

@@ -24,6 +24,10 @@
 #include "toolchain/lexer/numeric_literal.h"
 #include "toolchain/lexer/string_literal.h"
 
+#if __x86_64__
+#include <x86intrin.h>
+#endif
+
 namespace Carbon {
 
 // TODO: Move Overload and VariantMatch somewhere more central.
@@ -48,6 +52,167 @@ auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
   return std::visit(Overload{std::forward<Fs&&>(fs)...}, std::forward<V&&>(v));
 }
 
+// Scans the provided text and returns the prefix `StringRef` of contiguous
+// identifier characters.
+//
+// This is a performance sensitive function and so uses vectorized code
+// sequences to optimize its scanning. When modifying, the identifier lexing
+// benchmarks should be checked for regressions.
+//
+// Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
+//
+// TODO: Currently, this code does not implement Carbon's design for Unicode
+// characters in identifiers. It does work on UTF-8 code unit sequences, but
+// currently considers non-ASCII characters to be non-identifier characters.
+// Some work has been done to ensure the hot loop, while optimized, retains
+// enough information to add Unicode handling without completely destroying the
+// relevant optimizations.
+static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
+  // A table of booleans that we can use to classify bytes as being valid
+  // identifier (or keyword) characters. This is used in the generic,
+  // non-vectorized fallback code to scan for length of an identifier.
+  constexpr std::array<bool, 256> IsIdentifierByteTable = []() constexpr {
+    std::array<bool, 256> table = {};
+    for (char c = '0'; c <= '9'; ++c) {
+      table[c] = true;
+    }
+    for (char c = 'A'; c <= 'Z'; ++c) {
+      table[c] = true;
+    }
+    for (char c = 'a'; c <= 'z'; ++c) {
+      table[c] = true;
+    }
+    table['_'] = true;
+    return table;
+  }();
+
+#if __x86_64__
+  // This code uses a scheme derived from the techniques in Geoff Langdale and
+  // Daniel Lemire's work on parsing JSON[1]. Specifically, that paper outlines
+  // a technique of using two 4-bit indexed in-register look-up tables (LUTs) to
+  // classify bytes in a branchless SIMD code sequence.
+  //
+  // [1]: https://arxiv.org/pdf/1902.08318.pdf
+  //
+  // The goal is to get a bit mask classifying different sets of bytes. For each
+  // input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
+  // character. Otherwise, we want the mask bits to be set with the following
+  // logic derived by inspecting the high nibble and low nibble of the input:
+  // bit0 = 1 for `_`: high `0x5` and low `0xF`
+  // bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
+  // bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
+  // bit3 = 1 for `P-Z` and 'p-z': high `0x5` or `0x7` and low `0x0` - `0xA`
+  // bit4 = unused
+  // bit5 = unused
+  // bit6 = unused
+  // bit7 = unused
+  //
+  // No bits set means a definitively non-identifier ASCII character.
+  //
+  // Bits 4-7 remain available in case we need to classify more characters.
+  const auto high_lut = _mm_setr_epi8(
+      /*0x0:*/ 0b0000'0000,
+      /*0x1:*/ 0b0000'0000,
+      /*0x2:*/ 0b0000'0000,
+      /*0x3:*/ 0b0000'0010,
+      /*0x4:*/ 0b0000'0100,
+      /*0x5:*/ 0b0000'1001,
+      /*0x6:*/ 0b0000'0100,
+      /*0x7:*/ 0b0000'1000,
+      /*0x8:*/ 0b0000'0000,
+      /*0x9:*/ 0b0000'0000,
+      /*0xA:*/ 0b0000'0000,
+      /*0xB:*/ 0b0000'0000,
+      /*0xC:*/ 0b0000'0000,
+      /*0xD:*/ 0b0000'0000,
+      /*0xE:*/ 0b0000'0000,
+      /*0xF:*/ 0b0000'0000);
+  const auto low_lut = _mm_setr_epi8(
+      /*0x0:*/ 0b0000'1010,
+      /*0x1:*/ 0b0000'1110,
+      /*0x2:*/ 0b0000'1110,
+      /*0x3:*/ 0b0000'1110,
+      /*0x4:*/ 0b0000'1110,
+      /*0x5:*/ 0b0000'1110,
+      /*0x6:*/ 0b0000'1110,
+      /*0x7:*/ 0b0000'1110,
+      /*0x8:*/ 0b0000'1110,
+      /*0x9:*/ 0b0000'1110,
+      /*0xA:*/ 0b0000'1100,
+      /*0xB:*/ 0b0000'0100,
+      /*0xC:*/ 0b0000'0100,
+      /*0xD:*/ 0b0000'0100,
+      /*0xE:*/ 0b0000'0100,
+      /*0xF:*/ 0b0000'0101);
+
+  // Use `ssize_t` for performance here as we index memory in a tight loop.
+  ssize_t i = 0;
+  const ssize_t size = text.size();
+  while ((i + 16) <= size) {
+    __m128i input =
+        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));
+
+    // The high bits of each byte indicate a non-ASCII character encoded using
+    // UTF-8. Test those and fall back to the scalar code if present. These
+    // bytes will also cause spurious zeros in the LUT results, but we can
+    // ignore that because we track them independently here.
+#if __SSE4_1__
+    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
+      break;
+    }
+#else
+    if (_mm_movemask_epi8(input) != 0) {
+      break;
+    }
+#endif
+
+    // Do two LUT lookups and mask the results together to get the results for
+    // both low and high nibbles. Note that we don't need to mask out the high
+    // bit of input here because we track that above for UTF-8 handling.
+    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
+    // Note that the input needs to be masked to only include the high nibble or
+    // we could end up with bit7 set forcing the result to a zero byte.
+    __m128i input_high =
+        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
+    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
+    __m128i mask = _mm_and_si128(low_mask, high_mask);
+
+    // Now compare to find the completely zero bytes.
+    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
+    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);
+
+    // Check if there are bits in the tail mask, which means zero bytes and the
+    // end of the identifier. We could do this without materializing the scalar
+    // mask on more recent CPUs, but we generally expect the median length we
+    // encounter to be <16 characters and so we avoid the extra instruction in
+    // that case and predict this branch to succeed so it is laid out in a
+    // reasonable way.
+    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
+      // Move past the definitively classified bytes that are part of the
+      // identifier, and return the complete identifier text.
+      i += __builtin_ctz(tail_ascii_mask);
+      return text.substr(0, i);
+    }
+    i += 16;
+  }
+
+  // Fall back to the scalar loop. We only end up here when fewer than 16
+  // bytes remain to scan or when we find a UTF-8 encoded Unicode character.
+  // TODO: This assumes all Unicode characters are non-identifiers.
+  while (i < size &&
+         IsIdentifierByteTable[static_cast<unsigned char>(text[i])]) {
+    ++i;
+  }
+
+  return text.substr(0, i);
+#else
+  // TODO: Optimize this with SIMD for other architectures.
+  return text.take_while([](char c) {
+    return IsIdentifierByteTable[static_cast<unsigned char>(c)];
+  });
+#endif
+}
+
 // Implementation of the lexer logic itself.
 //
 // The design is that lexing can loop over the source buffer, consuming it into
@@ -454,8 +619,7 @@ class TokenizedBuffer::Lexer {
     }
 
     // Take the valid characters off the front of the source buffer.
-    llvm::StringRef identifier_text =
-        source_text.take_while([](char c) { return IsAlnum(c) || c == '_'; });
+    llvm::StringRef identifier_text = ScanForIdentifierPrefix(source_text);
     CARBON_CHECK(!identifier_text.empty())
         << "Must have at least one character!";
     int identifier_column = current_column_;