tokenized_buffer.cpp 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/tokenized_buffer.h"
  5. #include <algorithm>
  6. #include <array>
  7. #include <cmath>
  8. #include "common/check.h"
  9. #include "common/string_helpers.h"
  10. #include "llvm/ADT/StringRef.h"
  11. #include "llvm/ADT/StringSwitch.h"
  12. #include "llvm/Support/ErrorHandling.h"
  13. #include "llvm/Support/Format.h"
  14. #include "llvm/Support/FormatVariadic.h"
  15. #include "llvm/Support/raw_ostream.h"
  16. #include "toolchain/base/value_store.h"
  17. #include "toolchain/lex/character_set.h"
  18. #include "toolchain/lex/helpers.h"
  19. #include "toolchain/lex/numeric_literal.h"
  20. #include "toolchain/lex/string_literal.h"
  21. #if __ARM_NEON
  22. #include <arm_neon.h>
  23. #define CARBON_USE_SIMD 1
  24. #elif __x86_64__
  25. #include <x86intrin.h>
  26. #define CARBON_USE_SIMD 1
  27. #else
  28. #define CARBON_USE_SIMD 0
  29. #endif
  30. namespace Carbon::Lex {
  31. // TODO: Move Overload and VariantMatch somewhere more central.
  32. // Form an overload set from a list of functions. For example:
  33. //
  34. // ```
  35. // auto overloaded = Overload{[] (int) {}, [] (float) {}};
  36. // ```
  37. template <typename... Fs>
  38. struct Overload : Fs... {
  39. using Fs::operator()...;
  40. };
  41. template <typename... Fs>
  42. Overload(Fs...) -> Overload<Fs...>;
  43. // Pattern-match against the type of the value stored in the variant `V`. Each
  44. // element of `fs` should be a function that takes one or more of the variant
  45. // values in `V`.
  46. template <typename V, typename... Fs>
  47. auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
  48. return std::visit(Overload{std::forward<Fs&&>(fs)...}, std::forward<V&&>(v));
  49. }
#if CARBON_USE_SIMD
namespace {
#if __ARM_NEON
// 16-byte NEON vector used as a byte mask.
using SIMDMaskT = uint8x16_t;
#elif __x86_64__
// 16-byte SSE vector used as a byte mask.
using SIMDMaskT = __m128i;
#else
#error "Unsupported SIMD architecture!"
#endif
// One mask per prefix length from 0 to 16 bytes, inclusive.
using SIMDMaskArrayT = std::array<SIMDMaskT, sizeof(SIMDMaskT) + 1>;
}  // namespace
// A table of masks to include 0-16 bytes of an SSE register.
// `PrefixMasks[i]` has 0xFF in its low `i` bytes (little-endian lane order)
// and zero elsewhere; index 0 stays all-zero from value-initialization.
static constexpr SIMDMaskArrayT PrefixMasks = []() constexpr {
  SIMDMaskArrayT masks = {};
  for (int i = 1; i < static_cast<int>(masks.size()); ++i) {
    // Build the mask as a right-shifted 128-bit all-ones integer and convert
    // to the vector type. The SIMD types and constexpr require a C-style cast.
    // NOLINTNEXTLINE(google-readability-casting)
    masks[i] = (SIMDMaskT)(std::numeric_limits<unsigned __int128>::max() >>
                           ((sizeof(SIMDMaskT) - i) * 8));
  }
  return masks;
}();
#endif  // CARBON_USE_SIMD
// Scans the provided text and returns the prefix `StringRef` of contiguous
// identifier characters.
//
// This is a performance sensitive function and so uses vectorized code
// sequences to optimize its scanning. When modifying, the identifier lexing
// benchmarks should be checked for regressions.
//
// Identifier characters here are currently the ASCII characters
// `[0-9A-Za-z_]`.
//
// TODO: Currently, this code does not implement Carbon's design for Unicode
// characters in identifiers. It does work on UTF-8 code unit sequences, but
// currently considers non-ASCII characters to be non-identifier characters.
// Some work has been done to ensure the hot loop, while optimized, retains
// enough information to add Unicode handling without completely destroying the
// relevant optimizations.
static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
  // A table of booleans that we can use to classify bytes as being valid
  // identifier (or keyword) characters. This is used in the generic,
  // non-vectorized fallback code to scan for length of an identifier.
  static constexpr std::array<bool, 256> IsIdByteTable = ([]() constexpr {
    std::array<bool, 256> table = {};
    for (char c = '0'; c <= '9'; ++c) {
      table[c] = true;
    }
    for (char c = 'A'; c <= 'Z'; ++c) {
      table[c] = true;
    }
    for (char c = 'a'; c <= 'z'; ++c) {
      table[c] = true;
    }
    table['_'] = true;
    return table;
  })();
#if CARBON_USE_SIMD && __x86_64__
  // This code uses a scheme derived from the techniques in Geoff Langdale and
  // Daniel Lemire's work on parsing JSON[1]. Specifically, that paper outlines
  // a technique of using two 4-bit indexed in-register look-up tables (LUTs)
  // to classify bytes in a branchless SIMD code sequence.
  //
  // [1]: https://arxiv.org/pdf/1902.08318.pdf
  //
  // The goal is to get a bit mask classifying different sets of bytes. For
  // each input byte, we first test for a high bit indicating a UTF-8 encoded
  // Unicode character. Otherwise, we want the mask bits to be set with the
  // following logic derived by inspecting the high nibble and low nibble of
  // the input:
  // bit0 = 1 for `_`: high `0x5` and low `0xF`
  // bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
  // bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
  // bit3 = 1 for `P-Z` and 'p-z': high `0x5` or `0x7` and low `0x0` - `0xA`
  // bit4 = unused
  // bit5 = unused
  // bit6 = unused
  // bit7 = unused
  //
  // A byte is an identifier character iff the AND of its high-nibble LUT entry
  // and low-nibble LUT entry is non-zero, so no bits set means definitively
  // non-ID ASCII character.
  //
  // bits 4-7 remain unused if we need to classify more characters.
  const auto high_lut = _mm_setr_epi8(
      /* __b0=*/0b0000'0000,
      /* __b1=*/0b0000'0000,
      /* __b2=*/0b0000'0000,
      /* __b3=*/0b0000'0010,
      /* __b4=*/0b0000'0100,
      /* __b5=*/0b0000'1001,
      /* __b6=*/0b0000'0100,
      /* __b7=*/0b0000'1000,
      /* __b8=*/0b0000'0000,
      /* __b9=*/0b0000'0000,
      /*__b10=*/0b0000'0000,
      /*__b11=*/0b0000'0000,
      /*__b12=*/0b0000'0000,
      /*__b13=*/0b0000'0000,
      /*__b14=*/0b0000'0000,
      /*__b15=*/0b0000'0000);
  const auto low_lut = _mm_setr_epi8(
      /* __b0=*/0b0000'1010,
      /* __b1=*/0b0000'1110,
      /* __b2=*/0b0000'1110,
      /* __b3=*/0b0000'1110,
      /* __b4=*/0b0000'1110,
      /* __b5=*/0b0000'1110,
      /* __b6=*/0b0000'1110,
      /* __b7=*/0b0000'1110,
      /* __b8=*/0b0000'1110,
      /* __b9=*/0b0000'1110,
      /*__b10=*/0b0000'1100,
      /*__b11=*/0b0000'0100,
      /*__b12=*/0b0000'0100,
      /*__b13=*/0b0000'0100,
      /*__b14=*/0b0000'0100,
      /*__b15=*/0b0000'0101);
  // Use `ssize_t` for performance here as we index memory in a tight loop.
  ssize_t i = 0;
  const ssize_t size = text.size();
  // Process full 16-byte chunks; anything shorter falls through to the scalar
  // tail loop below.
  while ((i + 16) <= size) {
    __m128i input =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));
    // The high bits of each byte indicate a non-ASCII character encoded using
    // UTF-8. Test those and fall back to the scalar code if present. These
    // bytes will also cause spurious zeros in the LUT results, but we can
    // ignore that because we track them independently here.
#if __SSE4_1__
    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
      break;
    }
#else
    if (_mm_movemask_epi8(input) != 0) {
      break;
    }
#endif
    // Do two LUT lookups and mask the results together to get the results for
    // both low and high nibbles. Note that we don't need to mask out the high
    // bit of input here because we track that above for UTF-8 handling.
    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
    // Note that the input needs to be masked to only include the high nibble
    // or we could end up with bit7 set forcing the result to a zero byte.
    __m128i input_high =
        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
    __m128i mask = _mm_and_si128(low_mask, high_mask);
    // Now compare to find the completely zero bytes (the non-ID characters).
    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);
    // Check if there are bits in the tail mask, which means zero bytes and the
    // end of the identifier. We could do this without materializing the scalar
    // mask on more recent CPUs, but we generally expect the median length we
    // encounter to be <16 characters and so we avoid the extra instruction in
    // that case and predict this branch to succeed so it is laid out in a
    // reasonable way.
    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
      // Move past the definitively classified bytes that are part of the
      // identifier, and return the complete identifier text.
      i += __builtin_ctz(tail_ascii_mask);
      return text.substr(0, i);
    }
    i += 16;
  }
  // Fallback to scalar loop. We only end up here when we don't have >=16
  // bytes to scan or we find a UTF-8 unicode character.
  // TODO: This assumes all Unicode characters are non-identifiers.
  while (i < size && IsIdByteTable[static_cast<unsigned char>(text[i])]) {
    ++i;
  }
  return text.substr(0, i);
#else
  // TODO: Optimize this with SIMD for other architectures.
  return text.take_while(
      [](char c) { return IsIdByteTable[static_cast<unsigned char>(c)]; });
#endif
}
  223. // Implementation of the lexer logic itself.
  224. //
  225. // The design is that lexing can loop over the source buffer, consuming it into
  226. // tokens by calling into this API. This class handles the state and breaks down
  227. // the different lexing steps that may be used. It directly updates the provided
  228. // tokenized buffer with the lexed tokens.
  229. class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
  230. public:
  231. // Symbolic result of a lexing action. This indicates whether we successfully
  232. // lexed a token, or whether other lexing actions should be attempted.
  233. //
  234. // While it wraps a simple boolean state, its API both helps make the failures
  235. // more self documenting, and by consuming the actual token constructively
  236. // when one is produced, it helps ensure the correct result is returned.
  237. class LexResult {
  238. public:
  239. // Consumes (and discard) a valid token to construct a result
  240. // indicating a token has been produced. Relies on implicit conversions.
  241. // NOLINTNEXTLINE(google-explicit-constructor)
  242. LexResult(Token /*discarded_token*/) : LexResult(true) {}
  243. // Returns a result indicating no token was produced.
  244. static auto NoMatch() -> LexResult { return LexResult(false); }
  245. // Tests whether a token was produced by the lexing routine, and
  246. // the lexer can continue forming tokens.
  247. explicit operator bool() const { return formed_token_; }
  248. private:
  249. explicit LexResult(bool formed_token) : formed_token_(formed_token) {}
  250. bool formed_token_;
  251. };
  // Constructs a lexer over `source`, building a fresh `TokenizedBuffer`
  // backed by `value_stores`. Diagnostics produced while lexing are routed to
  // `consumer` via two emitters: one resolving source-buffer locations and one
  // resolving token locations (both translators point at the buffer being
  // built).
  Lexer(SharedValueStores& value_stores, SourceBuffer& source,
        DiagnosticConsumer& consumer)
      : buffer_(value_stores, source),
        consumer_(consumer),
        translator_(&buffer_),
        emitter_(translator_, consumer_),
        token_translator_(&buffer_),
        token_emitter_(token_translator_, consumer_) {}
  260. // Find all line endings and create the line data structures. Explicitly kept
  261. // out-of-line because this is a significant loop that is useful to have in
  262. // the profile and it doesn't simplify by inlining at all. But because it can,
  263. // the compiler will flatten this otherwise.
  264. [[gnu::noinline]] auto CreateLines(llvm::StringRef source_text) -> void {
  265. // We currently use `memchr` here which typically is well optimized to use
  266. // SIMD or other significantly faster than byte-wise scanning. We also use
  267. // carefully selected variables and the `ssize_t` type for performance and
  268. // code size of this hot loop.
  269. //
  270. // TODO: Eventually, we'll likely need to roll our own SIMD-optimized
  271. // routine here in order to handle CR+LF line endings, as we'll want those
  272. // to stay on the fast path. We'll also need to detect and diagnose Unicode
  273. // vertical whitespace. Starting with `memchr` should give us a strong
  274. // baseline performance target when adding those features.
  275. const char* const text = source_text.data();
  276. const ssize_t size = source_text.size();
  277. ssize_t start = 0;
  278. while (const char* nl = reinterpret_cast<const char*>(
  279. memchr(&text[start], '\n', size - start))) {
  280. ssize_t nl_index = nl - text;
  281. buffer_.AddLine(LineInfo(start, nl_index - start));
  282. start = nl_index + 1;
  283. }
  284. // The last line ends at the end of the file.
  285. buffer_.AddLine(LineInfo(start, size - start));
  286. // If the last line wasn't empty, the file ends with an unterminated line.
  287. // Add an extra blank line so that we never need to handle the special case
  288. // of being on the last line inside the lexer and needing to not increment
  289. // to the next line.
  290. if (start != size) {
  291. buffer_.AddLine(LineInfo(size, 0));
  292. }
  293. // Now that all the infos are allocated, get a fresh pointer to the first
  294. // info for use while lexing.
  295. line_index_ = 0;
  296. }
  // Returns a handle for the line currently being lexed.
  auto current_line() -> Line { return Line(line_index_); }
  // Returns a pointer into the buffer's line info array for the current line.
  // Note: invalidated if the buffer's line infos reallocate (e.g. `AddLine`).
  auto current_line_info() -> LineInfo* {
    return &buffer_.line_infos_[line_index_];
  }
  // Computes the zero-based column of `position` on the current line.
  // `position` must be at or after the current line's start. The narrowing
  // from `ssize_t` to `int` presumes columns fit in an `int`.
  auto ComputeColumn(ssize_t position) -> int {
    CARBON_DCHECK(position >= current_line_info()->start);
    return position - current_line_info()->start;
  }
  // Records that the most recently lexed token is followed by whitespace.
  auto NoteWhitespace() -> void {
    buffer_.token_infos_.back().has_trailing_space = true;
  }
  308. auto SkipHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
  309. -> void {
  310. // Handle adjacent whitespace quickly. This comes up frequently for example
  311. // due to indentation. We don't expect *huge* runs, so just use a scalar
  312. // loop. While still scalar, this avoids repeated table dispatch and marking
  313. // whitespace.
  314. while (position < static_cast<ssize_t>(source_text.size()) &&
  315. (source_text[position] == ' ' || source_text[position] == '\t')) {
  316. ++position;
  317. }
  318. }
  // Lexes a run of horizontal whitespace starting at `position`, which must be
  // a space or tab. Marks the previous token as having trailing space and
  // advances `position` past the run; produces no token.
  auto LexHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
      -> void {
    CARBON_DCHECK(source_text[position] == ' ' ||
                  source_text[position] == '\t');
    NoteWhitespace();
    // Skip runs using an optimized code path.
    SkipHorizontalWhitespace(source_text, position);
  }
  // Lexes a newline: advances to the next line, skips its leading horizontal
  // whitespace, and records the resulting indent on that line's info.
  auto LexVerticalWhitespace(llvm::StringRef source_text, ssize_t& position)
      -> void {
    NoteWhitespace();
    ++line_index_;
    auto* line_info = current_line_info();
    ssize_t line_start = line_info->start;
    position = line_start;
    SkipHorizontalWhitespace(source_text, position);
    line_info->indent = position - line_start;
  }
  // Dispatches on a leading `/`: lexes a comment when followed by another `/`,
  // otherwise lexes a `/` symbol token.
  auto LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
      -> void {
    CARBON_DCHECK(source_text[position] == '/');
    // Both comments and slash symbols start with a `/`. We disambiguate with a
    // max-munch rule -- if the next character is another `/` then we lex it as
    // a comment start. If it isn't, then we lex as a slash. We also optimize
    // for the comment case as we expect that to be much more important for
    // overall lexer performance.
    if (LLVM_LIKELY(position + 1 < static_cast<ssize_t>(source_text.size()) &&
                    source_text[position + 1] == '/')) {
      LexComment(source_text, position);
      return;
    }
    // This code path should produce a token, make sure that happens.
    LexResult result = LexSymbolToken(source_text, position);
    CARBON_CHECK(result) << "Failed to form a token!";
  }
  // Lexes a `//` comment at `position`, diagnosing trailing comments and
  // missing whitespace after the introducer, then skips the rest of the line.
  // Runs of identically prefixed comment lines are skipped in bulk (with SIMD
  // prefix comparison where available). Produces no token; updates
  // `line_index_` and `position` to the first position after the comment
  // block.
  auto LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
    CARBON_DCHECK(source_text.substr(position).startswith("//"));
    // Any comment must be the only non-whitespace on the line.
    const auto* line_info = current_line_info();
    if (LLVM_UNLIKELY(position != line_info->start + line_info->indent)) {
      CARBON_DIAGNOSTIC(TrailingComment, Error,
                        "Trailing comments are not permitted.");
      emitter_.Emit(source_text.begin() + position, TrailingComment);
      // Note that we cannot fall-through here as the logic below doesn't
      // handle trailing comments. For simplicity, we just consume the trailing
      // comment itself and let the normal lexer handle the newline as if there
      // weren't a comment at all.
      position = line_info->start + line_info->length;
      return;
    }
    // The introducer '//' must be followed by whitespace or EOF.
    bool is_valid_after_slashes = true;
    if (position + 2 < static_cast<ssize_t>(source_text.size()) &&
        LLVM_UNLIKELY(!IsSpace(source_text[position + 2]))) {
      CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
                        "Whitespace is required after '//'.");
      emitter_.Emit(source_text.begin() + position + 2,
                    NoWhitespaceAfterCommentIntroducer);
      // We use this to tweak the lexing of blocks below.
      is_valid_after_slashes = false;
    }
    // Skip over this line.
    ssize_t line_index = line_index_;
    ++line_index;
    position = buffer_.line_infos_[line_index].start;
    // A very common pattern is a long block of comment lines all with the same
    // indent and comment start. We skip these comment blocks in bulk both for
    // speed and to reduce redundant diagnostics if each line has the same
    // erroneous comment start like `//!`.
    //
    // When we have SIMD support this is even more important for speed, as
    // short indents can be scanned extremely quickly with SIMD and we expect
    // these to be the dominant cases.
    //
    // TODO: We should extend this to 32-byte SIMD on platforms with support.
    constexpr int MaxIndent = 13;
    const int indent = line_info->indent;
    const ssize_t first_line_start = line_info->start;
    // The prefix to match is the indent plus `//` plus, when the introducer
    // was valid, the whitespace character after it (so erroneous `//!` lines
    // keep matching each other and only diagnose once).
    ssize_t prefix_size = indent + (is_valid_after_slashes ? 3 : 2);
    auto skip_to_next_line = [this, indent, &line_index, &position] {
      // We're guaranteed to have a line here even on a comment on the last
      // line as we ensure there is an empty line structure at the end of every
      // file.
      ++line_index;
      auto* next_line_info = &buffer_.line_infos_[line_index];
      next_line_info->indent = indent;
      position = next_line_info->start;
    };
    if (CARBON_USE_SIMD &&
        position + 16 < static_cast<ssize_t>(source_text.size()) &&
        indent <= MaxIndent) {
      // Load a mask based on the amount of text we want to compare.
      auto mask = PrefixMasks[prefix_size];
#if __ARM_NEON
      // Load and mask the prefix of the current line.
      auto prefix = vld1q_u8(reinterpret_cast<const uint8_t*>(
          source_text.data() + first_line_start));
      prefix = vandq_u8(mask, prefix);
      do {
        // Load and mask the next line to consider's prefix.
        auto next_prefix = vld1q_u8(
            reinterpret_cast<const uint8_t*>(source_text.data() + position));
        next_prefix = vandq_u8(mask, next_prefix);
        // Compare the two prefixes and if any lanes differ, break.
        auto compare = vceqq_u8(prefix, next_prefix);
        if (vminvq_u8(compare) == 0) {
          break;
        }
        skip_to_next_line();
      } while (position + 16 < static_cast<ssize_t>(source_text.size()));
#elif __x86_64__
      // Use the current line's prefix as the exemplar to compare against.
      // We don't mask here as we will mask when doing the comparison.
      auto prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(
          source_text.data() + first_line_start));
      do {
        // Load the next line to consider's prefix.
        auto next_prefix = _mm_loadu_si128(
            reinterpret_cast<const __m128i*>(source_text.data() + position));
        // Compute the difference between the next line and our exemplar.
        // Again, we don't mask the difference because the comparison below
        // will be masked.
        auto prefix_diff = _mm_xor_si128(prefix, next_prefix);
        // If we have any differences (non-zero bits) within the mask, we can't
        // skip the next line too.
        if (!_mm_test_all_zeros(mask, prefix_diff)) {
          break;
        }
        skip_to_next_line();
      } while (position + 16 < static_cast<ssize_t>(source_text.size()));
#else
#error "Unsupported SIMD architecture!"
#endif
      // TODO: If we finish the loop due to the position approaching the end of
      // the buffer we may fail to skip the last line in a comment block that
      // has an invalid initial sequence and thus emit extra diagnostics. We
      // should really fall through to the generic skipping logic, but the code
      // organization will need to change significantly to allow that.
    } else {
      while (position + prefix_size <
                 static_cast<ssize_t>(source_text.size()) &&
             memcmp(source_text.data() + first_line_start,
                    source_text.data() + position, prefix_size) == 0) {
        skip_to_next_line();
      }
    }
    // Now compute the indent of this next line before we finish.
    ssize_t line_start = position;
    SkipHorizontalWhitespace(source_text, position);
    // Now that we're done scanning, update to the latest line index and
    // indent.
    line_index_ = line_index;
    current_line_info()->indent = position - line_start;
  }
  // Lexes a numeric literal at `position`. On a failed lex, delegates to
  // `LexError`. Otherwise advances `position` past the literal's text and
  // produces an integer, real, or error token depending on how the literal's
  // value computes.
  auto LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
      -> LexResult {
    std::optional<NumericLiteral> literal =
        NumericLiteral::Lex(source_text.substr(position));
    if (!literal) {
      return LexError(source_text, position);
    }
    // Capture the column before advancing `position` past the token text.
    int int_column = ComputeColumn(position);
    int token_size = literal->text().size();
    position += token_size;
    return VariantMatch(
        literal->ComputeValue(emitter_),
        [&](NumericLiteral::IntegerValue&& value) {
          auto token = buffer_.AddToken({.kind = TokenKind::IntegerLiteral,
                                         .token_line = current_line(),
                                         .column = int_column});
          buffer_.GetTokenInfo(token).integer_id =
              buffer_.value_stores_->integers().Add(std::move(value.value));
          return token;
        },
        [&](NumericLiteral::RealValue&& value) {
          auto token = buffer_.AddToken({.kind = TokenKind::RealLiteral,
                                         .token_line = current_line(),
                                         .column = int_column});
          buffer_.GetTokenInfo(token).real_id =
              buffer_.value_stores_->reals().Add(
                  Real{.mantissa = value.mantissa,
                       .exponent = value.exponent,
                       .is_decimal =
                           (value.radix == NumericLiteral::Radix::Decimal)});
          return token;
        },
        [&](NumericLiteral::UnrecoverableError) {
          // The error was already diagnosed via `emitter_` during
          // `ComputeValue`; record an error token covering the literal text.
          auto token = buffer_.AddToken({
              .kind = TokenKind::Error,
              .token_line = current_line(),
              .column = int_column,
              .error_length = token_size,
          });
          return token;
        });
  }
  // Lexes a string literal at `position`. On a failed lex, delegates to
  // `LexError`. Otherwise advances `position` past the literal, updates line
  // bookkeeping for multi-line literals, and produces either a string token
  // (terminated) or an error token with a diagnostic (unterminated).
  auto LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
      -> LexResult {
    std::optional<StringLiteral> literal =
        StringLiteral::Lex(source_text.substr(position));
    if (!literal) {
      return LexError(source_text, position);
    }
    // The token is attributed to the line/column where the literal starts.
    Line string_line = current_line();
    int string_column = ComputeColumn(position);
    ssize_t literal_size = literal->text().size();
    position += literal_size;
    // Update line and column information.
    if (literal->is_multi_line()) {
      while (current_line_info()->start + current_line_info()->length <
             position) {
        ++line_index_;
        current_line_info()->indent = string_column;
      }
      // Note that we've updated the current line at this point, but
      // `set_indent_` is already true from above. That remains correct as the
      // last line of the multi-line literal *also* has its indent set.
    }
    if (literal->is_terminated()) {
      // TODO: Refactor to reduce copies.
      // https://github.com/carbon-language/carbon-lang/pull/3311#discussion_r1366048360
      buffer_.computed_strings_.push_back(
          std::make_unique<std::string>(literal->ComputeValue(emitter_)));
      auto string_id = buffer_.value_stores_->strings().Add(
          *buffer_.computed_strings_.back());
      auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral,
                                     .token_line = string_line,
                                     .column = string_column,
                                     .string_id = string_id});
      return token;
    } else {
      CARBON_DIAGNOSTIC(UnterminatedString, Error,
                        "String is missing a terminator.");
      emitter_.Emit(literal->text().begin(), UnterminatedString);
      return buffer_.AddToken(
          {.kind = TokenKind::Error,
           .token_line = string_line,
           .column = string_column,
           .error_length = static_cast<int32_t>(literal_size)});
    }
  }
  // Adds a token of `kind`, whose fixed spelling must be the single character
  // at `position`, and advances `position` past it. Returns the new token.
  auto LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,
                             ssize_t& position) -> Token {
    // Verify in a debug build that the incoming token kind is correct.
    CARBON_DCHECK(kind != TokenKind::Error);
    CARBON_DCHECK(kind.fixed_spelling().size() == 1);
    CARBON_DCHECK(source_text[position] == kind.fixed_spelling().front())
        << "Source text starts with '" << source_text[position]
        << "' instead of the spelling '" << kind.fixed_spelling()
        << "' of the incoming token kind '" << kind << "'";
    Token token = buffer_.AddToken({.kind = kind,
                                    .token_line = current_line(),
                                    .column = ComputeColumn(position)});
    ++position;
    return token;
  }
  // Lexes a one-character opening grouping symbol (e.g. `(`, `{`, `[`) and
  // pushes it onto the open-group stack for later matching by its closer.
  auto LexOpeningSymbolToken(llvm::StringRef source_text, TokenKind kind,
                             ssize_t& position) -> LexResult {
    Token token = LexOneCharSymbolToken(source_text, kind, position);
    open_groups_.push_back(token);
    return token;
  }
  // Lexes a one-character closing grouping symbol. Diagnoses and produces an
  // error token when no matching opener exists; otherwise closes any
  // mismatched open groups first, then links the opening and closing tokens to
  // each other.
  auto LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
                             ssize_t& position) -> LexResult {
    // Shared path for a closer with no opener: diagnose and emit a
    // single-character error token.
    auto unmatched_error = [&] {
      CARBON_DIAGNOSTIC(
          UnmatchedClosing, Error,
          "Closing symbol without a corresponding opening symbol.");
      emitter_.Emit(source_text.begin() + position, UnmatchedClosing);
      Token token = buffer_.AddToken({.kind = TokenKind::Error,
                                      .token_line = current_line(),
                                      .column = ComputeColumn(position),
                                      .error_length = 1});
      ++position;
      return token;
    };
    // If we have no open groups, this is an error.
    if (LLVM_UNLIKELY(open_groups_.empty())) {
      return unmatched_error();
    }
    Token opening_token = open_groups_.back();
    // Close any invalid open groups first.
    if (LLVM_UNLIKELY(buffer_.GetTokenInfo(opening_token).kind !=
                      kind.opening_symbol())) {
      CloseInvalidOpenGroups(kind, position);
      // This may exhaust the open groups so re-check and re-error if needed.
      if (open_groups_.empty()) {
        return unmatched_error();
      }
      opening_token = open_groups_.back();
      CARBON_DCHECK(buffer_.GetTokenInfo(opening_token).kind ==
                    kind.opening_symbol());
    }
    open_groups_.pop_back();
    // Now that the groups are all matched up, lex the actual token.
    Token token = LexOneCharSymbolToken(source_text, kind, position);
    // Note that it is important to get fresh token infos here as lexing the
    // open token would invalidate any pointers.
    buffer_.GetTokenInfo(opening_token).closing_token = token;
    buffer_.GetTokenInfo(token).opening_token = opening_token;
    return token;
  }
// Lexes a multi-character symbol token (such as operators). One-character
// and grouping symbols are handled by dedicated dispatch routines, so their
// X-macro expansions below are intentionally empty. Falls back to `LexError`
// when no symbol spelling matches at `position`.
auto LexSymbolToken(llvm::StringRef source_text, ssize_t& position)
    -> LexResult {
  // One character symbols and grouping symbols are handled with dedicated
  // dispatch. We only lex the multi-character tokens here.
  TokenKind kind = llvm::StringSwitch<TokenKind>(source_text.substr(position))
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
  .StartsWith(Spelling, TokenKind::Name)
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName)
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName)
#include "toolchain/lex/token_kind.def"
                       .Default(TokenKind::Error);
  if (kind == TokenKind::Error) {
    return LexError(source_text, position);
  }
  Token token = buffer_.AddToken({.kind = kind,
                                  .token_line = current_line(),
                                  .column = ComputeColumn(position)});
  position += kind.fixed_spelling().size();
  return token;
}
  640. // Given a word that has already been lexed, determine whether it is a type
  641. // literal and if so form the corresponding token.
  642. auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
  643. -> LexResult {
  644. if (word.size() < 2) {
  645. // Too short to form one of these tokens.
  646. return LexResult::NoMatch();
  647. }
  648. if (word[1] < '1' || word[1] > '9') {
  649. // Doesn't start with a valid initial digit.
  650. return LexResult::NoMatch();
  651. }
  652. std::optional<TokenKind> kind;
  653. switch (word.front()) {
  654. case 'i':
  655. kind = TokenKind::IntegerTypeLiteral;
  656. break;
  657. case 'u':
  658. kind = TokenKind::UnsignedIntegerTypeLiteral;
  659. break;
  660. case 'f':
  661. kind = TokenKind::FloatingPointTypeLiteral;
  662. break;
  663. default:
  664. return LexResult::NoMatch();
  665. };
  666. llvm::StringRef suffix = word.substr(1);
  667. if (!CanLexInteger(emitter_, suffix)) {
  668. return buffer_.AddToken(
  669. {.kind = TokenKind::Error,
  670. .token_line = current_line(),
  671. .column = column,
  672. .error_length = static_cast<int32_t>(word.size())});
  673. }
  674. llvm::APInt suffix_value;
  675. if (suffix.getAsInteger(10, suffix_value)) {
  676. return LexResult::NoMatch();
  677. }
  678. auto token = buffer_.AddToken(
  679. {.kind = *kind, .token_line = current_line(), .column = column});
  680. buffer_.GetTokenInfo(token).integer_id =
  681. buffer_.value_stores_->integers().Add(std::move(suffix_value));
  682. return token;
  683. }
// Closes all open groups that cannot remain open across a closing symbol.
// Users may pass `Error` to close all open groups.
//
// For each popped mismatched opening symbol this emits a `MismatchedClosing`
// diagnostic and synthesizes a recovery closing token after the most recent
// real token, linking the recovered pair together.
[[gnu::noinline]] auto CloseInvalidOpenGroups(TokenKind kind,
                                              ssize_t position) -> void {
  CARBON_CHECK(kind.is_closing_symbol() || kind == TokenKind::Error);
  CARBON_CHECK(!open_groups_.empty());
  int column = ComputeColumn(position);
  do {
    Token opening_token = open_groups_.back();
    TokenKind opening_kind = buffer_.GetTokenInfo(opening_token).kind;
    // Stop once the innermost open group matches the incoming closing
    // symbol; the caller will close that pair normally.
    if (kind == opening_kind.closing_symbol()) {
      return;
    }
    open_groups_.pop_back();
    CARBON_DIAGNOSTIC(
        MismatchedClosing, Error,
        "Closing symbol does not match most recent opening symbol.");
    token_emitter_.Emit(opening_token, MismatchedClosing);
    CARBON_CHECK(!buffer_.tokens().empty())
        << "Must have a prior opening token!";
    Token prev_token = buffer_.tokens().end()[-1];
    // TODO: do a smarter backwards scan for where to put the closing
    // token.
    Token closing_token = buffer_.AddToken(
        {.kind = opening_kind.closing_symbol(),
         .has_trailing_space = buffer_.HasTrailingWhitespace(prev_token),
         .is_recovery = true,
         .token_line = current_line(),
         .column = column});
    TokenInfo& opening_token_info = buffer_.GetTokenInfo(opening_token);
    TokenInfo& closing_token_info = buffer_.GetTokenInfo(closing_token);
    opening_token_info.closing_token = closing_token;
    closing_token_info.opening_token = opening_token;
  } while (!open_groups_.empty());
}
// Lexes a keyword, sized type literal, or identifier starting at `position`.
// Non-ASCII leading bytes are currently rejected as errors.
auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
    -> LexResult {
  if (static_cast<unsigned char>(source_text[position]) > 0x7F) {
    // TODO: Need to add support for Unicode lexing.
    return LexError(source_text, position);
  }
  CARBON_CHECK(IsAlpha(source_text[position]) ||
               source_text[position] == '_');
  int column = ComputeColumn(position);
  // Take the valid characters off the front of the source buffer.
  llvm::StringRef identifier_text =
      ScanForIdentifierPrefix(source_text.substr(position));
  CARBON_CHECK(!identifier_text.empty())
      << "Must have at least one character!";
  position += identifier_text.size();
  // Check if the text is a type literal, and if so form such a literal.
  if (LexResult result = LexWordAsTypeLiteralToken(identifier_text, column)) {
    return result;
  }
  // Check if the text matches a keyword token, and if so use that.
  TokenKind kind = llvm::StringSwitch<TokenKind>(identifier_text)
#define CARBON_KEYWORD_TOKEN(Name, Spelling) .Case(Spelling, TokenKind::Name)
#include "toolchain/lex/token_kind.def"
                       .Default(TokenKind::Error);
  if (kind != TokenKind::Error) {
    return buffer_.AddToken(
        {.kind = kind, .token_line = current_line(), .column = column});
  }
  // Otherwise we have a generic identifier.
  return buffer_.AddToken(
      {.kind = TokenKind::Identifier,
       .token_line = current_line(),
       .column = column,
       .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
}
// Lexes a run of unrecognized characters into a single error token, emitting
// one `UnrecognizedCharacters` diagnostic for the whole run.
auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
  // Gather adjacent unrecognized characters. The predicate returns false at
  // anything that could start a valid token — alphanumerics, `_`,
  // whitespace, or the first byte of any symbol spelling — so normal lexing
  // can resume there.
  llvm::StringRef error_text =
      source_text.substr(position).take_while([](char c) {
        if (IsAlnum(c)) {
          return false;
        }
        switch (c) {
          case '_':
          case '\t':
          case '\n':
            return false;
          default:
            break;
        }
        return llvm::StringSwitch<bool>(llvm::StringRef(&c, 1))
#define CARBON_SYMBOL_TOKEN(Name, Spelling) .StartsWith(Spelling, false)
#include "toolchain/lex/token_kind.def"
            .Default(true);
      });
  if (error_text.empty()) {
    // TODO: Reimplement this to use the lexer properly. In the meantime,
    // guarantee that we eat at least one byte.
    error_text = source_text.substr(position, 1);
  }
  auto token = buffer_.AddToken(
      {.kind = TokenKind::Error,
       .token_line = current_line(),
       .column = ComputeColumn(position),
       .error_length = static_cast<int32_t>(error_text.size())});
  CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
                    "Encountered unrecognized characters while parsing.");
  emitter_.Emit(error_text.begin(), UnrecognizedCharacters);
  position += error_text.size();
  return token;
}
// Lexes the start of the file: adds the `StartOfFile` token and records the
// first line's indentation after skipping any leading horizontal whitespace.
auto LexStartOfFile(llvm::StringRef source_text, ssize_t& position) -> void {
  // Before lexing any source text, add the start-of-file token so that code
  // can assume a non-empty token buffer for the rest of lexing. Note that the
  // start-of-file always has trailing space because it *is* whitespace.
  buffer_.AddToken({.kind = TokenKind::StartOfFile,
                    .has_trailing_space = true,
                    .token_line = current_line(),
                    .column = 0});
  // Also skip any horizontal whitespace and record the indentation of the
  // first line.
  SkipHorizontalWhitespace(source_text, position);
  auto* line_info = current_line_info();
  CARBON_CHECK(line_info->start == 0);
  line_info->indent = position;
}
// Lexes the end of the file once `position` reaches the end of the source:
// fixes up the final line bookkeeping, closes any still-open groups with
// recovery tokens, and adds the `EndOfFile` token.
auto LexEndOfFile(llvm::StringRef source_text, ssize_t position) -> void {
  CARBON_CHECK(position == static_cast<ssize_t>(source_text.size()));
  // Check if the last line is empty and not the first line (and only). If so,
  // re-pin the last line to be the prior one so that diagnostics and editors
  // can treat newlines as terminators even though we internally handle them
  // as separators in case of a missing newline on the last line. We do this
  // here instead of detecting this when we see the newline to avoid more
  // conditions along that fast path.
  if (position == current_line_info()->start && line_index_ != 0) {
    --line_index_;
    --position;
  } else {
    // Update the line length as this is also the end of a line.
    current_line_info()->length = ComputeColumn(position);
  }
  // The end-of-file token is always considered to be whitespace.
  NoteWhitespace();
  // Close any open groups. We do this after marking whitespace, it will
  // preserve that.
  if (!open_groups_.empty()) {
    CloseInvalidOpenGroups(TokenKind::Error, position);
  }
  buffer_.AddToken({.kind = TokenKind::EndOfFile,
                    .token_line = current_line(),
                    .column = ComputeColumn(position)});
}
// We use a collection of static member functions for table-based dispatch to
// lexer methods. These are named static member functions so that they show up
// helpfully in profiles and backtraces, but they tend to not contain the
// interesting logic and simply delegate to the relevant methods. All of their
// signatures need to be exactly the same however in order to ensure we can
// build efficient dispatch tables out of them. All of them end by doing a
// must-tail return call to this routine. It handles continuing the dispatch
// chain.
//
// Note that `position` is taken by value here: each dispatch step passes
// along the position it finished at.
static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
                         ssize_t position) -> void {
  if (LLVM_LIKELY(position < static_cast<ssize_t>(source_text.size()))) {
    // The common case is to tail recurse based on the next character. Note
    // that because this is a must-tail return, this cannot fail to tail-call
    // and will not grow the stack. This is in essence a loop with dynamic
    // tail dispatch to the next stage of the loop.
    [[clang::musttail]] return DispatchTable[static_cast<unsigned char>(
        source_text[position])](lexer, source_text, position);
  }
  // When we finish the source text, stop recursing. We also hint this so that
  // the tail-dispatch is optimized as that's essentially the loop back-edge
  // and this is the loop exit.
  lexer.LexEndOfFile(source_text, position);
}
// Define a set of dispatch functions that simply forward to a method that
// lexes a token. This includes validating that an actual token was produced,
// and continuing the dispatch. Note that the lexing method updates the local
// `position`, which is then handed on to `DispatchNext`.
#define CARBON_DISPATCH_LEX_TOKEN(LexMethod)                                 \
  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
                                  ssize_t position)                          \
      ->void {                                                               \
    LexResult result = lexer.LexMethod(source_text, position);               \
    CARBON_CHECK(result) << "Failed to form a token!";                       \
    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
  }
CARBON_DISPATCH_LEX_TOKEN(LexError)
CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
// Custom dispatch functions that pre-select the symbol token kind to lex by
// looking the current byte up in `OneCharTokenKindTable`.
#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                          \
  static auto Dispatch##LexMethod##SymbolToken(                              \
      Lexer& lexer, llvm::StringRef source_text, ssize_t position)           \
      ->void {                                                               \
    LexResult result = lexer.LexMethod##SymbolToken(                         \
        source_text, OneCharTokenKindTable[source_text[position]], position); \
    CARBON_CHECK(result) << "Failed to form a token!";                       \
    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
  }
CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOneChar)
CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOpening)
CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
// Define a set of non-token dispatch functions that handle things like
// whitespace and comments. These don't produce a `LexResult`, so there is
// nothing to validate.
#define CARBON_DISPATCH_LEX_NON_TOKEN(LexMethod)                             \
  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
                                  ssize_t position)                          \
      ->void {                                                               \
    lexer.LexMethod(source_text, position);                                  \
    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
  }
CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
// The main entry point for dispatching through the lexer's table. This method
// should always fully consume the source text. Consumes the lexer (it is
// rvalue-qualified) and returns the completed buffer.
auto Lex() && -> TokenizedBuffer {
  llvm::StringRef source_text = buffer_.source_->text();
  // First build up our line data structures.
  CreateLines(source_text);
  ssize_t position = 0;
  LexStartOfFile(source_text, position);
  // Manually enter the dispatch loop. This call will tail-recurse through the
  // dispatch table until everything from source_text is consumed.
  DispatchNext(*this, source_text, position);
  // Record on the buffer whether any diagnostics were emitted, since the
  // consumer does not outlive this call.
  if (consumer_.seen_error()) {
    buffer_.has_errors_ = true;
  }
  return std::move(buffer_);
}
private:
  // The uniform signature shared by every dispatch routine above; keeping
  // them identical is what allows a single homogeneous table of function
  // pointers.
  using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef source_text,
                                 ssize_t position) -> void;
  // One dispatch function pointer for each possible value of the next byte.
  using DispatchTableT = std::array<DispatchFunctionT*, 256>;
  914. // Build a table of function pointers that we can use to dispatch to the
  915. // correct lexer routine based on the first byte of source text.
  916. //
  917. // While it is tempting to simply use a `switch` on the first byte and
  918. // dispatch with cases into this, in practice that doesn't produce great code.
  919. // There seem to be two issues that are the root cause.
  920. //
  921. // First, there are lots of different values of bytes that dispatch to a
  922. // fairly small set of routines, and then some byte values that dispatch
  923. // differently for each byte. This pattern isn't one that the compiler-based
  924. // lowering of switches works well with -- it tries to balance all the cases,
  925. // and in doing so emits several compares and other control flow rather than a
  926. // simple jump table.
  927. //
  928. // Second, with a `case`, it isn't as obvious how to create a single, uniform
  929. // interface that is effective for *every* byte value, and thus makes for a
  930. // single consistent table-based dispatch. By forcing these to be function
  931. // pointers, we also coerce the code to use a strictly homogeneous structure
  932. // that can form a single dispatch table.
  933. //
  934. // These two actually interact -- the second issue is part of what makes the
  935. // non-table lowering in the first one desirable for many switches and cases.
  936. //
  937. // Ultimately, when table-based dispatch is such an important technique, we
  938. // get better results by taking full control and manually creating the
  939. // dispatch structures.
  940. //
  941. // The functions in this table also use tail-recursion to implement the loop
  942. // of the lexer. This is based on the technique described more fully for any
  943. // kind of byte-stream loop structure here:
  944. // https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
// Builds the 256-entry byte-to-handler dispatch table at compile time. See
// the block comment above for why a manual table beats a `switch`.
constexpr static auto MakeDispatchTable() -> DispatchTableT {
  DispatchTableT table = {};
  // First set the table entries to dispatch to our error token handler as the
  // base case. Everything valid comes from an override below.
  for (int i = 0; i < 256; ++i) {
    table[i] = &DispatchLexError;
  }
  // Symbols have some special dispatching. First, set the first character of
  // each symbol token spelling to dispatch to the symbol lexer. We don't
  // provide a pre-computed token here, so the symbol lexer will compute the
  // exact symbol token kind. We'll override this with more specific dispatch
  // below.
#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = &DispatchLexSymbolToken;
#include "toolchain/lex/token_kind.def"
  // Now special cased single-character symbols that are guaranteed to not
  // join with another symbol. These are grouping symbols, terminators,
  // or separators in the grammar and have a good reason to be
  // orthogonal to any other punctuation. We do this separately because this
  // needs to override some of the generic handling above, and provide a
  // custom token.
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = &DispatchLexOneCharSymbolToken;
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
  table[(Spelling)[0]] = &DispatchLexOpeningSymbolToken;
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
  table[(Spelling)[0]] = &DispatchLexClosingSymbolToken;
#include "toolchain/lex/token_kind.def"
  // Override the handling for `/` to consider comments as well as a `/`
  // symbol.
  table['/'] = &DispatchLexCommentOrSlash;
  table['_'] = &DispatchLexKeywordOrIdentifier;
  // Note that we don't use `llvm::seq` because this needs to be `constexpr`
  // evaluated.
  for (unsigned char c = 'a'; c <= 'z'; ++c) {
    table[c] = &DispatchLexKeywordOrIdentifier;
  }
  for (unsigned char c = 'A'; c <= 'Z'; ++c) {
    table[c] = &DispatchLexKeywordOrIdentifier;
  }
  // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
  // as whitespace characters should already have been skipped and the
  // only remaining valid Unicode characters would be part of an
  // identifier. That code can either accept or reject.
  for (int i = 0x80; i < 0x100; ++i) {
    table[i] = &DispatchLexKeywordOrIdentifier;
  }
  for (unsigned char c = '0'; c <= '9'; ++c) {
    table[c] = &DispatchLexNumericLiteral;
  }
  // String literals may start with a quote, a raw-string `#` prefix, or a
  // character literal quote; all route to the string literal lexer.
  table['\''] = &DispatchLexStringLiteral;
  table['"'] = &DispatchLexStringLiteral;
  table['#'] = &DispatchLexStringLiteral;
  table[' '] = &DispatchLexHorizontalWhitespace;
  table['\t'] = &DispatchLexHorizontalWhitespace;
  table['\n'] = &DispatchLexVerticalWhitespace;
  return table;
};
// The byte-dispatch table driving the lexer loop; built by
// `MakeDispatchTable`.
static const DispatchTableT DispatchTable;
// Maps each byte to its guaranteed one-character symbol token kind, used by
// the symbol dispatch routines above.
static const std::array<TokenKind, 256> OneCharTokenKindTable;
// The buffer under construction; moved out by `Lex`.
TokenizedBuffer buffer_;
// Index of the line currently being lexed.
ssize_t line_index_;
// Stack of opening grouping tokens still awaiting their closing symbol.
llvm::SmallVector<Token> open_groups_;
// Diagnostic plumbing: the consumer tracks whether any error was seen; the
// translators/emitters report against raw source locations or lexed tokens.
ErrorTrackingDiagnosticConsumer consumer_;
SourceBufferLocationTranslator translator_;
LexerDiagnosticEmitter emitter_;
TokenLocationTranslator token_translator_;
TokenDiagnosticEmitter token_emitter_;
};
// Out-of-line definitions of the lexer's static tables. Both are `constexpr`
// and so computed entirely at compile time.
constexpr TokenizedBuffer::Lexer::DispatchTableT
    TokenizedBuffer::Lexer::DispatchTable = MakeDispatchTable();
// Built via an immediately-invoked lambda over the one-character and grouping
// symbol X-macros; entries for other bytes are left as initialized by `= {}`.
constexpr std::array<TokenKind, 256>
    TokenizedBuffer::Lexer::OneCharTokenKindTable = [] {
  std::array<TokenKind, 256> table = {};
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = TokenKind::TokenName;
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
  table[(Spelling)[0]] = TokenKind::TokenName;
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
  table[(Spelling)[0]] = TokenKind::TokenName;
#include "toolchain/lex/token_kind.def"
  return table;
}();
  1028. auto TokenizedBuffer::Lex(SharedValueStores& value_stores, SourceBuffer& source,
  1029. DiagnosticConsumer& consumer) -> TokenizedBuffer {
  1030. Lexer lexer(value_stores, source, consumer);
  1031. return std::move(lexer).Lex();
  1032. }
  1033. auto TokenizedBuffer::GetKind(Token token) const -> TokenKind {
  1034. return GetTokenInfo(token).kind;
  1035. }
  1036. auto TokenizedBuffer::GetLine(Token token) const -> Line {
  1037. return GetTokenInfo(token).token_line;
  1038. }
  1039. auto TokenizedBuffer::GetLineNumber(Token token) const -> int {
  1040. return GetLineNumber(GetLine(token));
  1041. }
  1042. auto TokenizedBuffer::GetColumnNumber(Token token) const -> int {
  1043. return GetTokenInfo(token).column + 1;
  1044. }
// Returns the source spelling of `token`. Fixed-spelling tokens answer
// directly from the kind; literal tokens are re-lexed from the original
// source so author-written details (radix, digit separators, escapes) are
// preserved; identifiers come from the interned string store.
auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef fixed_spelling = token_info.kind.fixed_spelling();
  if (!fixed_spelling.empty()) {
    return fixed_spelling;
  }
  if (token_info.kind == TokenKind::Error) {
    // Error tokens record their length explicitly; slice the source text.
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    return source_->text().substr(token_start, token_info.error_length);
  }
  // Refer back to the source text to preserve oddities like radix or digit
  // separators the author included.
  if (token_info.kind == TokenKind::IntegerLiteral ||
      token_info.kind == TokenKind::RealLiteral) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    std::optional<NumericLiteral> relexed_token =
        NumericLiteral::Lex(source_->text().substr(token_start));
    CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
    return relexed_token->text();
  }
  // Refer back to the source text to find the original spelling, including
  // escape sequences etc.
  if (token_info.kind == TokenKind::StringLiteral) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    std::optional<StringLiteral> relexed_token =
        StringLiteral::Lex(source_->text().substr(token_start));
    CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
    return relexed_token->text();
  }
  // Refer back to the source text to avoid needing to reconstruct the
  // spelling from the size.
  if (token_info.kind.is_sized_type_literal()) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    // Scan the digits after the leading `i`/`u`/`f`, then widen the ref back
    // by one to include that leading character.
    llvm::StringRef suffix =
        source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
    return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  }
  if (token_info.kind == TokenKind::StartOfFile ||
      token_info.kind == TokenKind::EndOfFile) {
    return llvm::StringRef();
  }
  CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  return value_stores_->strings().Get(token_info.string_id);
}
  1093. auto TokenizedBuffer::GetIdentifier(Token token) const -> StringId {
  1094. const auto& token_info = GetTokenInfo(token);
  1095. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  1096. return token_info.string_id;
  1097. }
  1098. auto TokenizedBuffer::GetIntegerLiteral(Token token) const
  1099. -> const llvm::APInt& {
  1100. const auto& token_info = GetTokenInfo(token);
  1101. CARBON_CHECK(token_info.kind == TokenKind::IntegerLiteral) << token_info.kind;
  1102. return value_stores_->integers().Get(token_info.integer_id);
  1103. }
  1104. auto TokenizedBuffer::GetRealLiteral(Token token) const -> Real {
  1105. const auto& token_info = GetTokenInfo(token);
  1106. CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
  1107. return value_stores_->reals().Get(token_info.real_id);
  1108. }
  1109. auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
  1110. const auto& token_info = GetTokenInfo(token);
  1111. CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
  1112. return value_stores_->strings().Get(token_info.string_id);
  1113. }
  1114. auto TokenizedBuffer::GetTypeLiteralSize(Token token) const
  1115. -> const llvm::APInt& {
  1116. const auto& token_info = GetTokenInfo(token);
  1117. CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
  1118. return value_stores_->integers().Get(token_info.integer_id);
  1119. }
  1120. auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
  1121. -> Token {
  1122. const auto& opening_token_info = GetTokenInfo(opening_token);
  1123. CARBON_CHECK(opening_token_info.kind.is_opening_symbol())
  1124. << opening_token_info.kind;
  1125. return opening_token_info.closing_token;
  1126. }
  1127. auto TokenizedBuffer::GetMatchedOpeningToken(Token closing_token) const
  1128. -> Token {
  1129. const auto& closing_token_info = GetTokenInfo(closing_token);
  1130. CARBON_CHECK(closing_token_info.kind.is_closing_symbol())
  1131. << closing_token_info.kind;
  1132. return closing_token_info.opening_token;
  1133. }
  1134. auto TokenizedBuffer::HasLeadingWhitespace(Token token) const -> bool {
  1135. auto it = TokenIterator(token);
  1136. return it == tokens().begin() || GetTokenInfo(*(it - 1)).has_trailing_space;
  1137. }
  1138. auto TokenizedBuffer::HasTrailingWhitespace(Token token) const -> bool {
  1139. return GetTokenInfo(token).has_trailing_space;
  1140. }
  1141. auto TokenizedBuffer::IsRecoveryToken(Token token) const -> bool {
  1142. return GetTokenInfo(token).is_recovery;
  1143. }
  1144. auto TokenizedBuffer::GetLineNumber(Line line) const -> int {
  1145. return line.index + 1;
  1146. }
  1147. auto TokenizedBuffer::GetNextLine(Line line) const -> Line {
  1148. Line next(line.index + 1);
  1149. CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
  1150. return next;
  1151. }
  1152. auto TokenizedBuffer::GetPrevLine(Line line) const -> Line {
  1153. CARBON_CHECK(line.index > 0);
  1154. return Line(line.index - 1);
  1155. }
  1156. auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
  1157. return GetLineInfo(line).indent + 1;
  1158. }
  1159. auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  1160. index = std::max(widths.index, index);
  1161. kind = std::max(widths.kind, kind);
  1162. column = std::max(widths.column, column);
  1163. line = std::max(widths.line, line);
  1164. indent = std::max(widths.indent, indent);
  1165. }
  1166. // Compute the printed width of a number. When numbers are printed in decimal,
  1167. // the number of digits needed is is one more than the log-base-10 of the
  1168. // value. We handle a value of `zero` explicitly.
  1169. //
  1170. // This routine requires its argument to be *non-negative*.
  1171. static auto ComputeDecimalPrintedWidth(int number) -> int {
  1172. CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
  1173. if (number == 0) {
  1174. return 1;
  1175. }
  1176. return static_cast<int>(std::log10(number)) + 1;
  1177. }
// Computes the field widths needed to print `token`, so columns can be
// aligned across tokens. The index width uses the total token count so every
// token index fits.
auto TokenizedBuffer::GetTokenPrintWidths(Token token) const -> PrintWidths {
  PrintWidths widths = {};
  widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  widths.kind = GetKind(token).name().size();
  widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  widths.indent =
      ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  return widths;
}
// Prints the whole buffer: a filename header followed by one line per token.
// A first pass over all tokens computes the maximum field widths so the
// per-token columns line up; a second pass prints them.
auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
  if (tokens().begin() == tokens().end()) {
    return;
  }
  output_stream << "- filename: " << source_->filename() << "\n"
                << " tokens: [\n";
  PrintWidths widths = {};
  widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
  for (Token token : tokens()) {
    widths.Widen(GetTokenPrintWidths(token));
  }
  for (Token token : tokens()) {
    PrintToken(output_stream, token, widths);
    output_stream << "\n";
  }
  output_stream << " ]\n";
}
  1205. auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
  1206. Token token) const -> void {
  1207. PrintToken(output_stream, token, {});
  1208. }
// Prints one token, right-justifying numeric fields to `widths` (first
// widened so this token always fits), followed by kind-specific extras such
// as literal values and matched-group links.
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
                                 PrintWidths widths) const -> void {
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef token_text = GetTokenText(token);
  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      " { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: '{5}'",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
                          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
                           widths.indent),
      token_text);
  // Append fields specific to the token kind.
  switch (token_info.kind) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntegerLiteral:
      output_stream << ", value: `";
      GetIntegerLiteral(token).print(output_stream, /*isSigned=*/false);
      output_stream << "`";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: `" << GetRealLiteral(token) << "`";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: `" << GetStringLiteral(token) << "`";
      break;
    default:
      if (token_info.kind.is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind.is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }
  if (token_info.has_trailing_space) {
    output_stream << ", has_trailing_space: true";
  }
  if (token_info.is_recovery) {
    output_stream << ", recovery: true";
  }
  output_stream << " },";
}
  1262. auto TokenizedBuffer::GetLineInfo(Line line) -> LineInfo& {
  1263. return line_infos_[line.index];
  1264. }
  1265. auto TokenizedBuffer::GetLineInfo(Line line) const -> const LineInfo& {
  1266. return line_infos_[line.index];
  1267. }
  1268. auto TokenizedBuffer::AddLine(LineInfo info) -> Line {
  1269. line_infos_.push_back(info);
  1270. return Line(static_cast<int>(line_infos_.size()) - 1);
  1271. }
  1272. auto TokenizedBuffer::GetTokenInfo(Token token) -> TokenInfo& {
  1273. return token_infos_[token.index];
  1274. }
  1275. auto TokenizedBuffer::GetTokenInfo(Token token) const -> const TokenInfo& {
  1276. return token_infos_[token.index];
  1277. }
  1278. auto TokenizedBuffer::AddToken(TokenInfo info) -> Token {
  1279. token_infos_.push_back(info);
  1280. expected_parse_tree_size_ += info.kind.expected_parse_tree_size();
  1281. return Token(static_cast<int>(token_infos_.size()) - 1);
  1282. }
  1283. auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
  1284. output << token_.index;
  1285. }
// Translates a raw character pointer inside the source buffer into a
// DiagnosticLocation (filename, the containing line's text, and 1-based
// line/column numbers). `loc` must point into the buffer's text.
auto TokenizedBuffer::SourceBufferLocationTranslator::GetLocation(
    const char* loc) -> DiagnosticLocation {
  CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
      << "location not within buffer";
  // Byte offset of `loc` from the start of the buffer.
  int64_t offset = loc - buffer_->source_->text().begin();

  // Find the first line starting after the given location. Note that we can't
  // inspect `line.length` here because it is not necessarily correct for the
  // final line during lexing (but will be correct later for the parse tree).
  // `line_infos_` is ordered by `start`, so binary search applies.
  const auto* line_it = std::partition_point(
      buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(line_it != buffer_->line_infos_.begin())
      << "location precedes the start of the first line";
  --line_it;
  // Both numbers are 0-based here; they are converted to 1-based on return.
  int line_number = line_it - buffer_->line_infos_.begin();
  int column_number = offset - line_it->start;

  // Start by grabbing the line from the buffer. If the line isn't fully lexed,
  // the length will be npos and the line will be grabbed from the known start
  // to the end of the buffer; we'll then adjust the length.
  llvm::StringRef line =
      buffer_->source_->text().substr(line_it->start, line_it->length);
  if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
    CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
        << "Currently we assume no unlexed newlines prior to the error column, "
           "but there was one when erroring at "
        << buffer_->source_->filename() << ":" << line_number << ":"
        << column_number;
    // Look for the next newline since we don't know the length. We can start at
    // the column because prior newlines will have been lexed.
    auto end_newline_pos = line.find('\n', column_number);
    if (end_newline_pos != llvm::StringRef::npos) {
      // Trim the line to end just before the newline.
      line = line.take_front(end_newline_pos);
    }
  }

  return {.file_name = buffer_->source_->filename(),
          .line = line,
          .line_number = line_number + 1,
          .column_number = column_number + 1};
}
  1326. auto TokenLocationTranslator::GetLocation(Token token) -> DiagnosticLocation {
  1327. // Map the token location into a position within the source buffer.
  1328. const auto& token_info = buffer_->GetTokenInfo(token);
  1329. const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
  1330. const char* token_start =
  1331. buffer_->source_->text().begin() + line_info.start + token_info.column;
  1332. // Find the corresponding file location.
  1333. // TODO: Should we somehow indicate in the diagnostic location if this token
  1334. // is a recovery token that doesn't correspond to the original source?
  1335. return TokenizedBuffer::SourceBufferLocationTranslator(buffer_).GetLocation(
  1336. token_start);
  1337. }
  1338. } // namespace Carbon::Lex