// tokenized_buffer.cpp
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/tokenized_buffer.h"
  5. #include <algorithm>
  6. #include <array>
  7. #include <cmath>
  8. #include <iterator>
  9. #include <string>
  10. #include "common/check.h"
  11. #include "common/string_helpers.h"
  12. #include "llvm/ADT/STLExtras.h"
  13. #include "llvm/ADT/StringRef.h"
  14. #include "llvm/ADT/StringSwitch.h"
  15. #include "llvm/Support/ErrorHandling.h"
  16. #include "llvm/Support/Format.h"
  17. #include "llvm/Support/FormatVariadic.h"
  18. #include "llvm/Support/raw_ostream.h"
  19. #include "toolchain/lexer/character_set.h"
  20. #include "toolchain/lexer/lex_helpers.h"
  21. #include "toolchain/lexer/numeric_literal.h"
  22. #include "toolchain/lexer/string_literal.h"
  23. #if __x86_64__
  24. #include <x86intrin.h>
  25. #endif
  26. namespace Carbon {
  27. // TODO: Move Overload and VariantMatch somewhere more central.
  28. // Form an overload set from a list of functions. For example:
  29. //
  30. // ```
  31. // auto overloaded = Overload{[] (int) {}, [] (float) {}};
  32. // ```
  33. template <typename... Fs>
  34. struct Overload : Fs... {
  35. using Fs::operator()...;
  36. };
  37. template <typename... Fs>
  38. Overload(Fs...) -> Overload<Fs...>;
  39. // Pattern-match against the type of the value stored in the variant `V`. Each
  40. // element of `fs` should be a function that takes one or more of the variant
  41. // values in `V`.
  42. template <typename V, typename... Fs>
  43. auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
  44. return std::visit(Overload{std::forward<Fs&&>(fs)...}, std::forward<V&&>(v));
  45. }
// Scans the provided text and returns the prefix `StringRef` of contiguous
// identifier characters.
//
// This is a performance sensitive function and so uses vectorized code
// sequences to optimize its scanning. When modifying, the identifier lexing
// benchmarks should be checked for regressions.
//
// Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
//
// TODO: Currently, this code does not implement Carbon's design for Unicode
// characters in identifiers. It does work on UTF-8 code unit sequences, but
// currently considers non-ASCII characters to be non-identifier characters.
// Some work has been done to ensure the hot loop, while optimized, retains
// enough information to add Unicode handling without completely destroying the
// relevant optimizations.
static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
  // A table of booleans that we can use to classify bytes as being valid
  // identifier (or keyword) characters. This is used in the generic,
  // non-vectorized fallback code to scan for length of an identifier.
  // Built at compile time by an immediately-invoked constexpr lambda.
  constexpr std::array<bool, 256> IsIdentifierByteTable = []() constexpr {
    std::array<bool, 256> table = {};
    for (char c = '0'; c <= '9'; ++c) {
      table[c] = true;
    }
    for (char c = 'A'; c <= 'Z'; ++c) {
      table[c] = true;
    }
    for (char c = 'a'; c <= 'z'; ++c) {
      table[c] = true;
    }
    table['_'] = true;
    return table;
  }();

#if __x86_64__
  // This code uses a scheme derived from the techniques in Geoff Langdale and
  // Daniel Lemire's work on parsing JSON[1]. Specifically, that paper outlines
  // a technique of using two 4-bit indexed in-register look-up tables (LUTs) to
  // classify bytes in a branchless SIMD code sequence.
  //
  // [1]: https://arxiv.org/pdf/1902.08318.pdf
  //
  // The goal is to get a bit mask classifying different sets of bytes. For each
  // input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
  // character. Otherwise, we want the mask bits to be set with the following
  // logic derived by inspecting the high nibble and low nibble of the input:
  // bit0 = 1 for `_`: high `0x5` and low `0xF`
  // bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
  // bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
  // bit3 = 1 for `P-Z` and 'p-z': high `0x5` or `0x7` and low `0x0` - `0xA`
  // bit4 = unused
  // bit5 = unused
  // bit6 = unused
  // bit7 = unused
  //
  // No bits set means definitively non-ID ASCII character.
  //
  // bits 4-7 remain unused if we need to classify more characters.
  //
  // An identifier byte must be claimed by BOTH its high-nibble LUT entry and
  // its low-nibble LUT entry (the two lookups are ANDed below); any byte whose
  // ANDed classification is all-zero terminates the identifier.
  const auto high_lut = _mm_setr_epi8(
      /*0x0:*/ 0b0000'0000,
      /*0x1:*/ 0b0000'0000,
      /*0x2:*/ 0b0000'0000,
      /*0x3:*/ 0b0000'0010,
      /*0x4:*/ 0b0000'0100,
      /*0x5:*/ 0b0000'1001,
      /*0x6:*/ 0b0000'0100,
      /*0x7:*/ 0b0000'1000,
      /*0x8:*/ 0b0000'0000,
      /*0x9:*/ 0b0000'0000,
      /*0xA:*/ 0b0000'0000,
      /*0xB:*/ 0b0000'0000,
      /*0xC:*/ 0b0000'0000,
      /*0xD:*/ 0b0000'0000,
      /*0xE:*/ 0b0000'0000,
      /*0xF:*/ 0b0000'0000);
  const auto low_lut = _mm_setr_epi8(
      /*0x0:*/ 0b0000'1010,
      /*0x1:*/ 0b0000'1110,
      /*0x2:*/ 0b0000'1110,
      /*0x3:*/ 0b0000'1110,
      /*0x4:*/ 0b0000'1110,
      /*0x5:*/ 0b0000'1110,
      /*0x6:*/ 0b0000'1110,
      /*0x7:*/ 0b0000'1110,
      /*0x8:*/ 0b0000'1110,
      /*0x9:*/ 0b0000'1110,
      /*0xA:*/ 0b0000'1100,
      /*0xB:*/ 0b0000'0100,
      /*0xC:*/ 0b0000'0100,
      /*0xD:*/ 0b0000'0100,
      /*0xE:*/ 0b0000'0100,
      /*0xF:*/ 0b0000'0101);

  // Use `ssize_t` for performance here as we index memory in a tight loop.
  ssize_t i = 0;
  const ssize_t size = text.size();
  // Process 16 bytes at a time while at least 16 bytes remain.
  while ((i + 16) <= size) {
    __m128i input =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));

    // The high bits of each byte indicate a non-ASCII character encoded using
    // UTF-8. Test those and fall back to the scalar code if present. These
    // bytes will also cause spurious zeros in the LUT results, but we can
    // ignore that because we track them independently here.
#if __SSE4_1__
    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
      break;
    }
#else
    if (_mm_movemask_epi8(input) != 0) {
      break;
    }
#endif

    // Do two LUT lookups and mask the results together to get the results for
    // both low and high nibbles. Note that we don't need to mask out the high
    // bit of input here because we track that above for UTF-8 handling.
    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
    // Note that the input needs to be masked to only include the high nibble or
    // we could end up with bit7 set forcing the result to a zero byte.
    __m128i input_high =
        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
    __m128i mask = _mm_and_si128(low_mask, high_mask);

    // Now compare to find the completely zero bytes.
    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);

    // Check if there are bits in the tail mask, which means zero bytes and the
    // end of the identifier. We could do this without materializing the scalar
    // mask on more recent CPUs, but we generally expect the median length we
    // encounter to be <16 characters and so we avoid the extra instruction in
    // that case and predict this branch to succeed so it is laid out in a
    // reasonable way.
    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
      // Move past the definitively classified bytes that are part of the
      // identifier, and return the complete identifier text.
      i += __builtin_ctz(tail_ascii_mask);
      return text.substr(0, i);
    }
    i += 16;
  }

  // Fallback to scalar loop. We only end up here when we don't have >=16
  // bytes to scan or we find a UTF-8 unicode character.
  // TODO: This assumes all Unicode characters are non-identifiers.
  while (i < size &&
         IsIdentifierByteTable[static_cast<unsigned char>(text[i])]) {
    ++i;
  }
  return text.substr(0, i);
#else
  // TODO: Optimize this with SIMD for other architectures.
  return text.take_while([](char c) {
    return IsIdentifierByteTable[static_cast<unsigned char>(c)];
  });
#endif
}
// Implementation of the lexer logic itself.
//
// The design is that lexing can loop over the source buffer, consuming it into
// tokens by calling into this API. This class handles the state and breaks down
// the different lexing steps that may be used. It directly updates the provided
// tokenized buffer with the lexed tokens.
class TokenizedBuffer::Lexer {
 public:
  // Symbolic result of a lexing action. This indicates whether we successfully
  // lexed a token, or whether other lexing actions should be attempted.
  //
  // While it wraps a simple boolean state, its API both helps make the failures
  // more self documenting, and by consuming the actual token constructively
  // when one is produced, it helps ensure the correct result is returned.
  class LexResult {
   public:
    // Consumes (and discard) a valid token to construct a result
    // indicating a token has been produced. Relies on implicit conversions.
    // NOLINTNEXTLINE(google-explicit-constructor)
    LexResult(Token /*discarded_token*/) : LexResult(true) {}

    // Returns a result indicating no token was produced.
    static auto NoMatch() -> LexResult { return LexResult(false); }

    // Tests whether a token was produced by the lexing routine, and
    // the lexer can continue forming tokens.
    explicit operator bool() const { return formed_token_; }

   private:
    explicit LexResult(bool formed_token) : formed_token_(formed_token) {}

    bool formed_token_;
  };

  // Sets up lexing state against `buffer`, wiring both a source-location and a
  // token-location diagnostic emitter to `consumer`, and opening the first
  // (zero-offset) line.
  Lexer(TokenizedBuffer& buffer, DiagnosticConsumer& consumer)
      : buffer_(&buffer),
        translator_(&buffer),
        emitter_(translator_, consumer),
        token_translator_(&buffer),
        token_emitter_(token_translator_, consumer),
        current_line_(buffer.AddLine(LineInfo(0))),
        current_line_info_(&buffer.GetLineInfo(current_line_)) {}

  // Perform the necessary bookkeeping to step past a newline at the current
  // line and column: finalize the current line's length, open the next line
  // (starting one byte past the newline), and reset column/indent tracking.
  auto HandleNewline() -> void {
    current_line_info_->length = current_column_;

    current_line_ = buffer_->AddLine(
        LineInfo(current_line_info_->start + current_column_ + 1));
    current_line_info_ = &buffer_->GetLineInfo(current_line_);
    current_column_ = 0;
    set_indent_ = false;
  }

  // Marks the most recently lexed token (if any) as being followed by
  // whitespace.
  auto NoteWhitespace() -> void {
    if (!buffer_->token_infos_.empty()) {
      buffer_->token_infos_.back().has_trailing_space = true;
    }
  }

  // Advances past whitespace and comments, updating line/column bookkeeping.
  // Returns true if non-whitespace remains to lex, false when `source_text`
  // has been exhausted.
  auto SkipWhitespace(llvm::StringRef& source_text) -> bool {
    const char* const whitespace_start = source_text.begin();

    while (!source_text.empty()) {
      // We only support line-oriented commenting and lex comments as-if they
      // were whitespace.
      if (source_text.startswith("//")) {
        // Any comment must be the only non-whitespace on the line.
        if (set_indent_) {
          CARBON_DIAGNOSTIC(TrailingComment, Error,
                            "Trailing comments are not permitted.");

          emitter_.Emit(source_text.begin(), TrailingComment);
        }
        // The introducer '//' must be followed by whitespace or EOF.
        if (source_text.size() > 2 && !IsSpace(source_text[2])) {
          CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
                            "Whitespace is required after '//'.");
          emitter_.Emit(source_text.begin() + 2,
                        NoWhitespaceAfterCommentIntroducer);
        }
        // Consume the rest of the comment line (diagnosed or not).
        while (!source_text.empty() && source_text.front() != '\n') {
          ++current_column_;
          source_text = source_text.drop_front();
        }
        if (source_text.empty()) {
          break;
        }
      }

      switch (source_text.front()) {
        default:
          // If we find a non-whitespace character without exhausting the
          // buffer, return true to continue lexing.
          CARBON_CHECK(!IsSpace(source_text.front()));
          if (whitespace_start != source_text.begin()) {
            NoteWhitespace();
          }
          return true;

        case '\n':
          // If this is the last character in the source, directly return here
          // to avoid creating an empty line.
          source_text = source_text.drop_front();
          if (source_text.empty()) {
            current_line_info_->length = current_column_;
            return false;
          }

          // Otherwise, add a line and set up to continue lexing.
          HandleNewline();
          continue;

        case ' ':
        case '\t':
          // Skip other forms of whitespace while tracking column.
          // TODO: This obviously needs looooots more work to handle unicode
          // whitespace as well as special handling to allow better tokenization
          // of operators. This is just a stub to check that our column
          // management works.
          ++current_column_;
          source_text = source_text.drop_front();
          continue;
      }
    }

    CARBON_CHECK(source_text.empty())
        << "Cannot reach here w/o finishing the text!";
    // Update the line length as this is also the end of a line.
    current_line_info_->length = current_column_;
    return false;
  }

  // Tries to lex a numeric (integer or real) literal from the front of
  // `source_text`. On a match, consumes the text and records the computed
  // value(s) in the buffer's integer literal storage; an unrecoverable parse
  // produces an `Error` token covering the literal's text.
  auto LexNumericLiteral(llvm::StringRef& source_text) -> LexResult {
    std::optional<LexedNumericLiteral> literal =
        LexedNumericLiteral::Lex(source_text);
    if (!literal) {
      return LexResult::NoMatch();
    }

    int int_column = current_column_;
    int token_size = literal->text().size();
    current_column_ += token_size;
    source_text = source_text.drop_front(token_size);

    if (!set_indent_) {
      current_line_info_->indent = int_column;
      set_indent_ = true;
    }

    return VariantMatch(
        literal->ComputeValue(emitter_),
        [&](LexedNumericLiteral::IntegerValue&& value) {
          auto token = buffer_->AddToken({.kind = TokenKind::IntegerLiteral,
                                          .token_line = current_line_,
                                          .column = int_column});
          buffer_->GetTokenInfo(token).literal_index =
              buffer_->literal_int_storage_.size();
          buffer_->literal_int_storage_.push_back(std::move(value.value));
          return token;
        },
        [&](LexedNumericLiteral::RealValue&& value) {
          auto token = buffer_->AddToken({.kind = TokenKind::RealLiteral,
                                          .token_line = current_line_,
                                          .column = int_column});
          buffer_->GetTokenInfo(token).literal_index =
              buffer_->literal_int_storage_.size();
          // Reals store two consecutive entries: mantissa then exponent.
          buffer_->literal_int_storage_.push_back(std::move(value.mantissa));
          buffer_->literal_int_storage_.push_back(std::move(value.exponent));
          CARBON_CHECK(buffer_->GetRealLiteral(token).IsDecimal() ==
                       (value.radix == LexedNumericLiteral::Radix::Decimal));
          return token;
        },
        [&](LexedNumericLiteral::UnrecoverableError) {
          auto token = buffer_->AddToken({
              .kind = TokenKind::Error,
              .token_line = current_line_,
              .column = int_column,
              .error_length = token_size,
          });
          return token;
        });
  }

  // Tries to lex a string literal from the front of `source_text`. Handles
  // multi-line literals' line/column bookkeeping; an unterminated literal is
  // diagnosed and becomes an `Error` token.
  auto LexStringLiteral(llvm::StringRef& source_text) -> LexResult {
    std::optional<LexedStringLiteral> literal =
        LexedStringLiteral::Lex(source_text);
    if (!literal) {
      return LexResult::NoMatch();
    }

    Line string_line = current_line_;
    int string_column = current_column_;
    int literal_size = literal->text().size();
    source_text = source_text.drop_front(literal_size);

    if (!set_indent_) {
      current_line_info_->indent = string_column;
      set_indent_ = true;
    }

    // Update line and column information.
    if (!literal->is_multi_line()) {
      current_column_ += literal_size;
    } else {
      for (char c : literal->text()) {
        if (c == '\n') {
          HandleNewline();
          // The indentation of all lines in a multi-line string literal is
          // that of the first line.
          current_line_info_->indent = string_column;
          set_indent_ = true;
        } else {
          ++current_column_;
        }
      }
    }

    if (literal->is_terminated()) {
      auto token =
          buffer_->AddToken({.kind = TokenKind::StringLiteral,
                             .token_line = string_line,
                             .column = string_column,
                             .literal_index = static_cast<int32_t>(
                                 buffer_->literal_string_storage_.size())});
      buffer_->literal_string_storage_.push_back(
          literal->ComputeValue(emitter_));
      return token;
    } else {
      CARBON_DIAGNOSTIC(UnterminatedString, Error,
                        "String is missing a terminator.");
      emitter_.Emit(literal->text().begin(), UnterminatedString);
      return buffer_->AddToken({.kind = TokenKind::Error,
                                .token_line = string_line,
                                .column = string_column,
                                .error_length = literal_size});
    }
  }

  // Tries to lex a symbol token (operators, brackets, etc.) from the front of
  // `source_text`, using longest-prefix matching over the symbol spellings.
  // Maintains the open-group stack for bracketing symbols and diagnoses
  // unmatched closing symbols.
  auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
    TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
  .StartsWith(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
                         .Default(TokenKind::Error);
    if (kind == TokenKind::Error) {
      return LexResult::NoMatch();
    }

    if (!set_indent_) {
      current_line_info_->indent = current_column_;
      set_indent_ = true;
    }

    CloseInvalidOpenGroups(kind);

    const char* location = source_text.begin();
    Token token = buffer_->AddToken(
        {.kind = kind, .token_line = current_line_, .column = current_column_});
    current_column_ += kind.fixed_spelling().size();
    source_text = source_text.drop_front(kind.fixed_spelling().size());

    // Opening symbols just need to be pushed onto our queue of opening groups.
    if (kind.is_opening_symbol()) {
      open_groups_.push_back(token);
      return token;
    }

    // Only closing symbols need further special handling.
    if (!kind.is_closing_symbol()) {
      return token;
    }

    TokenInfo& closing_token_info = buffer_->GetTokenInfo(token);

    // Check that there is a matching opening symbol before we consume this as
    // a closing symbol.
    if (open_groups_.empty()) {
      closing_token_info.kind = TokenKind::Error;
      closing_token_info.error_length = kind.fixed_spelling().size();

      CARBON_DIAGNOSTIC(
          UnmatchedClosing, Error,
          "Closing symbol without a corresponding opening symbol.");
      emitter_.Emit(location, UnmatchedClosing);
      // Note that this still returns true as we do consume a symbol.
      return token;
    }

    // Finally can handle a normal closing symbol.
    Token opening_token = open_groups_.pop_back_val();
    TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
    opening_token_info.closing_token = token;
    closing_token_info.opening_token = opening_token;
    return token;
  }

  // Given a word that has already been lexed, determine whether it is a type
  // literal and if so form the corresponding token.
  auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
      -> LexResult {
    if (word.size() < 2) {
      // Too short to form one of these tokens.
      return LexResult::NoMatch();
    }
    if (!('1' <= word[1] && word[1] <= '9')) {
      // Doesn't start with a valid initial digit.
      return LexResult::NoMatch();
    }

    std::optional<TokenKind> kind;
    switch (word.front()) {
      case 'i':
        kind = TokenKind::IntegerTypeLiteral;
        break;
      case 'u':
        kind = TokenKind::UnsignedIntegerTypeLiteral;
        break;
      case 'f':
        kind = TokenKind::FloatingPointTypeLiteral;
        break;
      default:
        return LexResult::NoMatch();
    };

    llvm::StringRef suffix = word.substr(1);
    // A bit-width too large to lex becomes an error token covering the word.
    if (!CanLexInteger(emitter_, suffix)) {
      return buffer_->AddToken(
          {.kind = TokenKind::Error,
           .token_line = current_line_,
           .column = column,
           .error_length = static_cast<int32_t>(word.size())});
    }
    llvm::APInt suffix_value;
    if (suffix.getAsInteger(10, suffix_value)) {
      return LexResult::NoMatch();
    }

    auto token = buffer_->AddToken(
        {.kind = *kind, .token_line = current_line_, .column = column});
    buffer_->GetTokenInfo(token).literal_index =
        buffer_->literal_int_storage_.size();
    buffer_->literal_int_storage_.push_back(std::move(suffix_value));
    return token;
  }

  // Closes all open groups that cannot remain open across the symbol `K`.
  // Users may pass `Error` to close all open groups.
  auto CloseInvalidOpenGroups(TokenKind kind) -> void {
    if (!kind.is_closing_symbol() && kind != TokenKind::Error) {
      return;
    }

    while (!open_groups_.empty()) {
      Token opening_token = open_groups_.back();
      TokenKind opening_kind = buffer_->GetTokenInfo(opening_token).kind;
      if (kind == opening_kind.closing_symbol()) {
        return;
      }

      open_groups_.pop_back();
      CARBON_DIAGNOSTIC(
          MismatchedClosing, Error,
          "Closing symbol does not match most recent opening symbol.");
      token_emitter_.Emit(opening_token, MismatchedClosing);

      CARBON_CHECK(!buffer_->tokens().empty())
          << "Must have a prior opening token!";
      Token prev_token = buffer_->tokens().end()[-1];

      // TODO: do a smarter backwards scan for where to put the closing
      // token.
      Token closing_token = buffer_->AddToken(
          {.kind = opening_kind.closing_symbol(),
           .has_trailing_space = buffer_->HasTrailingWhitespace(prev_token),
           .is_recovery = true,
           .token_line = current_line_,
           .column = current_column_});
      TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
      TokenInfo& closing_token_info = buffer_->GetTokenInfo(closing_token);
      opening_token_info.closing_token = closing_token;
      closing_token_info.opening_token = opening_token;
    }
  }

  // Interns `text`, returning the existing `Identifier` if it has been seen
  // before and otherwise creating a new entry for it.
  auto GetOrCreateIdentifier(llvm::StringRef text) -> Identifier {
    auto insert_result = buffer_->identifier_map_.insert(
        {text, Identifier(buffer_->identifier_infos_.size())});
    if (insert_result.second) {
      buffer_->identifier_infos_.push_back({text});
    }
    return insert_result.first->second;
  }

  // Tries to lex a keyword, type literal, or identifier from the front of
  // `source_text`. Type literals and keywords take precedence over plain
  // identifiers, in that order.
  auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
    if (!IsAlpha(source_text.front()) && source_text.front() != '_') {
      return LexResult::NoMatch();
    }

    if (!set_indent_) {
      current_line_info_->indent = current_column_;
      set_indent_ = true;
    }

    // Take the valid characters off the front of the source buffer.
    llvm::StringRef identifier_text = ScanForIdentifierPrefix(source_text);
    CARBON_CHECK(!identifier_text.empty())
        << "Must have at least one character!";
    int identifier_column = current_column_;
    current_column_ += identifier_text.size();
    source_text = source_text.drop_front(identifier_text.size());

    // Check if the text is a type literal, and if so form such a literal.
    if (LexResult result =
            LexWordAsTypeLiteralToken(identifier_text, identifier_column)) {
      return result;
    }

    // Check if the text matches a keyword token, and if so use that.
    TokenKind kind = llvm::StringSwitch<TokenKind>(identifier_text)
#define CARBON_KEYWORD_TOKEN(Name, Spelling) .Case(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
                         .Default(TokenKind::Error);
    if (kind != TokenKind::Error) {
      return buffer_->AddToken({.kind = kind,
                                .token_line = current_line_,
                                .column = identifier_column});
    }

    // Otherwise we have a generic identifier.
    return buffer_->AddToken({.kind = TokenKind::Identifier,
                              .token_line = current_line_,
                              .column = identifier_column,
                              .id = GetOrCreateIdentifier(identifier_text)});
  }

  // Consumes a maximal run of unrecognized characters (at least one byte) into
  // a single `Error` token and diagnoses it. Stops at anything that could
  // begin a valid token or whitespace.
  auto LexError(llvm::StringRef& source_text) -> LexResult {
    llvm::StringRef error_text = source_text.take_while([](char c) {
      if (IsAlnum(c)) {
        return false;
      }
      switch (c) {
        case '_':
        case '\t':
        case '\n':
          return false;
      }
      // Any character that starts a symbol spelling also ends the error run.
      return llvm::StringSwitch<bool>(llvm::StringRef(&c, 1))
#define CARBON_SYMBOL_TOKEN(Name, Spelling) .StartsWith(Spelling, false)
#include "toolchain/lexer/token_kind.def"
          .Default(true);
    });
    if (error_text.empty()) {
      // TODO: Reimplement this to use the lexer properly. In the meantime,
      // guarantee that we eat at least one byte.
      error_text = source_text.take_front(1);
    }

    auto token = buffer_->AddToken(
        {.kind = TokenKind::Error,
         .token_line = current_line_,
         .column = current_column_,
         .error_length = static_cast<int32_t>(error_text.size())});
    CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
                      "Encountered unrecognized characters while parsing.");
    emitter_.Emit(error_text.begin(), UnrecognizedCharacters);

    current_column_ += error_text.size();
    source_text = source_text.drop_front(error_text.size());
    return token;
  }

  // Appends the synthetic end-of-file token at the current position.
  auto AddEndOfFileToken() -> void {
    buffer_->AddToken({.kind = TokenKind::EndOfFile,
                       .token_line = current_line_,
                       .column = current_column_});
  }

 private:
  // The buffer being filled in; non-owning.
  TokenizedBuffer* buffer_;

  // Diagnostic plumbing for source-location-based diagnostics.
  SourceBufferLocationTranslator translator_;
  LexerDiagnosticEmitter emitter_;

  // Diagnostic plumbing for token-based diagnostics.
  TokenLocationTranslator token_translator_;
  TokenDiagnosticEmitter token_emitter_;

  // The line currently being lexed and a cached pointer to its info.
  Line current_line_;
  LineInfo* current_line_info_;

  // Zero-based column within the current line.
  int current_column_ = 0;
  // Whether the current line's indent has been established yet.
  bool set_indent_ = false;

  // Stack of currently open (unclosed) grouping symbol tokens.
  llvm::SmallVector<Token> open_groups_;
};
  633. auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
  634. -> TokenizedBuffer {
  635. TokenizedBuffer buffer(source);
  636. ErrorTrackingDiagnosticConsumer error_tracking_consumer(consumer);
  637. Lexer lexer(buffer, error_tracking_consumer);
  638. llvm::StringRef source_text = source.text();
  639. while (lexer.SkipWhitespace(source_text)) {
  640. // Each time we find non-whitespace characters, try each kind of token we
  641. // support lexing, from simplest to most complex.
  642. Lexer::LexResult result = lexer.LexSymbolToken(source_text);
  643. if (!result) {
  644. result = lexer.LexKeywordOrIdentifier(source_text);
  645. }
  646. if (!result) {
  647. result = lexer.LexNumericLiteral(source_text);
  648. }
  649. if (!result) {
  650. result = lexer.LexStringLiteral(source_text);
  651. }
  652. if (!result) {
  653. result = lexer.LexError(source_text);
  654. }
  655. CARBON_CHECK(result) << "No token was lexed.";
  656. }
  657. // The end-of-file token is always considered to be whitespace.
  658. lexer.NoteWhitespace();
  659. lexer.CloseInvalidOpenGroups(TokenKind::Error);
  660. lexer.AddEndOfFileToken();
  661. if (error_tracking_consumer.seen_error()) {
  662. buffer.has_errors_ = true;
  663. }
  664. return buffer;
  665. }
// Returns the kind of the given token.
auto TokenizedBuffer::GetKind(Token token) const -> TokenKind {
  return GetTokenInfo(token).kind;
}
// Returns the line on which the given token starts.
auto TokenizedBuffer::GetLine(Token token) const -> Line {
  return GetTokenInfo(token).token_line;
}
// Returns the token's line number by delegating to the `Line` overload.
auto TokenizedBuffer::GetLineNumber(Token token) const -> int {
  return GetLineNumber(GetLine(token));
}
// Returns the token's 1-based column number; columns are stored 0-based
// internally.
auto TokenizedBuffer::GetColumnNumber(Token token) const -> int {
  return GetTokenInfo(token).column + 1;
}
  678. auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
  679. const auto& token_info = GetTokenInfo(token);
  680. llvm::StringRef fixed_spelling = token_info.kind.fixed_spelling();
  681. if (!fixed_spelling.empty()) {
  682. return fixed_spelling;
  683. }
  684. if (token_info.kind == TokenKind::Error) {
  685. const auto& line_info = GetLineInfo(token_info.token_line);
  686. int64_t token_start = line_info.start + token_info.column;
  687. return source_->text().substr(token_start, token_info.error_length);
  688. }
  689. // Refer back to the source text to preserve oddities like radix or digit
  690. // separators the author included.
  691. if (token_info.kind == TokenKind::IntegerLiteral ||
  692. token_info.kind == TokenKind::RealLiteral) {
  693. const auto& line_info = GetLineInfo(token_info.token_line);
  694. int64_t token_start = line_info.start + token_info.column;
  695. std::optional<LexedNumericLiteral> relexed_token =
  696. LexedNumericLiteral::Lex(source_->text().substr(token_start));
  697. CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
  698. return relexed_token->text();
  699. }
  700. // Refer back to the source text to find the original spelling, including
  701. // escape sequences etc.
  702. if (token_info.kind == TokenKind::StringLiteral) {
  703. const auto& line_info = GetLineInfo(token_info.token_line);
  704. int64_t token_start = line_info.start + token_info.column;
  705. std::optional<LexedStringLiteral> relexed_token =
  706. LexedStringLiteral::Lex(source_->text().substr(token_start));
  707. CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
  708. return relexed_token->text();
  709. }
  710. // Refer back to the source text to avoid needing to reconstruct the
  711. // spelling from the size.
  712. if (token_info.kind.is_sized_type_literal()) {
  713. const auto& line_info = GetLineInfo(token_info.token_line);
  714. int64_t token_start = line_info.start + token_info.column;
  715. llvm::StringRef suffix =
  716. source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
  717. return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  718. }
  719. if (token_info.kind == TokenKind::EndOfFile) {
  720. return llvm::StringRef();
  721. }
  722. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  723. return GetIdentifierText(token_info.id);
  724. }
  725. auto TokenizedBuffer::GetIdentifier(Token token) const -> Identifier {
  726. const auto& token_info = GetTokenInfo(token);
  727. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  728. return token_info.id;
  729. }
  730. auto TokenizedBuffer::GetIntegerLiteral(Token token) const
  731. -> const llvm::APInt& {
  732. const auto& token_info = GetTokenInfo(token);
  733. CARBON_CHECK(token_info.kind == TokenKind::IntegerLiteral) << token_info.kind;
  734. return literal_int_storage_[token_info.literal_index];
  735. }
  736. auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
  737. const auto& token_info = GetTokenInfo(token);
  738. CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
  739. // Note that every real literal is at least three characters long, so we can
  740. // safely look at the second character to determine whether we have a
  741. // decimal or hexadecimal literal.
  742. const auto& line_info = GetLineInfo(token_info.token_line);
  743. int64_t token_start = line_info.start + token_info.column;
  744. char second_char = source_->text()[token_start + 1];
  745. bool is_decimal = second_char != 'x' && second_char != 'b';
  746. return RealLiteralValue(this, token_info.literal_index, is_decimal);
  747. }
  748. auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
  749. const auto& token_info = GetTokenInfo(token);
  750. CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
  751. return literal_string_storage_[token_info.literal_index];
  752. }
  753. auto TokenizedBuffer::GetTypeLiteralSize(Token token) const
  754. -> const llvm::APInt& {
  755. const auto& token_info = GetTokenInfo(token);
  756. CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
  757. return literal_int_storage_[token_info.literal_index];
  758. }
  759. auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
  760. -> Token {
  761. const auto& opening_token_info = GetTokenInfo(opening_token);
  762. CARBON_CHECK(opening_token_info.kind.is_opening_symbol())
  763. << opening_token_info.kind;
  764. return opening_token_info.closing_token;
  765. }
  766. auto TokenizedBuffer::GetMatchedOpeningToken(Token closing_token) const
  767. -> Token {
  768. const auto& closing_token_info = GetTokenInfo(closing_token);
  769. CARBON_CHECK(closing_token_info.kind.is_closing_symbol())
  770. << closing_token_info.kind;
  771. return closing_token_info.opening_token;
  772. }
  773. auto TokenizedBuffer::HasLeadingWhitespace(Token token) const -> bool {
  774. auto it = TokenIterator(token);
  775. return it == tokens().begin() || GetTokenInfo(*(it - 1)).has_trailing_space;
  776. }
  777. auto TokenizedBuffer::HasTrailingWhitespace(Token token) const -> bool {
  778. return GetTokenInfo(token).has_trailing_space;
  779. }
  780. auto TokenizedBuffer::IsRecoveryToken(Token token) const -> bool {
  781. return GetTokenInfo(token).is_recovery;
  782. }
  783. auto TokenizedBuffer::GetLineNumber(Line line) const -> int {
  784. return line.index + 1;
  785. }
  786. auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
  787. return GetLineInfo(line).indent + 1;
  788. }
  789. auto TokenizedBuffer::GetIdentifierText(Identifier identifier) const
  790. -> llvm::StringRef {
  791. return identifier_infos_[identifier.index].text;
  792. }
  793. auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  794. index = std::max(widths.index, index);
  795. kind = std::max(widths.kind, kind);
  796. column = std::max(widths.column, column);
  797. line = std::max(widths.line, line);
  798. indent = std::max(widths.indent, indent);
  799. }
  800. // Compute the printed width of a number. When numbers are printed in decimal,
  801. // the number of digits needed is is one more than the log-base-10 of the
  802. // value. We handle a value of `zero` explicitly.
  803. //
  804. // This routine requires its argument to be *non-negative*.
  805. static auto ComputeDecimalPrintedWidth(int number) -> int {
  806. CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
  807. if (number == 0) {
  808. return 1;
  809. }
  810. return static_cast<int>(std::log10(number)) + 1;
  811. }
  812. auto TokenizedBuffer::GetTokenPrintWidths(Token token) const -> PrintWidths {
  813. PrintWidths widths = {};
  814. widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  815. widths.kind = GetKind(token).name().size();
  816. widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  817. widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  818. widths.indent =
  819. ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  820. return widths;
  821. }
  822. auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
  823. if (tokens().begin() == tokens().end()) {
  824. return;
  825. }
  826. PrintWidths widths = {};
  827. widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
  828. for (Token token : tokens()) {
  829. widths.Widen(GetTokenPrintWidths(token));
  830. }
  831. output_stream << "[\n";
  832. for (Token token : tokens()) {
  833. PrintToken(output_stream, token, widths);
  834. output_stream << "\n";
  835. }
  836. output_stream << "]\n";
  837. }
// Prints a single token with no pre-computed alignment widths; the
// three-argument overload widens the defaulted widths to fit this token.
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
                                 Token token) const -> void {
  PrintToken(output_stream, token, {});
}
// Prints one token as a single line of the bracketed dump format. `widths`
// is first widened to fit this token so fields align at least to its own
// sizes.
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
                                 PrintWidths widths) const -> void {
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef token_text = GetTokenText(token);

  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      "{ index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: '{5}'",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
                          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
                           widths.indent),
      token_text);

  // Append kind-specific trailing fields.
  switch (token_info.kind) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntegerLiteral:
      output_stream << ", value: `";
      GetIntegerLiteral(token).print(output_stream, /*isSigned=*/false);
      output_stream << "`";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: `" << GetRealLiteral(token) << "`";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: `" << GetStringLiteral(token) << "`";
      break;
    default:
      // Grouping symbols cross-reference their matched partner token.
      if (token_info.kind.is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind.is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }

  if (token_info.has_trailing_space) {
    output_stream << ", has_trailing_space: true";
  }
  if (token_info.is_recovery) {
    output_stream << ", recovery: true";
  }
  output_stream << " },";
}
// Returns mutable per-line information for `line`.
auto TokenizedBuffer::GetLineInfo(Line line) -> LineInfo& {
  return line_infos_[line.index];
}
// Returns read-only per-line information for `line`.
auto TokenizedBuffer::GetLineInfo(Line line) const -> const LineInfo& {
  return line_infos_[line.index];
}
  901. auto TokenizedBuffer::AddLine(LineInfo info) -> Line {
  902. line_infos_.push_back(info);
  903. return Line(static_cast<int>(line_infos_.size()) - 1);
  904. }
// Returns mutable per-token information for `token`.
auto TokenizedBuffer::GetTokenInfo(Token token) -> TokenInfo& {
  return token_infos_[token.index];
}
// Returns read-only per-token information for `token`.
auto TokenizedBuffer::GetTokenInfo(Token token) const -> const TokenInfo& {
  return token_infos_[token.index];
}
  911. auto TokenizedBuffer::AddToken(TokenInfo info) -> Token {
  912. token_infos_.push_back(info);
  913. expected_parse_tree_size_ += info.kind.expected_parse_tree_size();
  914. return Token(static_cast<int>(token_infos_.size()) - 1);
  915. }
// Prints the iterator's position as the underlying token index.
auto TokenizedBuffer::TokenIterator::Print(llvm::raw_ostream& output) const
    -> void {
  output << token_.index;
}
// Maps a raw pointer into the source buffer to a diagnostic location
// (file name, line text, 1-based line and column numbers). `loc` must point
// within the buffer's text.
auto TokenizedBuffer::SourceBufferLocationTranslator::GetLocation(
    const char* loc) -> DiagnosticLocation {
  CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
      << "location not within buffer";
  int64_t offset = loc - buffer_->source_->text().begin();

  // Find the first line starting after the given location. Note that we can't
  // inspect `line.length` here because it is not necessarily correct for the
  // final line during lexing (but will be correct later for the parse tree).
  // Line starts are monotonically increasing, so a binary search applies.
  const auto* line_it = std::partition_point(
      buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(line_it != buffer_->line_infos_.begin())
      << "location precedes the start of the first line";
  --line_it;
  int line_number = line_it - buffer_->line_infos_.begin();
  int column_number = offset - line_it->start;

  // Start by grabbing the line from the buffer. If the line isn't fully lexed,
  // the length will be npos and the line will be grabbed from the known start
  // to the end of the buffer; we'll then adjust the length.
  llvm::StringRef line =
      buffer_->source_->text().substr(line_it->start, line_it->length);
  if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
    CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
        << "Currently we assume no unlexed newlines prior to the error column, "
           "but there was one when erroring at "
        << buffer_->source_->filename() << ":" << line_number << ":"
        << column_number;
    // Look for the next newline since we don't know the length. We can start at
    // the column because prior newlines will have been lexed.
    auto end_newline_pos = line.find('\n', column_number);
    if (end_newline_pos != llvm::StringRef::npos) {
      line = line.take_front(end_newline_pos);
    }
  }

  // Internal numbers are zero-based; diagnostics use one-based numbering.
  return {.file_name = buffer_->source_->filename(),
          .line = line,
          .line_number = line_number + 1,
          .column_number = column_number + 1};
}
// Maps a token to a diagnostic location by computing the token's start
// pointer within the source buffer and delegating to the source-buffer
// location translator.
auto TokenizedBuffer::TokenLocationTranslator::GetLocation(Token token)
    -> DiagnosticLocation {
  // Map the token location into a position within the source buffer.
  const auto& token_info = buffer_->GetTokenInfo(token);
  const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
  const char* token_start =
      buffer_->source_->text().begin() + line_info.start + token_info.column;

  // Find the corresponding file location.
  // TODO: Should we somehow indicate in the diagnostic location if this token
  // is a recovery token that doesn't correspond to the original source?
  return SourceBufferLocationTranslator(buffer_).GetLocation(token_start);
}
  972. } // namespace Carbon