tokenized_buffer.cpp
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/tokenized_buffer.h"
  5. #include <algorithm>
  6. #include <array>
  7. #include <cmath>
  8. #include "common/check.h"
  9. #include "common/string_helpers.h"
  10. #include "llvm/ADT/StringRef.h"
  11. #include "llvm/ADT/StringSwitch.h"
  12. #include "llvm/Support/ErrorHandling.h"
  13. #include "llvm/Support/Format.h"
  14. #include "llvm/Support/FormatVariadic.h"
  15. #include "llvm/Support/raw_ostream.h"
  16. #include "toolchain/lex/character_set.h"
  17. #include "toolchain/lex/helpers.h"
  18. #include "toolchain/lex/numeric_literal.h"
  19. #include "toolchain/lex/string_literal.h"
  20. #if __x86_64__
  21. #include <x86intrin.h>
  22. #endif
  23. namespace Carbon::Lex {
  24. // TODO: Move Overload and VariantMatch somewhere more central.
  25. // Form an overload set from a list of functions. For example:
  26. //
  27. // ```
  28. // auto overloaded = Overload{[] (int) {}, [] (float) {}};
  29. // ```
  30. template <typename... Fs>
  31. struct Overload : Fs... {
  32. using Fs::operator()...;
  33. };
  34. template <typename... Fs>
  35. Overload(Fs...) -> Overload<Fs...>;
  36. // Pattern-match against the type of the value stored in the variant `V`. Each
  37. // element of `fs` should be a function that takes one or more of the variant
  38. // values in `V`.
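//
// For example (an illustrative sketch; the variant type and lambdas here are
// hypothetical, not taken from this file):
//
// ```
// std::variant<int, float> v = 42;
// VariantMatch(v, [](int i) { /*...*/ }, [](float f) { /*...*/ });
// ```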
  39. template <typename V, typename... Fs>
  40. auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
  41. return std::visit(Overload{std::forward<Fs>(fs)...}, std::forward<V>(v));
  42. }
  43. // Scans the provided text and returns the prefix `StringRef` of contiguous
  44. // identifier characters.
  45. //
  46. // This is a performance sensitive function and so uses vectorized code
  47. // sequences to optimize its scanning. When modifying, the identifier lexing
  48. // benchmarks should be checked for regressions.
  49. //
  50. // Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
  51. //
  52. // TODO: Currently, this code does not implement Carbon's design for Unicode
  53. // characters in identifiers. It does work on UTF-8 code unit sequences, but
  54. // currently considers non-ASCII characters to be non-identifier characters.
  55. // Some work has been done to ensure the hot loop, while optimized, retains
  56. // enough information to add Unicode handling without completely destroying the
  57. // relevant optimizations.
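//
// For example (illustrative only), given the text `count += 1;` this returns
// the prefix `count`, because `+` is not an identifier character.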
  58. static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
  59. // A table of booleans that we can use to classify bytes as being valid
  60. // identifier (or keyword) characters. This is used in the generic,
  61. // non-vectorized fallback code to scan for length of an identifier.
  62. static constexpr std::array<bool, 256> IsIdByteTable = ([]() constexpr {
  63. std::array<bool, 256> table = {};
  64. for (char c = '0'; c <= '9'; ++c) {
  65. table[c] = true;
  66. }
  67. for (char c = 'A'; c <= 'Z'; ++c) {
  68. table[c] = true;
  69. }
  70. for (char c = 'a'; c <= 'z'; ++c) {
  71. table[c] = true;
  72. }
  73. table['_'] = true;
  74. return table;
  75. })();
  76. #if __x86_64__
  77. // This code uses a scheme derived from the techniques in Geoff Langdale and
  78. // Daniel Lemire's work on parsing JSON[1]. Specifically, that paper outlines
  79. // a technique of using two 4-bit indexed in-register look-up tables (LUTs) to
  80. // classify bytes in a branchless SIMD code sequence.
  81. //
  82. // [1]: https://arxiv.org/pdf/1902.08318.pdf
  83. //
  84. // The goal is to get a bit mask classifying different sets of bytes. For each
  85. // input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
  86. // character. Otherwise, we want the mask bits to be set with the following
  87. // logic derived by inspecting the high nibble and low nibble of the input:
  88. // bit0 = 1 for `_`: high `0x5` and low `0xF`
  89. // bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
  90. // bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
  91. // bit3 = 1 for `P-Z` and `p-z`: high `0x5` or `0x7` and low `0x0` - `0xA`
  92. // bit4 = unused
  93. // bit5 = unused
  94. // bit6 = unused
  95. // bit7 = unused
  96. //
  97. // No bits set means definitively non-ID ASCII character.
  98. //
  99. // Bits 4-7 remain unused, available if we need to classify more characters.
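//
// As a worked example (illustrative only): the byte `g` is 0x67, so its high
// nibble 0x6 selects 0b0000'0100 from `high_lut` and its low nibble 0x7
// selects 0b0000'1110 from `low_lut`; their AND is 0b0000'0100, a non-zero
// byte, so `g` is an identifier character. The byte `+` is 0x2B, whose high
// nibble 0x2 selects 0b0000'0000, so the AND is zero and `+` ends the
// identifier.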
  100. const auto high_lut = _mm_setr_epi8(
  101. /* __b0=*/0b0000'0000,
  102. /* __b1=*/0b0000'0000,
  103. /* __b2=*/0b0000'0000,
  104. /* __b3=*/0b0000'0010,
  105. /* __b4=*/0b0000'0100,
  106. /* __b5=*/0b0000'1001,
  107. /* __b6=*/0b0000'0100,
  108. /* __b7=*/0b0000'1000,
  109. /* __b8=*/0b0000'0000,
  110. /* __b9=*/0b0000'0000,
  111. /*__b10=*/0b0000'0000,
  112. /*__b11=*/0b0000'0000,
  113. /*__b12=*/0b0000'0000,
  114. /*__b13=*/0b0000'0000,
  115. /*__b14=*/0b0000'0000,
  116. /*__b15=*/0b0000'0000);
  117. const auto low_lut = _mm_setr_epi8(
  118. /* __b0=*/0b0000'1010,
  119. /* __b1=*/0b0000'1110,
  120. /* __b2=*/0b0000'1110,
  121. /* __b3=*/0b0000'1110,
  122. /* __b4=*/0b0000'1110,
  123. /* __b5=*/0b0000'1110,
  124. /* __b6=*/0b0000'1110,
  125. /* __b7=*/0b0000'1110,
  126. /* __b8=*/0b0000'1110,
  127. /* __b9=*/0b0000'1110,
  128. /*__b10=*/0b0000'1100,
  129. /*__b11=*/0b0000'0100,
  130. /*__b12=*/0b0000'0100,
  131. /*__b13=*/0b0000'0100,
  132. /*__b14=*/0b0000'0100,
  133. /*__b15=*/0b0000'0101);
  134. // Use `ssize_t` for performance here as we index memory in a tight loop.
  135. ssize_t i = 0;
  136. const ssize_t size = text.size();
  137. while ((i + 16) <= size) {
  138. __m128i input =
  139. _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));
  140. // The high bits of each byte indicate a non-ASCII character encoded using
  141. // UTF-8. Test those and fall back to the scalar code if present. These
  142. // bytes will also cause spurious zeros in the LUT results, but we can
  143. // ignore that because we track them independently here.
  144. #if __SSE4_1__
  145. if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
  146. break;
  147. }
  148. #else
  149. if (_mm_movemask_epi8(input) != 0) {
  150. break;
  151. }
  152. #endif
  153. // Do two LUT lookups and mask the results together to get the results for
  154. // both low and high nibbles. Note that we don't need to mask out the high
  155. // bit of input here because we track that above for UTF-8 handling.
  156. __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
  157. // Note that the input needs to be masked to only include the high nibble or
  158. // we could end up with bit7 set forcing the result to a zero byte.
  159. __m128i input_high =
  160. _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
  161. __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
  162. __m128i mask = _mm_and_si128(low_mask, high_mask);
  163. // Now compare to find the completely zero bytes.
  164. __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
  165. int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);
  166. // Check if there are bits in the tail mask, which means zero bytes and the
  167. // end of the identifier. We could do this without materializing the scalar
  168. // mask on more recent CPUs, but we generally expect the median length we
  169. // encounter to be <16 characters and so we avoid the extra instruction in
  170. // that case and predict this branch to succeed so it is laid out in a
  171. // reasonable way.
  172. if (LLVM_LIKELY(tail_ascii_mask != 0)) {
  173. // Move past the definitively classified bytes that are part of the
  174. // identifier, and return the complete identifier text.
  175. i += __builtin_ctz(tail_ascii_mask);
  176. return text.substr(0, i);
  177. }
  178. i += 16;
  179. }
  180. // Fall back to a scalar loop. We only end up here when we don't have >=16
  181. // bytes left to scan or when we find a UTF-8-encoded Unicode character.
  182. // TODO: This assumes all Unicode characters are non-identifiers.
  183. while (i < size && IsIdByteTable[static_cast<unsigned char>(text[i])]) {
  184. ++i;
  185. }
  186. return text.substr(0, i);
  187. #else
  188. // TODO: Optimize this with SIMD for other architectures.
  189. return text.take_while(
  190. [](char c) { return IsIdByteTable[static_cast<unsigned char>(c)]; });
  191. #endif
  192. }
  193. // Implementation of the lexer logic itself.
  194. //
  195. // The design is that lexing can loop over the source buffer, consuming it into
  196. // tokens by calling into this API. This class handles the state and breaks down
  197. // the different lexing steps that may be used. It directly updates the provided
  198. // tokenized buffer with the lexed tokens.
  199. class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
  200. public:
  201. // Symbolic result of a lexing action. This indicates whether we successfully
  202. // lexed a token, or whether other lexing actions should be attempted.
  203. //
  204. // While it wraps a simple boolean state, its API both helps make the failures
  205. // more self-documenting, and by consuming the actual token constructively
  206. // when one is produced, it helps ensure the correct result is returned.
  207. class LexResult {
  208. public:
  209. // Consumes (and discards) a valid token to construct a result
  210. // indicating a token has been produced. Relies on implicit conversions.
  211. // NOLINTNEXTLINE(google-explicit-constructor)
  212. LexResult(Token /*discarded_token*/) : LexResult(true) {}
  213. // Returns a result indicating no token was produced.
  214. static auto NoMatch() -> LexResult { return LexResult(false); }
  215. // Tests whether a token was produced by the lexing routine, and
  216. // the lexer can continue forming tokens.
  217. explicit operator bool() const { return formed_token_; }
  218. private:
  219. explicit LexResult(bool formed_token) : formed_token_(formed_token) {}
  220. bool formed_token_;
  221. };
  222. Lexer(TokenizedBuffer& buffer, DiagnosticConsumer& consumer)
  223. : buffer_(&buffer),
  224. translator_(&buffer),
  225. emitter_(translator_, consumer),
  226. token_translator_(&buffer),
  227. token_emitter_(token_translator_, consumer),
  228. current_line_(buffer.AddLine(LineInfo(0))),
  229. current_line_info_(&buffer.GetLineInfo(current_line_)) {}
  230. // Perform the necessary bookkeeping to step past a newline at the current
  231. // line and column.
  232. auto HandleNewline() -> void {
  233. current_line_info_->length = current_column_;
  234. current_line_ = buffer_->AddLine(
  235. LineInfo(current_line_info_->start + current_column_ + 1));
  236. current_line_info_ = &buffer_->GetLineInfo(current_line_);
  237. current_column_ = 0;
  238. set_indent_ = false;
  239. }
  240. auto NoteWhitespace() -> void {
  241. buffer_->token_infos_.back().has_trailing_space = true;
  242. }
  243. auto LexHorizontalWhitespace(llvm::StringRef& source_text) -> void {
  244. CARBON_DCHECK(source_text.front() == ' ' || source_text.front() == '\t');
  245. NoteWhitespace();
  246. ++current_column_;
  247. source_text = source_text.drop_front();
  248. }
  249. auto LexVerticalWhitespace(llvm::StringRef& source_text) -> void {
  250. CARBON_DCHECK(source_text.front() == '\n');
  251. NoteWhitespace();
  252. source_text = source_text.drop_front();
  253. // If this is the last character in the source, directly return here
  254. // to avoid creating an empty line.
  255. if (LLVM_UNLIKELY(source_text.empty())) {
  256. current_line_info_->length = current_column_;
  257. return;
  258. }
  259. // Otherwise, add a line and set up to continue lexing.
  260. HandleNewline();
  261. }
  262. auto LexCommentOrSlash(llvm::StringRef& source_text) -> void {
  263. CARBON_DCHECK(source_text.front() == '/');
  264. // Both comments and slash symbols start with a `/`. We disambiguate with a
  265. // max-munch rule -- if the next character is another `/` then we lex it as
  266. // a comment start. If it isn't, then we lex as a slash.
  267. if (source_text.size() > 1 && source_text[1] == '/') {
  268. LexComment(source_text);
  269. return;
  270. }
  271. // This code path should produce a token, make sure that happens.
  272. LexResult result = LexSymbolToken(source_text);
  273. CARBON_CHECK(result) << "Failed to form a token!";
  274. }
  275. auto LexComment(llvm::StringRef& source_text) -> void {
  276. CARBON_DCHECK(source_text.startswith("//"));
  277. // Any comment must be the only non-whitespace on the line.
  278. if (set_indent_) {
  279. CARBON_DIAGNOSTIC(TrailingComment, Error,
  280. "Trailing comments are not permitted.");
  281. emitter_.Emit(source_text.begin(), TrailingComment);
  282. }
  283. // The introducer '//' must be followed by whitespace or EOF.
  284. if (source_text.size() > 2 && !IsSpace(source_text[2])) {
  285. CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
  286. "Whitespace is required after '//'.");
  287. emitter_.Emit(source_text.begin() + 2,
  288. NoWhitespaceAfterCommentIntroducer);
  289. }
  290. // Now just consume the text until a newline.
  291. while (!source_text.empty() && source_text.front() != '\n') {
  292. ++current_column_;
  293. source_text = source_text.drop_front();
  294. }
  295. // We don't handle the newline, just fall back to the lex loop to handle it
  296. // generically.
  297. }
  298. auto LexNumericLiteral(llvm::StringRef& source_text) -> LexResult {
  299. std::optional<NumericLiteral> literal = NumericLiteral::Lex(source_text);
  300. if (!literal) {
  301. return LexError(source_text);
  302. }
  303. int int_column = current_column_;
  304. int token_size = literal->text().size();
  305. current_column_ += token_size;
  306. source_text = source_text.drop_front(token_size);
  307. if (!set_indent_) {
  308. current_line_info_->indent = int_column;
  309. set_indent_ = true;
  310. }
  311. return VariantMatch(
  312. literal->ComputeValue(emitter_),
  313. [&](NumericLiteral::IntegerValue&& value) {
  314. auto token = buffer_->AddToken({.kind = TokenKind::IntegerLiteral,
  315. .token_line = current_line_,
  316. .column = int_column});
  317. buffer_->GetTokenInfo(token).literal_index =
  318. buffer_->literal_int_storage_.size();
  319. buffer_->literal_int_storage_.push_back(std::move(value.value));
  320. return token;
  321. },
  322. [&](NumericLiteral::RealValue&& value) {
  323. auto token = buffer_->AddToken({.kind = TokenKind::RealLiteral,
  324. .token_line = current_line_,
  325. .column = int_column});
  326. buffer_->GetTokenInfo(token).literal_index =
  327. buffer_->literal_int_storage_.size();
  328. buffer_->literal_int_storage_.push_back(std::move(value.mantissa));
  329. buffer_->literal_int_storage_.push_back(std::move(value.exponent));
  330. CARBON_CHECK(buffer_->GetRealLiteral(token).is_decimal ==
  331. (value.radix == NumericLiteral::Radix::Decimal));
  332. return token;
  333. },
  334. [&](NumericLiteral::UnrecoverableError) {
  335. auto token = buffer_->AddToken({
  336. .kind = TokenKind::Error,
  337. .token_line = current_line_,
  338. .column = int_column,
  339. .error_length = token_size,
  340. });
  341. return token;
  342. });
  343. }
  344. auto LexStringLiteral(llvm::StringRef& source_text) -> LexResult {
  345. std::optional<StringLiteral> literal = StringLiteral::Lex(source_text);
  346. if (!literal) {
  347. return LexError(source_text);
  348. }
  349. Line string_line = current_line_;
  350. int string_column = current_column_;
  351. int literal_size = literal->text().size();
  352. source_text = source_text.drop_front(literal_size);
  353. if (!set_indent_) {
  354. current_line_info_->indent = string_column;
  355. set_indent_ = true;
  356. }
  357. // Update line and column information.
  358. if (!literal->is_multi_line()) {
  359. current_column_ += literal_size;
  360. } else {
  361. for (char c : literal->text()) {
  362. if (c == '\n') {
  363. HandleNewline();
  364. // The indentation of all lines in a multi-line string literal is
  365. // that of the first line.
  366. current_line_info_->indent = string_column;
  367. set_indent_ = true;
  368. } else {
  369. ++current_column_;
  370. }
  371. }
  372. }
  373. if (literal->is_terminated()) {
  374. auto token =
  375. buffer_->AddToken({.kind = TokenKind::StringLiteral,
  376. .token_line = string_line,
  377. .column = string_column,
  378. .literal_index = static_cast<int32_t>(
  379. buffer_->literal_string_storage_.size())});
  380. buffer_->literal_string_storage_.push_back(
  381. literal->ComputeValue(emitter_));
  382. return token;
  383. } else {
  384. CARBON_DIAGNOSTIC(UnterminatedString, Error,
  385. "String is missing a terminator.");
  386. emitter_.Emit(literal->text().begin(), UnterminatedString);
  387. return buffer_->AddToken({.kind = TokenKind::Error,
  388. .token_line = string_line,
  389. .column = string_column,
  390. .error_length = literal_size});
  391. }
  392. }
  393. auto LexSymbolToken(llvm::StringRef& source_text,
  394. TokenKind kind = TokenKind::Error) -> LexResult {
  395. auto compute_symbol_kind = [](llvm::StringRef source_text) {
  396. return llvm::StringSwitch<TokenKind>(source_text)
  397. #define CARBON_SYMBOL_TOKEN(Name, Spelling) \
  398. .StartsWith(Spelling, TokenKind::Name)
  399. #include "toolchain/lex/token_kind.def"
  400. .Default(TokenKind::Error);
  401. };
  402. // We use the `error` token as a place-holder for cases where one character
  403. // isn't enough to pick a definitive symbol token. Recompute the kind using
  404. // the full symbol set.
  405. if (LLVM_UNLIKELY(kind == TokenKind::Error)) {
  406. kind = compute_symbol_kind(source_text);
  407. if (kind == TokenKind::Error) {
  408. return LexError(source_text);
  409. }
  410. } else {
  411. // Verify in a debug build that the incoming token kind is correct.
  412. CARBON_DCHECK(kind == compute_symbol_kind(source_text))
  413. << "Incoming token kind '" << kind
  414. << "' does not match computed kind '"
  415. << compute_symbol_kind(source_text) << "'!";
  416. }
  417. if (!set_indent_) {
  418. current_line_info_->indent = current_column_;
  419. set_indent_ = true;
  420. }
  421. CloseInvalidOpenGroups(kind);
  422. const char* location = source_text.begin();
  423. Token token = buffer_->AddToken(
  424. {.kind = kind, .token_line = current_line_, .column = current_column_});
  425. current_column_ += kind.fixed_spelling().size();
  426. source_text = source_text.drop_front(kind.fixed_spelling().size());
  427. // Opening symbols just need to be pushed onto our queue of opening groups.
  428. if (kind.is_opening_symbol()) {
  429. open_groups_.push_back(token);
  430. return token;
  431. }
  432. // Only closing symbols need further special handling.
  433. if (!kind.is_closing_symbol()) {
  434. return token;
  435. }
  436. TokenInfo& closing_token_info = buffer_->GetTokenInfo(token);
  437. // Check that there is a matching opening symbol before we consume this as
  438. // a closing symbol.
  439. if (open_groups_.empty()) {
  440. closing_token_info.kind = TokenKind::Error;
  441. closing_token_info.error_length = kind.fixed_spelling().size();
  442. CARBON_DIAGNOSTIC(
  443. UnmatchedClosing, Error,
  444. "Closing symbol without a corresponding opening symbol.");
  445. emitter_.Emit(location, UnmatchedClosing);
  446. // Note that this still returns true as we do consume a symbol.
  447. return token;
  448. }
  449. // Finally can handle a normal closing symbol.
  450. Token opening_token = open_groups_.pop_back_val();
  451. TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
  452. opening_token_info.closing_token = token;
  453. closing_token_info.opening_token = opening_token;
  454. return token;
  455. }
  456. // Given a word that has already been lexed, determine whether it is a type
  457. // literal and if so form the corresponding token.
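//
// For example (illustrative only), `i32`, `u64`, and `f64` form type literal
// tokens, while `i0` (no valid initial digit) and `id` (no digit suffix) do
// not.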
  458. auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
  459. -> LexResult {
  460. if (word.size() < 2) {
  461. // Too short to form one of these tokens.
  462. return LexResult::NoMatch();
  463. }
  464. if (word[1] < '1' || word[1] > '9') {
  465. // Doesn't start with a valid initial digit.
  466. return LexResult::NoMatch();
  467. }
  468. std::optional<TokenKind> kind;
  469. switch (word.front()) {
  470. case 'i':
  471. kind = TokenKind::IntegerTypeLiteral;
  472. break;
  473. case 'u':
  474. kind = TokenKind::UnsignedIntegerTypeLiteral;
  475. break;
  476. case 'f':
  477. kind = TokenKind::FloatingPointTypeLiteral;
  478. break;
  479. default:
  480. return LexResult::NoMatch();
  481. };
  482. llvm::StringRef suffix = word.substr(1);
  483. if (!CanLexInteger(emitter_, suffix)) {
  484. return buffer_->AddToken(
  485. {.kind = TokenKind::Error,
  486. .token_line = current_line_,
  487. .column = column,
  488. .error_length = static_cast<int32_t>(word.size())});
  489. }
  490. llvm::APInt suffix_value;
  491. if (suffix.getAsInteger(10, suffix_value)) {
  492. return LexResult::NoMatch();
  493. }
  494. auto token = buffer_->AddToken(
  495. {.kind = *kind, .token_line = current_line_, .column = column});
  496. buffer_->GetTokenInfo(token).literal_index =
  497. buffer_->literal_int_storage_.size();
  498. buffer_->literal_int_storage_.push_back(std::move(suffix_value));
  499. return token;
  500. }
  501. // Closes all open groups that cannot remain open across the symbol `kind`.
  502. // Users may pass `Error` to close all open groups.
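//
// For example (illustrative only), when lexing `(x { y )`, reaching the `)`
// first pops the pending `{` group: a mismatched-closing diagnostic is
// emitted and a recovery `}` token is inserted, and only then is the `)`
// matched with the `(`.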
  503. auto CloseInvalidOpenGroups(TokenKind kind) -> void {
  504. // There are two common cases that result in nothing to close. Short circuit
  505. // those here.
  506. if ((!kind.is_closing_symbol() && kind != TokenKind::Error) ||
  507. open_groups_.empty()) {
  508. return;
  509. }
  510. // Also check the innermost open group token to see if it matches this closing
  511. // token, in which case there is nothing to do. This is redundant with the
  512. // work inside the main loop, but we peel it out to allow inlining.
  513. Token opening_token = open_groups_.back();
  514. TokenKind opening_kind = buffer_->GetTokenInfo(opening_token).kind;
  515. if (kind == opening_kind.closing_symbol()) {
  516. return;
  517. }
  518. // Otherwise, delegate to a separate function to help with inlining.
  519. CloseInvalidOpenGroupsSlow(kind);
  520. }
  521. [[gnu::noinline]] auto CloseInvalidOpenGroupsSlow(TokenKind kind) -> void {
  522. CARBON_CHECK(kind.is_closing_symbol() || kind == TokenKind::Error);
  523. CARBON_CHECK(!open_groups_.empty());
  524. do {
  525. Token opening_token = open_groups_.back();
  526. TokenKind opening_kind = buffer_->GetTokenInfo(opening_token).kind;
  527. if (kind == opening_kind.closing_symbol()) {
  528. return;
  529. }
  530. open_groups_.pop_back();
  531. CARBON_DIAGNOSTIC(
  532. MismatchedClosing, Error,
  533. "Closing symbol does not match most recent opening symbol.");
  534. token_emitter_.Emit(opening_token, MismatchedClosing);
  535. CARBON_CHECK(!buffer_->tokens().empty())
  536. << "Must have a prior opening token!";
  537. Token prev_token = buffer_->tokens().end()[-1];
  538. // TODO: do a smarter backwards scan for where to put the closing
  539. // token.
  540. Token closing_token = buffer_->AddToken(
  541. {.kind = opening_kind.closing_symbol(),
  542. .has_trailing_space = buffer_->HasTrailingWhitespace(prev_token),
  543. .is_recovery = true,
  544. .token_line = current_line_,
  545. .column = current_column_});
  546. TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
  547. TokenInfo& closing_token_info = buffer_->GetTokenInfo(closing_token);
  548. opening_token_info.closing_token = closing_token;
  549. closing_token_info.opening_token = opening_token;
  550. } while (!open_groups_.empty());
  551. }
  552. auto GetOrCreateIdentifier(llvm::StringRef text) -> Identifier {
  553. auto insert_result = buffer_->identifier_map_.insert(
  554. {text, Identifier(buffer_->identifier_infos_.size())});
  555. if (insert_result.second) {
  556. buffer_->identifier_infos_.push_back({text});
  557. }
  558. return insert_result.first->second;
  559. }
  560. auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
  561. if (static_cast<unsigned char>(source_text.front()) > 0x7F) {
  562. // TODO: Need to add support for Unicode lexing.
  563. return LexError(source_text);
  564. }
  565. CARBON_CHECK(IsAlpha(source_text.front()) || source_text.front() == '_');
  566. if (!set_indent_) {
  567. current_line_info_->indent = current_column_;
  568. set_indent_ = true;
  569. }
  570. // Take the valid characters off the front of the source buffer.
  571. llvm::StringRef identifier_text = ScanForIdentifierPrefix(source_text);
  572. CARBON_CHECK(!identifier_text.empty())
  573. << "Must have at least one character!";
  574. int identifier_column = current_column_;
  575. current_column_ += identifier_text.size();
  576. source_text = source_text.drop_front(identifier_text.size());
  577. // Check if the text is a type literal, and if so form such a literal.
  578. if (LexResult result =
  579. LexWordAsTypeLiteralToken(identifier_text, identifier_column)) {
  580. return result;
  581. }
  582. // Check if the text matches a keyword token, and if so use that.
  583. TokenKind kind = llvm::StringSwitch<TokenKind>(identifier_text)
  584. #define CARBON_KEYWORD_TOKEN(Name, Spelling) .Case(Spelling, TokenKind::Name)
  585. #include "toolchain/lex/token_kind.def"
  586. .Default(TokenKind::Error);
  587. if (kind != TokenKind::Error) {
  588. return buffer_->AddToken({.kind = kind,
  589. .token_line = current_line_,
  590. .column = identifier_column});
  591. }
  592. // Otherwise we have a generic identifier.
  593. return buffer_->AddToken({.kind = TokenKind::Identifier,
  594. .token_line = current_line_,
  595. .column = identifier_column,
  596. .id = GetOrCreateIdentifier(identifier_text)});
  597. }
  598. auto LexError(llvm::StringRef& source_text) -> LexResult {
  599. llvm::StringRef error_text = source_text.take_while([](char c) {
  600. if (IsAlnum(c)) {
  601. return false;
  602. }
  603. switch (c) {
  604. case '_':
  605. case '\t':
  606. case '\n':
  607. return false;
  608. default:
  609. break;
  610. }
  611. return llvm::StringSwitch<bool>(llvm::StringRef(&c, 1))
  612. #define CARBON_SYMBOL_TOKEN(Name, Spelling) .StartsWith(Spelling, false)
  613. #include "toolchain/lex/token_kind.def"
  614. .Default(true);
  615. });
  616. if (error_text.empty()) {
  617. // TODO: Reimplement this to use the lexer properly. In the meantime,
  618. // guarantee that we eat at least one byte.
  619. error_text = source_text.take_front(1);
  620. }
  621. auto token = buffer_->AddToken(
  622. {.kind = TokenKind::Error,
  623. .token_line = current_line_,
  624. .column = current_column_,
  625. .error_length = static_cast<int32_t>(error_text.size())});
  626. CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
  627. "Encountered unrecognized characters while parsing.");
  628. emitter_.Emit(error_text.begin(), UnrecognizedCharacters);
  629. current_column_ += error_text.size();
  630. source_text = source_text.drop_front(error_text.size());
  631. return token;
  632. }
  633. auto LexStartOfFile(llvm::StringRef& /*source_text*/) -> void {
  634. // Before lexing any source text, add the start-of-file token so that code
  635. // can assume a non-empty token buffer for the rest of lexing. Note that the
  636. // start-of-file always has trailing space because it *is* whitespace.
  637. buffer_->AddToken({.kind = TokenKind::StartOfFile,
  638. .has_trailing_space = true,
  639. .token_line = current_line_,
  640. .column = current_column_});
  641. }
  642. auto LexEndOfFile(llvm::StringRef& source_text) -> void {
  643. CARBON_DCHECK(source_text.empty());
  644. // The end-of-file token is always considered to be whitespace.
  645. NoteWhitespace();
  646. // Update the line length as this is also the end of a line.
  647. current_line_info_->length = current_column_;
  648. // Close any open groups. We do this after marking whitespace so that the
  649. // whitespace marking is preserved.
  650. CloseInvalidOpenGroups(TokenKind::Error);
  651. buffer_->AddToken({.kind = TokenKind::EndOfFile,
  652. .token_line = current_line_,
  653. .column = current_column_});
  654. }
  655. // We use a collection of static member functions for table-based dispatch to
  656. // lexer methods. These are named static member functions so that they show up
  657. // helpfully in profiles and backtraces, but they tend to not contain the
  658. // interesting logic and simply delegate to the relevant methods. All of their
  659. // signatures need to be exactly the same however in order to ensure we can
  660. // build efficient dispatch tables out of them. All of them end by doing a
  661. // must-tail return call to this routine. It handles continuing the dispatch
  662. // chain.
  663. static auto DispatchNext(Lexer& lexer, llvm::StringRef& source_text) -> void {
  664. // When we finish the source text, stop recursing. We also hint this so that
  665. // the tail-dispatch is optimized as that's essentially the loop back-edge
  666. // and this is the loop exit.
  667. if (LLVM_UNLIKELY(source_text.empty())) {
  668. lexer.LexEndOfFile(source_text);
  669. return;
  670. }
  671. // The common case is to tail recurse based on the next character. Note that
  672. // because this is a must-tail return, this cannot fail to tail-call and
  673. // will not grow the stack. This is in essence a loop with dynamic tail
  674. // dispatch to the next stage of the loop.
  675. [[clang::musttail]] return DispatchTable[static_cast<unsigned char>(
  676. source_text.front())](lexer, source_text);
  677. }
  678. // Define a set of dispatch functions that simply forward to a method that
  679. // lexes a token. This includes validating that an actual token was produced,
  680. // and continuing the dispatch.
  681. #define CARBON_DISPATCH_LEX_TOKEN(LexMethod) \
  682. static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef& source_text) \
  683. ->void { \
  684. LexResult result = lexer.LexMethod(source_text); \
  685. CARBON_CHECK(result) << "Failed to form a token!"; \
  686. [[clang::musttail]] return DispatchNext(lexer, source_text); \
  687. }
  688. CARBON_DISPATCH_LEX_TOKEN(LexError)
  689. CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
  690. CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
  691. CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
  692. CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
  693. // A custom dispatch function that pre-selects a symbol token to lex.
  694. template <const TokenKind& Token>
  695. static auto DispatchLexOneCharSymbol(Lexer& lexer,
  696. llvm::StringRef& source_text) -> void {
  697. LexResult result = lexer.LexSymbolToken(source_text, Token);
  698. CARBON_CHECK(result) << "Failed to form a token!";
  699. [[clang::musttail]] return DispatchNext(lexer, source_text);
  700. }
  701. // Define a set of non-token dispatch functions that handle things like
  702. // whitespace and comments.
  703. #define CARBON_DISPATCH_LEX_NON_TOKEN(LexMethod) \
  704. static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef& source_text) \
  705. ->void { \
  706. lexer.LexMethod(source_text); \
  707. [[clang::musttail]] return DispatchNext(lexer, source_text); \
  708. }
  709. CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
  710. CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
  711. CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
  712. // The main entry point for dispatching through the lexer's table. This method
  713. // should always fully consume the source text.
  714. auto Dispatch(llvm::StringRef& source_text) -> void {
  715. LexStartOfFile(source_text);
  716. // Manually enter the dispatch loop. This call will tail-recurse through the
  717. // dispatch table until everything from source_text is consumed.
  718. DispatchNext(*this, source_text);
  719. CARBON_CHECK(source_text.empty())
  720. << "Finished lexer dispatch without consuming the entire source text!";
  721. }
  722. private:
  723. using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef& source_text)
  724. -> void;
  725. using DispatchTableT = std::array<DispatchFunctionT*, 256>;
  726. // Build a table of function pointers that we can use to dispatch to the
  727. // correct lexer routine based on the first byte of source text.
  728. //
  729. // While it is tempting to simply use a `switch` on the first byte and
  730. // dispatch with cases into this, in practice that doesn't produce great code.
  731. // There seem to be two issues that are the root cause.
  732. //
  733. // First, there are lots of different values of bytes that dispatch to a
  734. // fairly small set of routines, and then some byte values that dispatch
  735. // differently for each byte. This pattern isn't one that the compiler-based
  736. // lowering of switches works well with -- it tries to balance all the cases,
  737. // and in doing so emits several compares and other control flow rather than a
  738. // simple jump table.
  739. //
  740. // Second, with a `case`, it isn't as obvious how to create a single, uniform
  741. // interface that is effective for *every* byte value, and thus makes for a
  742. // single consistent table-based dispatch. By forcing these to be function
  743. // pointers, we also coerce the code to use a strictly homogeneous structure
  744. // that can form a single dispatch table.
  745. //
  746. // These two actually interact -- the second issue is part of what makes the
  747. // non-table lowering in the first one desirable for many switches and cases.
  748. //
  749. // Ultimately, when table-based dispatch is such an important technique, we
  750. // get better results by taking full control and manually creating the
  751. // dispatch structures.
  752. //
  753. // The functions in this table also use tail-recursion to implement the loop
  754. // of the lexer. This is based on the technique described more fully for any
  755. // kind of byte-stream loop structure here:
  756. // https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
  757. constexpr static auto MakeDispatchTable() -> DispatchTableT {
  758. DispatchTableT table = {};
  759. // First set the table entries to dispatch to our error token handler as the
  760. // base case. Everything valid comes from an override below.
  761. for (int i = 0; i < 256; ++i) {
  762. table[i] = &DispatchLexError;
  763. }
  764. // Symbols have some special dispatching. First, set the first character of
  765. // each symbol token spelling to dispatch to the symbol lexer. We don't
  766. // provide a pre-computed token here, so the symbol lexer will compute the
  767. // exact symbol token kind. We'll override this with more specific dispatch
  768. // below.
  769. #define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
  770. table[(Spelling)[0]] = &DispatchLexSymbolToken;
  771. #include "toolchain/lex/token_kind.def"
  772. // Now special-case single-character symbols that are guaranteed not to
  773. // join with another symbol. These are grouping symbols, terminators,
  774. // or separators in the grammar and have a good reason to be
  775. // orthogonal to any other punctuation. We do this separately because this
  776. // needs to override some of the generic handling above, and provide a
  777. // custom token.
  778. #define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
  779. table[(Spelling)[0]] = &DispatchLexOneCharSymbol<TokenKind::TokenName>;
  780. #include "toolchain/lex/token_kind.def"
  781. // Override the handling for `/` to consider comments as well as a `/`
  782. // symbol.
  783. table['/'] = &DispatchLexCommentOrSlash;
  784. table['_'] = &DispatchLexKeywordOrIdentifier;
  785. // Note that we don't use `llvm::seq` because this needs to be `constexpr`
  786. // evaluated.
  787. for (unsigned char c = 'a'; c <= 'z'; ++c) {
  788. table[c] = &DispatchLexKeywordOrIdentifier;
  789. }
  790. for (unsigned char c = 'A'; c <= 'Z'; ++c) {
  791. table[c] = &DispatchLexKeywordOrIdentifier;
  792. }
  793. // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
  794. // as whitespace characters should already have been skipped and the
  795. // only remaining valid Unicode characters would be part of an
  796. // identifier. That code can either accept or reject.
  797. for (int i = 0x80; i < 0x100; ++i) {
  798. table[i] = &DispatchLexKeywordOrIdentifier;
  799. }
  800. for (unsigned char c = '0'; c <= '9'; ++c) {
  801. table[c] = &DispatchLexNumericLiteral;
  802. }
  803. table['\''] = &DispatchLexStringLiteral;
  804. table['"'] = &DispatchLexStringLiteral;
  805. table['#'] = &DispatchLexStringLiteral;
  806. table[' '] = &DispatchLexHorizontalWhitespace;
  807. table['\t'] = &DispatchLexHorizontalWhitespace;
  808. table['\n'] = &DispatchLexVerticalWhitespace;
  809. return table;
  810. };
  811. static const DispatchTableT DispatchTable;
  812. TokenizedBuffer* buffer_;
  813. SourceBufferLocationTranslator translator_;
  814. LexerDiagnosticEmitter emitter_;
  815. TokenLocationTranslator token_translator_;
  816. TokenDiagnosticEmitter token_emitter_;
  817. Line current_line_;
  818. LineInfo* current_line_info_;
  819. int current_column_ = 0;
  820. bool set_indent_ = false;
  821. llvm::SmallVector<Token> open_groups_;
  822. };
  823. constexpr TokenizedBuffer::Lexer::DispatchTableT
  824. TokenizedBuffer::Lexer::DispatchTable = MakeDispatchTable();
  825. auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
  826. -> TokenizedBuffer {
  827. TokenizedBuffer buffer(source);
  828. ErrorTrackingDiagnosticConsumer error_tracking_consumer(consumer);
  829. Lexer lexer(buffer, error_tracking_consumer);
  830. llvm::StringRef source_text = source.text();
  831. lexer.Dispatch(source_text);
  832. if (error_tracking_consumer.seen_error()) {
  833. buffer.has_errors_ = true;
  834. }
  835. return buffer;
  836. }
  837. auto TokenizedBuffer::GetKind(Token token) const -> TokenKind {
  838. return GetTokenInfo(token).kind;
  839. }
  840. auto TokenizedBuffer::GetLine(Token token) const -> Line {
  841. return GetTokenInfo(token).token_line;
  842. }
  843. auto TokenizedBuffer::GetLineNumber(Token token) const -> int {
  844. return GetLineNumber(GetLine(token));
  845. }
  846. auto TokenizedBuffer::GetColumnNumber(Token token) const -> int {
  847. return GetTokenInfo(token).column + 1;
  848. }
  849. auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
  850. const auto& token_info = GetTokenInfo(token);
  851. llvm::StringRef fixed_spelling = token_info.kind.fixed_spelling();
  852. if (!fixed_spelling.empty()) {
  853. return fixed_spelling;
  854. }
  855. if (token_info.kind == TokenKind::Error) {
  856. const auto& line_info = GetLineInfo(token_info.token_line);
  857. int64_t token_start = line_info.start + token_info.column;
  858. return source_->text().substr(token_start, token_info.error_length);
  859. }
  860. // Refer back to the source text to preserve oddities like radix or digit
  861. // separators the author included.
  862. if (token_info.kind == TokenKind::IntegerLiteral ||
  863. token_info.kind == TokenKind::RealLiteral) {
  864. const auto& line_info = GetLineInfo(token_info.token_line);
  865. int64_t token_start = line_info.start + token_info.column;
  866. std::optional<NumericLiteral> relexed_token =
  867. NumericLiteral::Lex(source_->text().substr(token_start));
  868. CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
  869. return relexed_token->text();
  870. }
  871. // Refer back to the source text to find the original spelling, including
  872. // escape sequences etc.
  873. if (token_info.kind == TokenKind::StringLiteral) {
  874. const auto& line_info = GetLineInfo(token_info.token_line);
  875. int64_t token_start = line_info.start + token_info.column;
  876. std::optional<StringLiteral> relexed_token =
  877. StringLiteral::Lex(source_->text().substr(token_start));
  878. CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
  879. return relexed_token->text();
  880. }
  881. // Refer back to the source text to avoid needing to reconstruct the
  882. // spelling from the size.
  883. if (token_info.kind.is_sized_type_literal()) {
  884. const auto& line_info = GetLineInfo(token_info.token_line);
  885. int64_t token_start = line_info.start + token_info.column;
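// Scan the digits that follow the leading `i`/`u`/`f` character, then widen
// the result by one character so the returned text includes that leading
// character.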
  886. llvm::StringRef suffix =
  887. source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
  888. return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  889. }
  890. if (token_info.kind == TokenKind::StartOfFile ||
  891. token_info.kind == TokenKind::EndOfFile) {
  892. return llvm::StringRef();
  893. }
  894. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  895. return GetIdentifierText(token_info.id);
  896. }
  897. auto TokenizedBuffer::GetIdentifier(Token token) const -> Identifier {
  898. const auto& token_info = GetTokenInfo(token);
  899. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  900. return token_info.id;
  901. }
  902. auto TokenizedBuffer::GetIntegerLiteral(Token token) const
  903. -> const llvm::APInt& {
  904. const auto& token_info = GetTokenInfo(token);
  905. CARBON_CHECK(token_info.kind == TokenKind::IntegerLiteral) << token_info.kind;
  906. return literal_int_storage_[token_info.literal_index];
  907. }
  908. auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
  909. const auto& token_info = GetTokenInfo(token);
  910. CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
  911. // Note that every real literal is at least three characters long, so we can
  912. // safely look at the second character to determine whether we have a
  913. // decimal or hexadecimal literal.
  914. const auto& line_info = GetLineInfo(token_info.token_line);
  915. int64_t token_start = line_info.start + token_info.column;
  916. char second_char = source_->text()[token_start + 1];
  917. bool is_decimal = second_char != 'x' && second_char != 'b';
  918. return {.mantissa = literal_int_storage_[token_info.literal_index],
  919. .exponent = literal_int_storage_[token_info.literal_index + 1],
  920. .is_decimal = is_decimal};
  921. }
  922. auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
  923. const auto& token_info = GetTokenInfo(token);
  924. CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
  925. return literal_string_storage_[token_info.literal_index];
  926. }
  927. auto TokenizedBuffer::GetTypeLiteralSize(Token token) const
  928. -> const llvm::APInt& {
  929. const auto& token_info = GetTokenInfo(token);
  930. CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
  931. return literal_int_storage_[token_info.literal_index];
  932. }
  933. auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
  934. -> Token {
  935. const auto& opening_token_info = GetTokenInfo(opening_token);
  936. CARBON_CHECK(opening_token_info.kind.is_opening_symbol())
  937. << opening_token_info.kind;
  938. return opening_token_info.closing_token;
  939. }
  940. auto TokenizedBuffer::GetMatchedOpeningToken(Token closing_token) const
  941. -> Token {
  942. const auto& closing_token_info = GetTokenInfo(closing_token);
  943. CARBON_CHECK(closing_token_info.kind.is_closing_symbol())
  944. << closing_token_info.kind;
  945. return closing_token_info.opening_token;
  946. }
  947. auto TokenizedBuffer::HasLeadingWhitespace(Token token) const -> bool {
  948. auto it = TokenIterator(token);
  949. return it == tokens().begin() || GetTokenInfo(*(it - 1)).has_trailing_space;
  950. }
  951. auto TokenizedBuffer::HasTrailingWhitespace(Token token) const -> bool {
  952. return GetTokenInfo(token).has_trailing_space;
  953. }
  954. auto TokenizedBuffer::IsRecoveryToken(Token token) const -> bool {
  955. return GetTokenInfo(token).is_recovery;
  956. }
  957. auto TokenizedBuffer::GetLineNumber(Line line) const -> int {
  958. return line.index + 1;
  959. }
  960. auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
  961. return GetLineInfo(line).indent + 1;
  962. }
  963. auto TokenizedBuffer::GetIdentifierText(Identifier identifier) const
  964. -> llvm::StringRef {
  965. return identifier_infos_[identifier.index].text;
  966. }
  967. auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  968. index = std::max(widths.index, index);
  969. kind = std::max(widths.kind, kind);
  970. column = std::max(widths.column, column);
  971. line = std::max(widths.line, line);
  972. indent = std::max(widths.indent, indent);
  973. }
  974. // Compute the printed width of a number. When numbers are printed in decimal,
  975. // the number of digits needed is one more than the log-base-10 of the
  976. // value. We handle a value of `zero` explicitly.
  977. //
  978. // This routine requires its argument to be *non-negative*.
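//
// For example, 0 prints with 1 digit, 999 with 3 digits, and 1000 with 4.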
  979. static auto ComputeDecimalPrintedWidth(int number) -> int {
  980. CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
  981. if (number == 0) {
  982. return 1;
  983. }
  984. return static_cast<int>(std::log10(number)) + 1;
  985. }
  986. auto TokenizedBuffer::GetTokenPrintWidths(Token token) const -> PrintWidths {
  987. PrintWidths widths = {};
  988. widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  989. widths.kind = GetKind(token).name().size();
  990. widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  991. widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  992. widths.indent =
  993. ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  994. return widths;
  995. }
  996. auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
  997. if (tokens().begin() == tokens().end()) {
  998. return;
  999. }
  1000. output_stream << "- filename: " << source_->filename() << "\n"
  1001. << " tokens: [\n";
  1002. PrintWidths widths = {};
  1003. widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
  1004. for (Token token : tokens()) {
  1005. widths.Widen(GetTokenPrintWidths(token));
  1006. }
  1007. for (Token token : tokens()) {
  1008. PrintToken(output_stream, token, widths);
  1009. output_stream << "\n";
  1010. }
  1011. output_stream << " ]\n";
  1012. }
  1013. auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
  1014. Token token) const -> void {
  1015. PrintToken(output_stream, token, {});
  1016. }
  1017. auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
  1018. PrintWidths widths) const -> void {
  1019. widths.Widen(GetTokenPrintWidths(token));
  1020. int token_index = token.index;
  1021. const auto& token_info = GetTokenInfo(token);
  1022. llvm::StringRef token_text = GetTokenText(token);
  1023. // Output the main chunk using one format string. We have to do the
  1024. // justification manually in order to use the dynamically computed widths
  1025. // and get the quotes included.
  1026. output_stream << llvm::formatv(
  1027. " { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
  1028. "spelling: '{5}'",
  1029. llvm::format_decimal(token_index, widths.index),
  1030. llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
  1031. widths.kind + 2),
  1032. llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
  1033. llvm::format_decimal(GetColumnNumber(token), widths.column),
  1034. llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
  1035. widths.indent),
  1036. token_text);
  1037. switch (token_info.kind) {
  1038. case TokenKind::Identifier:
  1039. output_stream << ", identifier: " << GetIdentifier(token).index;
  1040. break;
  1041. case TokenKind::IntegerLiteral:
  1042. output_stream << ", value: `";
  1043. GetIntegerLiteral(token).print(output_stream, /*isSigned=*/false);
  1044. output_stream << "`";
  1045. break;
  1046. case TokenKind::RealLiteral:
  1047. output_stream << ", value: `" << GetRealLiteral(token) << "`";
  1048. break;
  1049. case TokenKind::StringLiteral:
  1050. output_stream << ", value: `" << GetStringLiteral(token) << "`";
  1051. break;
  1052. default:
  1053. if (token_info.kind.is_opening_symbol()) {
  1054. output_stream << ", closing_token: "
  1055. << GetMatchedClosingToken(token).index;
  1056. } else if (token_info.kind.is_closing_symbol()) {
  1057. output_stream << ", opening_token: "
  1058. << GetMatchedOpeningToken(token).index;
  1059. }
  1060. break;
  1061. }
  1062. if (token_info.has_trailing_space) {
  1063. output_stream << ", has_trailing_space: true";
  1064. }
  1065. if (token_info.is_recovery) {
  1066. output_stream << ", recovery: true";
  1067. }
  1068. output_stream << " },";
  1069. }
  1070. auto TokenizedBuffer::GetLineInfo(Line line) -> LineInfo& {
  1071. return line_infos_[line.index];
  1072. }
  1073. auto TokenizedBuffer::GetLineInfo(Line line) const -> const LineInfo& {
  1074. return line_infos_[line.index];
  1075. }
  1076. auto TokenizedBuffer::AddLine(LineInfo info) -> Line {
  1077. line_infos_.push_back(info);
  1078. return Line(static_cast<int>(line_infos_.size()) - 1);
  1079. }
  1080. auto TokenizedBuffer::GetTokenInfo(Token token) -> TokenInfo& {
  1081. return token_infos_[token.index];
  1082. }
  1083. auto TokenizedBuffer::GetTokenInfo(Token token) const -> const TokenInfo& {
  1084. return token_infos_[token.index];
  1085. }
  1086. auto TokenizedBuffer::AddToken(TokenInfo info) -> Token {
  1087. token_infos_.push_back(info);
  1088. expected_parse_tree_size_ += info.kind.expected_parse_tree_size();
  1089. return Token(static_cast<int>(token_infos_.size()) - 1);
  1090. }
  1091. auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
  1092. output << token_.index;
  1093. }
  1094. auto TokenizedBuffer::SourceBufferLocationTranslator::GetLocation(
  1095. const char* loc) -> DiagnosticLocation {
  1096. CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
  1097. << "location not within buffer";
  1098. int64_t offset = loc - buffer_->source_->text().begin();
  1099. // Find the first line starting after the given location. Note that we can't
  1100. // inspect `line.length` here because it is not necessarily correct for the
  1101. // final line during lexing (but will be correct later for the parse tree).
  1102. const auto* line_it = std::partition_point(
  1103. buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
  1104. [offset](const LineInfo& line) { return line.start <= offset; });
  1105. // Step back one line to find the line containing the given position.
  1106. CARBON_CHECK(line_it != buffer_->line_infos_.begin())
  1107. << "location precedes the start of the first line";
  1108. --line_it;
  1109. int line_number = line_it - buffer_->line_infos_.begin();
  1110. int column_number = offset - line_it->start;
  1111. // Start by grabbing the line from the buffer. If the line isn't fully lexed,
  1112. // the length will be npos and the line will be grabbed from the known start
  1113. // to the end of the buffer; we'll then adjust the length.
  1114. llvm::StringRef line =
  1115. buffer_->source_->text().substr(line_it->start, line_it->length);
  1116. if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
  1117. CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
  1118. << "Currently we assume no unlexed newlines prior to the error column, "
  1119. "but there was one when erroring at "
  1120. << buffer_->source_->filename() << ":" << line_number << ":"
  1121. << column_number;
  1122. // Look for the next newline since we don't know the length. We can start at
  1123. // the column because prior newlines will have been lexed.
  1124. auto end_newline_pos = line.find('\n', column_number);
  1125. if (end_newline_pos != llvm::StringRef::npos) {
  1126. line = line.take_front(end_newline_pos);
  1127. }
  1128. }
  1129. return {.file_name = buffer_->source_->filename(),
  1130. .line = line,
  1131. .line_number = line_number + 1,
  1132. .column_number = column_number + 1};
  1133. }
  1134. auto TokenLocationTranslator::GetLocation(Token token) -> DiagnosticLocation {
  1135. // Map the token location into a position within the source buffer.
  1136. const auto& token_info = buffer_->GetTokenInfo(token);
  1137. const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
  1138. const char* token_start =
  1139. buffer_->source_->text().begin() + line_info.start + token_info.column;
  1140. // Find the corresponding file location.
  1141. // TODO: Should we somehow indicate in the diagnostic location if this token
  1142. // is a recovery token that doesn't correspond to the original source?
  1143. return TokenizedBuffer::SourceBufferLocationTranslator(buffer_).GetLocation(
  1144. token_start);
  1145. }
  1146. } // namespace Carbon::Lex