// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_

#include <cstdint>

#include "common/check.h"
#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/shared_value_stores.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"

namespace Carbon::Lex {

class TokenizedBuffer;

// A lightweight handle to a lexed line in a `TokenizedBuffer`.
//
// `LineIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `LineIndex` object refers to a specific line in the source code that
// was lexed. They can be compared directly to establish that they refer to the
// same line or the relative position of different lines within the source.
//
// All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
struct LineIndex : public IndexBase<LineIndex> {
  static constexpr llvm::StringLiteral Label = "line";
  static const LineIndex None;

  using IndexBase::IndexBase;
};

constexpr LineIndex LineIndex::None(NoneIndex);

// Indices for comments within the buffer.
struct CommentIndex : public IndexBase<CommentIndex> {
  static constexpr llvm::StringLiteral Label = "comment";
  static const CommentIndex None;

  using IndexBase::IndexBase;
};

constexpr CommentIndex CommentIndex::None(NoneIndex);

// Random-access iterator over comments within the buffer.
using CommentIterator = IndexIterator<CommentIndex>;

// Random-access iterator over tokens within the buffer.
using TokenIterator = IndexIterator<TokenIndex>;

// A token range which is inclusive of both the begin and end tokens.
struct InclusiveTokenRange {
  TokenIndex begin;
  TokenIndex end;
};

// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterators to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `has_errors()` returning true.
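//
// A minimal usage sketch (hypothetical; the exact signature of the `Lex::Lex`
// entry point referenced below may differ):
//
//   TokenizedBuffer buffer = Lex::Lex(value_stores, source, consumer);
//   for (TokenIndex token : buffer.tokens()) {
//     buffer.PrintToken(llvm::errs(), token);
//   }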
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // A comment, which can be a block of lines.
  //
  // This is the API version of `CommentData`.
  struct CommentInfo {
    // The comment's full text, including `//` symbols. This may span several
    // lines for block comments.
    llvm::StringRef text;

    // The comment's indent.
    int32_t indent;

    // The first line of the comment.
    LineIndex start_line;
  };

  // Returns the kind of the given token.
  auto GetKind(TokenIndex token) const -> TokenKind;

  // Returns the line on which the given token begins.
  auto GetLine(TokenIndex token) const -> LineIndex;

  // Returns the 1-based line number.
  auto GetLineNumber(TokenIndex token) const -> int;

  // Returns the 1-based column number.
  auto GetColumnNumber(TokenIndex token) const -> int;

  // Returns the line and 1-based column number of the first character after
  // this token.
  auto GetEndLoc(TokenIndex token) const -> std::pair<LineIndex, int>;

  // Returns the source text lexed into this token.
  auto GetTokenText(TokenIndex token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  auto GetIdentifier(TokenIndex token) const -> IdentifierId;

  // Returns the value of an `IntLiteral()` token.
  auto GetIntLiteral(TokenIndex token) const -> IntId;

  // Returns the value of a `RealLiteral()` token.
  auto GetRealLiteral(TokenIndex token) const -> RealId;

  // Returns the value of a `StringLiteral()` token.
  auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;

  // Returns the size specified in a `*TypeLiteral()` token.
  auto GetTypeLiteralSize(TokenIndex token) const -> IntId;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;
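
  // As an illustrative sketch (not part of this API), the matched-token
  // queries let a token walk skip over an entire bracketed region in a single
  // step:
  //
  //   if (buffer.GetKind(token).is_opening_symbol()) {
  //     token = buffer.GetMatchedClosingToken(token);
  //   }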

  // Returns whether the given token has leading whitespace.
  auto HasLeadingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the given token has trailing whitespace.
  auto HasTrailingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the token was created as part of an error recovery effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  auto IsRecoveryToken(TokenIndex token) const -> bool;

  // Returns the 1-based indentation column number.
  auto GetIndentColumnNumber(LineIndex line) const -> int;

  // Returns the next line handle.
  auto GetNextLine(LineIndex line) const -> LineIndex;

  // Returns the previous line handle.
  auto GetPrevLine(LineIndex line) const -> LineIndex;

  // Returns the token's zero-based byte offset within the source text.
  auto GetByteOffset(TokenIndex token) const -> int32_t {
    return GetTokenInfo(token).byte_offset();
  }

  // Returns true if the token comes after the comment.
  auto IsAfterComment(TokenIndex token, CommentIndex comment_index) const
      -> bool;

  // Returns the comment's full text.
  auto GetCommentText(CommentIndex comment_index) const -> llvm::StringRef;

  // Prints the tokens as YAML, with the tracked information for each token on
  // a single line. The single-line format keeps the output compact and easy
  // to process with tools like `grep`.
  //
  // An example token looks like:
  //
  // - { index: 1, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  auto Print(llvm::raw_ostream& out,
             bool omit_file_boundary_tokens = false) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
      -> void;

  // Collects memory usage of members.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

  // Converts a token to a diagnostic location.
  auto TokenToDiagnosticLoc(TokenIndex token) const
      -> Diagnostics::ConvertedLoc;

  // Returns true if the given range overlaps with an entry in
  // `dump_sem_ir_ranges_`. Must not be called when there are no ranges; query
  // `has_dump_sem_ir_ranges()` first.
  auto OverlapsWithDumpSemIRRange(Lex::InclusiveTokenRange range) const -> bool;
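
  // Because both ends of an `InclusiveTokenRange` are inclusive, a plausible
  // overlap test between the query's `range` and a stored range `r` (a
  // sketch; the actual implementation lives in the `.cpp` file) is:
  //
  //   range.begin <= r.end && r.begin <= range.end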

  // Returns true if the buffer has errors that were detected at lexing time.
  auto has_errors() const -> bool { return has_errors_; }

  // Returns an iterator range over all tokens in the buffer.
  auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(TokenIndex(0)),
                            TokenIterator(TokenIndex(token_infos_.size())));
  }

  // Returns the number of tokens.
  auto size() const -> int { return token_infos_.size(); }

  // Returns an iterator range over all comments in the buffer.
  auto comments() const -> llvm::iterator_range<CommentIterator> {
    return llvm::make_range(CommentIterator(CommentIndex(0)),
                            CommentIterator(CommentIndex(comments_.size())));
  }
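
  // A hypothetical sketch of walking every comment via this range:
  //
  //   for (CommentIndex comment : buffer.comments()) {
  //     llvm::errs() << buffer.GetCommentText(comment) << "\n";
  //   }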

  // Returns the number of comments.
  auto comments_size() const -> size_t { return comments_.size(); }

  // Returns true if any `DumpSemIRRange`s were provided.
  auto has_dump_sem_ir_ranges() const -> bool {
    return !dump_sem_ir_ranges_.empty();
  }

  // Returns an upper bound on the number of output parse nodes in the absence
  // of errors.
  auto expected_max_parse_tree_size() const -> int {
    return expected_max_parse_tree_size_;
  }

  // Returns the source buffer that was lexed.
  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;

  // A diagnostic emitter that uses a pointer into the source text as its
  // location type.
  class SourcePointerDiagnosticEmitter
      : public Diagnostics::Emitter<const char*> {
   public:
    explicit SourcePointerDiagnosticEmitter(Diagnostics::Consumer* consumer,
                                            const TokenizedBuffer* tokens)
        : Emitter(consumer), tokens_(tokens) {}

   protected:
    auto ConvertLoc(const char* loc, ContextFnT /*context_fn*/) const
        -> Diagnostics::ConvertedLoc override {
      return tokens_->SourcePointerToDiagnosticLoc(loc);
    }

   private:
    const TokenizedBuffer* tokens_;
  };

  // A diagnostic emitter that uses a token as its location type.
  class TokenDiagnosticEmitter : public Diagnostics::Emitter<TokenIndex> {
   public:
    explicit TokenDiagnosticEmitter(Diagnostics::Consumer* consumer,
                                    const TokenizedBuffer* tokens)
        : Emitter(consumer), tokens_(tokens) {}

   protected:
    auto ConvertLoc(TokenIndex token, ContextFnT /*context_fn*/) const
        -> Diagnostics::ConvertedLoc override {
      return tokens_->TokenToDiagnosticLoc(token);
    }

   private:
    const TokenizedBuffer* tokens_;
  };

  // Converts a pointer into the source to a diagnostic location.
  auto SourcePointerToDiagnosticLoc(const char* loc) const
      -> Diagnostics::ConvertedLoc;

  // Specifies minimum widths to use when printing a token's fields via
  // `PrintToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `widths` for each dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // Storage for the information about a specific token in the buffer.
  //
  // This provides a friendly accessor API to the carefully space-optimized
  // storage model of the information we associate with each token.
  //
  // There are four pieces of information stored here:
  // - The kind of the token.
  // - Whether that token has leading whitespace before it.
  // - A kind-specific payload that can be compressed into a small integer.
  //   - This class provides dedicated accessors for each different form of
  //     payload that check the kind and payload correspond correctly.
  // - A 32-bit byte offset of the token within the source text.
  //
  // These are compressed and stored in 8 bytes for each token.
  //
  // Note that while the class provides some limited setters for payloads and
  // mutating methods, setters on this type may be unexpectedly expensive due
  // to the bit-packed representation and should be avoided. As such, only the
  // minimal necessary setters are provided.
  //
  // TODO: It might be worth considering a struct-of-arrays data layout in
  // order to move the byte offset to a separate array from the rest, as it is
  // only hot during lexing and then cold during parsing and semantic analysis.
  // However, a trivial approach to that adds more overhead than it saves due
  // to tracking two separate vectors and their growth. Making this profitable
  // would likely at least require a highly specialized single vector that
  // manages the growth once and then provides separate storage areas for the
  // two arrays.
  class TokenInfo {
   public:
    // The kind for this token.
    auto kind() const -> TokenKind { return kind_; }

    // Whether this token is preceded by whitespace. We only store the
    // preceding state, and look at the next token to check for trailing
    // whitespace.
    auto has_leading_space() const -> bool { return has_leading_space_; }

    // A collection of methods to access the specific payload included with
    // particular kinds of tokens. Only the specific payload accessor below
    // may be used for an info entry of a token with a particular kind, and
    // these check that the kind is valid. Some tokens do not include a
    // payload at all, and none of these methods may be called for them.
    auto ident_id() const -> IdentifierId {
      CARBON_DCHECK(kind() == TokenKind::Identifier);
      return IdentifierId(token_payload_);
    }
    auto set_ident_id(IdentifierId ident_id) -> void {
      CARBON_DCHECK(kind() == TokenKind::Identifier);
      token_payload_ = ident_id.index;
    }

    auto string_literal_id() const -> StringLiteralValueId {
      CARBON_DCHECK(kind() == TokenKind::StringLiteral);
      return StringLiteralValueId(token_payload_);
    }

    auto int_id() const -> IntId {
      CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
                    kind() == TokenKind::IntTypeLiteral ||
                    kind() == TokenKind::UnsignedIntTypeLiteral ||
                    kind() == TokenKind::FloatTypeLiteral);
      return IntId::MakeFromTokenPayload(token_payload_);
    }

    auto real_id() const -> RealId {
      CARBON_DCHECK(kind() == TokenKind::RealLiteral);
      return RealId(token_payload_);
    }

    auto closing_token_index() const -> TokenIndex {
      CARBON_DCHECK(kind().is_opening_symbol());
      return TokenIndex(token_payload_);
    }
    auto set_closing_token_index(TokenIndex closing_index) -> void {
      CARBON_DCHECK(kind().is_opening_symbol());
      token_payload_ = closing_index.index;
    }

    auto opening_token_index() const -> TokenIndex {
      CARBON_DCHECK(kind().is_closing_symbol());
      return TokenIndex(token_payload_);
    }
    auto set_opening_token_index(TokenIndex opening_index) -> void {
      CARBON_DCHECK(kind().is_closing_symbol());
      token_payload_ = opening_index.index;
    }

    auto error_length() const -> int {
      CARBON_DCHECK(kind() == TokenKind::Error);
      return token_payload_;
    }

    // Zero-based byte offset of the token within the file. This can be
    // combined with the buffer's line information to locate the line and
    // column of the token as well.
    auto byte_offset() const -> int32_t { return byte_offset_; }

    // Transforms the token into an error token of the given length, but at
    // its original position and with the same whitespace adjacency.
    auto ResetAsError(int error_length) -> void {
      // Construct a fresh token to establish any needed invariants and
      // replace this token with it.
      TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
                      byte_offset());
      *this = error;
    }

   private:
    friend class Lexer;

    static constexpr int PayloadBits = 23;

    // Make sure we have enough payload bits to represent token-associated IDs.
    static_assert(PayloadBits >= IntId::TokenIdBits);
    static_assert(PayloadBits >= TokenIndex::Bits);

    // Constructor for a TokenKind that carries no payload, or where the
    // payload will be set later.
    //
    // Only used by the lexer, which enforces that only the correct kinds are
    // used.
    //
    // When the payload is not being set, we leave it uninitialized. At least
    // in some cases, this will allow MSan to correctly detect erroneous
    // attempts to access the payload, as it works to track uninitialized
    // memory bit-for-bit specifically to handle complex cases like bitfields.
    TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
        : kind_(kind),
          has_leading_space_(has_leading_space),
          byte_offset_(byte_offset) {}

    // Constructor for a TokenKind that carries a payload.
    //
    // Only used by the lexer, which enforces the correct kind and payload
    // types.
    TokenInfo(TokenKind kind, bool has_leading_space, int payload,
              int32_t byte_offset)
        : kind_(kind),
          has_leading_space_(has_leading_space),
          token_payload_(payload),
          byte_offset_(byte_offset) {}

    // A bitfield that encodes the token's kind, the leading space flag, and
    // the remaining bits in a payload. These are encoded together as a
    // bitfield for density, and because these are the hottest fields of
    // tokens for consumers after lexing.
    //
    // Payload values are typically ID types for which we create at most one
    // per token, so we ensure that `token_payload_` is large enough to fit
    // any token index. Stores to this field may overflow, but we produce an
    // error in `Lexer::Finalize` if the file has more than `TokenIndex::Max`
    // tokens, so this value never overflows if lexing succeeds.
    TokenKind kind_;
    static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
    bool has_leading_space_ : 1;
    unsigned token_payload_ : PayloadBits;

    // Separate storage for the byte offset; this is hot while lexing but
    // generally cold afterwards.
    int32_t byte_offset_;
  };
  static_assert(sizeof(TokenInfo) == 8,
                "Expected `TokenInfo` to pack to an 8-byte structure.");

  // A comment, which can be a block of lines. These are tracked separately
  // from tokens because they don't affect parsing; if they were part of the
  // token stream, we'd need more general special-casing within token logic.
  //
  // Note that `CommentInfo` is the API used to expose a comment.
  struct CommentData {
    // Zero-based byte offset of the start of the comment within the source
    // buffer provided.
    int32_t start;

    // The comment's length.
    int32_t length;
  };

  struct LineInfo {
    explicit LineInfo(int32_t start) : start(start), indent(0) {}

    // Zero-based byte offset of the start of the line within the source
    // buffer provided.
    int32_t start;

    // The byte offset from the start of the line of the first non-whitespace
    // character.
    int32_t indent;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that
  // its return can indicate if an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores
                           [[clang::lifetimebound]],
                           SourceBuffer& source [[clang::lifetimebound]])
      : value_stores_(&value_stores), source_(&source) {}

  // Finds the line containing the given byte offset.
  auto FindLineIndex(int32_t byte_offset) const -> LineIndex;

  auto GetLineInfo(LineIndex line) -> LineInfo&;
  auto GetLineInfo(LineIndex line) const -> const LineInfo&;
  auto AddLine(LineInfo info) -> LineIndex;

  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
  auto AddToken(TokenInfo info) -> TokenIndex;

  auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                  PrintWidths widths) const -> void;

  // Adds a comment. This uses the indent to potentially stitch together two
  // adjacent comments.
  auto AddComment(int32_t indent, int32_t start, int32_t end) -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  llvm::SmallVector<TokenInfo> token_infos_;
  llvm::SmallVector<LineInfo> line_infos_;

  // Comments in the file.
  llvm::SmallVector<CommentData> comments_;

  // Ranges of tokens marked by `//@dump-sem-ir-[begin|end]`.
  //
  // The particular syntax was chosen because it can be lexed efficiently. It
  // only occurs in otherwise-invalid comment text, so it shouldn't slow down
  // lexing of correct code. It's also comment-like in that its presence won't
  // affect parsing or checking.
  llvm::SmallVector<InclusiveTokenRange> dump_sem_ir_ranges_;

  // An upper bound on the number of parse tree nodes that we expect to be
  // created for the tokens in this buffer.
  int expected_max_parse_tree_size_ = 0;

  bool has_errors_ = false;

  // A vector of flags for recovery tokens. If empty, there are none. When
  // doing token recovery, this will be extended to be indexable by token
  // indices and contain true for the tokens that were synthesized for
  // recovery.
  llvm::BitVector recovery_tokens_;
};

inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  return GetTokenInfo(token).kind();
}

inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
    -> bool {
  return GetTokenInfo(token).has_leading_space();
}

inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
    -> bool {
  // Only leading whitespace is stored, so check whether a next token exists
  // and has leading whitespace.
  TokenIterator it(token);
  ++it;
  return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
}

inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
  return token_infos_[token.index];
}

inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const
    -> const TokenInfo& {
  return token_infos_[token.index];
}

inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  TokenIndex index(token_infos_.size());
  token_infos_.push_back(info);
  expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
  return index;
}

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_