// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "toolchain/lex/tokenized_buffer.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <optional>
#include <utility>

#include "common/check.h"
#include "common/string_helpers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
#include "toolchain/base/shared_value_stores.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/character_set.h"
#include "toolchain/lex/numeric_literal.h"
#include "toolchain/lex/string_literal.h"

namespace Carbon::Lex {

auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
  return FindLineIndex(GetTokenInfo(token).byte_offset());
}

auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
  return GetLine(token).index + 1;
}

auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
  const auto& token_info = GetTokenInfo(token);
  const auto& line_info =
      line_infos_.Get(FindLineIndex(token_info.byte_offset()));
  return token_info.byte_offset() - line_info.start + 1;
}
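
// Computes the location one past the last character of the token. Tokens that
// contain newlines (such as multi-line string literals) advance the line and
// restart the column count after the final newline.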
auto TokenizedBuffer::GetEndLoc(TokenIndex token) const
    -> std::pair<LineIndex, int> {
  auto line = GetLine(token);
  int column = GetColumnNumber(token);
  auto token_text = GetTokenText(token);
  if (auto [before_newline, after_newline] = token_text.rsplit('\n');
      before_newline.size() == token_text.size()) {
    // Token fits on one line; advance the column number by its width.
    column += before_newline.size();
  } else {
    // Token contains newlines.
    line.index += before_newline.count('\n') + 1;
    column = 1 + after_newline.size();
  }
  return {line, column};
}
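
// Token text is not stored separately; it is re-derived on demand, either from
// the kind's fixed spelling or by re-examining the source buffer.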
auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef fixed_spelling = token_info.kind().fixed_spelling();
  if (!fixed_spelling.empty()) {
    return fixed_spelling;
  }

  if (token_info.kind() == TokenKind::Error) {
    return source_->text().substr(token_info.byte_offset(),
                                  token_info.error_length());
  }

  // Refer back to the source text to preserve oddities like radix or digit
  // separators the author included.
  if (token_info.kind() == TokenKind::IntLiteral ||
      token_info.kind() == TokenKind::RealLiteral) {
    std::optional<NumericLiteral> relexed_token =
        NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()),
                            token_info.kind() == TokenKind::RealLiteral);
    CARBON_CHECK(relexed_token, "Could not reform numeric literal token.");
    return relexed_token->text();
  }

  // Refer back to the source text to find the original spelling, including
  // escape sequences etc.
  if (token_info.kind() == TokenKind::StringLiteral) {
    std::optional<StringLiteral> relexed_token =
        StringLiteral::Lex(source_->text().substr(token_info.byte_offset()));
    CARBON_CHECK(relexed_token, "Could not reform string literal token.");
    return relexed_token->text();
  }

  // Refer back to the source text to avoid needing to reconstruct the
  // spelling from the size.
  if (token_info.kind().is_sized_type_literal()) {
    llvm::StringRef suffix = source_->text()
                                 .substr(token_info.byte_offset() + 1)
                                 .take_while(IsDecimalDigit);
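    // Widen the digit suffix one byte to the left so the result also covers
    // the leading type-literal character (`i`, `u`, or `f`).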
    return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  }

  if (token_info.kind() == TokenKind::FileStart ||
      token_info.kind() == TokenKind::FileEnd) {
    return llvm::StringRef();
  }

  CARBON_CHECK(token_info.kind() == TokenKind::Identifier, "{0}",
               token_info.kind());
  return value_stores_->identifiers().Get(token_info.ident_id());
}
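
// Each typed accessor below checks the token's kind before reading the
// kind-specific payload out of the token info.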
auto TokenizedBuffer::GetIdentifier(TokenIndex token) const -> IdentifierId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind() == TokenKind::Identifier, "{0}",
               token_info.kind());
  return token_info.ident_id();
}

auto TokenizedBuffer::GetIntLiteral(TokenIndex token) const -> IntId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind() == TokenKind::IntLiteral, "{0}",
               token_info.kind());
  return token_info.int_id();
}

auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind() == TokenKind::RealLiteral, "{0}",
               token_info.kind());
  return token_info.real_id();
}

auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
    -> StringLiteralValueId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind() == TokenKind::StringLiteral, "{0}",
               token_info.kind());
  return token_info.string_literal_id();
}

auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind().is_sized_type_literal(), "{0}",
               token_info.kind());
  return token_info.int_id();
}

auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
    -> TokenIndex {
  const auto& opening_token_info = GetTokenInfo(opening_token);
  CARBON_CHECK(opening_token_info.kind().is_opening_symbol(), "{0}",
               opening_token_info.kind());
  return opening_token_info.closing_token_index();
}

auto TokenizedBuffer::GetMatchedOpeningToken(TokenIndex closing_token) const
    -> TokenIndex {
  const auto& closing_token_info = GetTokenInfo(closing_token);
  CARBON_CHECK(closing_token_info.kind().is_closing_symbol(), "{0}",
               closing_token_info.kind());
  return closing_token_info.opening_token_index();
}

auto TokenizedBuffer::IsRecoveryToken(TokenIndex token) const -> bool {
  if (recovery_tokens_.empty()) {
    return false;
  }
  return recovery_tokens_[token.index];
}

auto TokenizedBuffer::GetIndentColumnNumber(LineIndex line) const -> int {
  return line_infos_.Get(line).indent + 1;
}

auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  index = std::max(widths.index, index);
  kind = std::max(widths.kind, kind);
  column = std::max(widths.column, column);
  line = std::max(widths.line, line);
  indent = std::max(widths.indent, indent);
}

// Compute the printed width of a number. When numbers are printed in decimal,
// the number of digits needed is one more than the log-base-10 of the value.
// We handle a value of zero explicitly.
//
// This routine requires its argument to be *non-negative*.
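//
// For example, 999 needs three digits: floor(log10(999)) + 1 == 2 + 1 == 3,
// while 1000 needs four: floor(log10(1000)) + 1 == 3 + 1 == 4.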
static auto ComputeDecimalPrintedWidth(int number) -> int {
  CARBON_CHECK(number >= 0, "Negative numbers are not supported.");
  if (number == 0) {
    return 1;
  }
  return static_cast<int>(std::log10(number)) + 1;
}
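
// Computes the print widths needed for a single token; `Print` widens these
// across all tokens so that the output columns line up.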
auto TokenizedBuffer::GetTokenPrintWidths(TokenIndex token) const
    -> PrintWidths {
  PrintWidths widths = {};
  widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  widths.kind = GetKind(token).name().size();
  widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  widths.indent =
      ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  return widths;
}
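
// Prints the buffer in a YAML-like format: a filename header followed by one
// line per token, plus any dump ranges recorded for the buffer.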
auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream,
                            bool omit_file_boundary_tokens) const -> void {
  output_stream << "- filename: " << source_->filename() << "\n"
                << "  tokens:\n";

  PrintWidths widths = {};
  widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  for (TokenIndex token : tokens()) {
    widths.Widen(GetTokenPrintWidths(token));
  }

  for (TokenIndex token : tokens()) {
    if (omit_file_boundary_tokens) {
      auto kind = GetKind(token);
      if (kind == TokenKind::FileStart || kind == TokenKind::FileEnd) {
        continue;
      }
    }
    PrintToken(output_stream, token, widths);
    output_stream << "\n";
  }

  if (!dump_sem_ir_ranges_.empty()) {
    output_stream << "  dump_sem_ir_ranges:\n";
    for (auto range : dump_sem_ir_ranges_) {
      output_stream << "    - {begin: " << range.begin.index
                    << ", end: " << range.end.index << "}\n";
    }
  }
}
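
// Prints a single token with no precomputed widths; each field is padded only
// as much as this one token requires.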
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
                                 TokenIndex token) const -> void {
  PrintToken(output_stream, token, {});
}

auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
                                 TokenIndex token, PrintWidths widths) const
    -> void {
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  LineIndex line_index = FindLineIndex(token_info.byte_offset());
  llvm::StringRef token_text = GetTokenText(token);

  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      "  - { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: \"{5}\"",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(
          llvm::formatv("\"{0}\"", token_info.kind().name()).str(),
          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(line_index), widths.indent),
      FormatEscaped(token_text, /*use_hex_escapes=*/true));
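
  // Append kind-specific fields before closing the brace.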
  switch (token_info.kind()) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntLiteral:
      output_stream << ", value: \"";
      value_stores_->ints()
          .Get(GetIntLiteral(token))
          .print(output_stream, /*isSigned=*/false);
      output_stream << "\"";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: \""
                    << value_stores_->reals().Get(GetRealLiteral(token))
                    << "\"";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: \""
                    << FormatEscaped(value_stores_->string_literal_values().Get(
                                         GetStringLiteralValue(token)),
                                     /*use_hex_escapes=*/true)
                    << "\"";
      break;
    default:
      if (token_info.kind().is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind().is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }

  if (token_info.has_leading_space()) {
    output_stream << ", has_leading_space: true";
  }
  if (IsRecoveryToken(token)) {
    output_stream << ", recovery: true";
  }

  output_stream << " }";
}

// Find the line index corresponding to a specific byte offset within the
// source text for this tokenized buffer.
//
// This takes advantage of the lines being sorted by their starting byte
// offsets to do a binary search for the line that contains the provided
// offset.
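//
// `llvm::partition_point` returns the first line that starts strictly after
// the offset, so stepping back one line yields the line containing it.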
auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
  CARBON_DCHECK(line_infos_.size() > 0);
  auto line_range = line_infos_.values();
  auto line_it =
      llvm::partition_point(line_range, [byte_offset](LineInfo line_info) {
        return line_info.start <= byte_offset;
      });
  --line_it;

  // If this isn't the first line but it starts past the end of the source,
  // then this is a synthetic line added for simplicity of lexing. Step back
  // one further to find the last non-synthetic line.
  if (line_it != line_range.begin() &&
      line_it->start == static_cast<int32_t>(source_->text().size())) {
    --line_it;
  }

  CARBON_DCHECK(line_it->start <= byte_offset);
  return LineIndex(line_it - line_range.begin());
}

auto TokenizedBuffer::IsAfterComment(TokenIndex token,
                                     CommentIndex comment_index) const -> bool {
  const auto& comment_data = comments_.Get(comment_index);
  return GetTokenInfo(token).byte_offset() > comment_data.start;
}

auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
    -> llvm::StringRef {
  const auto& comment_data = comments_.Get(comment_index);
  return source_->text().substr(comment_data.start, comment_data.length);
}
auto TokenizedBuffer::AddComment(int32_t indent, int32_t start, int32_t end)
    -> void {
  if (comments_.size() > 0) {
    auto& comment = comments_.Get(CommentIndex(comments_.size() - 1));
    if (comment.start + comment.length + indent == start) {
      comment.length = end - comment.start;
      return;
    }
  }
  comments_.Add({.start = start, .length = end - start});
}

auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
                                      llvm::StringRef label) const -> void {
  mem_usage.Collect(MemUsage::ConcatLabel(label, "allocator_"), allocator_);
  mem_usage.Collect(MemUsage::ConcatLabel(label, "token_infos_"),
                    token_infos_);
  mem_usage.Collect(MemUsage::ConcatLabel(label, "line_infos_"), line_infos_);
  mem_usage.Collect(MemUsage::ConcatLabel(label, "comments_"), comments_);
}
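
// Maps a pointer into the source text to a diagnostic location: the filename,
// the 1-based line and column numbers, and the text of the containing line.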
auto TokenizedBuffer::SourcePointerToDiagnosticLoc(const char* loc) const
    -> Diagnostics::ConvertedLoc {
  CARBON_CHECK(StringRefContainsPointer(source_->text(), loc),
               "location not within buffer");
  int32_t offset = loc - source_->text().begin();
  auto line_range = line_infos_.values();

  // Find the first line starting after the given location.
  const auto next_line_it = llvm::partition_point(
      line_range,
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(next_line_it != line_range.begin(),
               "location precedes the start of the first line");
  const auto line_it = std::prev(next_line_it);
  int line_number = line_it - line_range.begin();
  int column_number = offset - line_it->start;

  // Grab the line from the buffer by slicing from the start of this line to
  // the start of the next. When on the last line, slice to the end of the
  // buffer instead.
  llvm::StringRef text = source_->text();
  llvm::StringRef line = next_line_it != line_range.end()
                             ? text.slice(line_it->start, next_line_it->start)
                             : text.substr(line_it->start);

  // Remove a newline at the end of the line if present.
  // TODO: This should expand to remove all vertical whitespace bytes at the
  // tail of the line, such as CR+LF.
  line.consume_back("\n");

  return {.loc = {.filename = source_->filename(),
                  .line = line,
                  .line_number = line_number + 1,
                  .column_number = column_number + 1},
          .last_byte_offset = offset};
}

auto TokenizedBuffer::TokenToDiagnosticLoc(TokenIndex token) const
    -> Diagnostics::ConvertedLoc {
  // Map the token location into a position within the source buffer.
  const char* token_start =
      source_->text().begin() + GetTokenInfo(token).byte_offset();

  // Find the corresponding file location.
  // TODO: Should we somehow indicate in the diagnostic location if this token
  // is a recovery token that doesn't correspond to the original source?
  auto converted = SourcePointerToDiagnosticLoc(token_start);
  converted.loc.length = GetTokenText(token).size();
  return converted;
}
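
// Returns whether `range` overlaps any recorded dump range. Both kinds of
// range are inclusive of their endpoints.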
auto TokenizedBuffer::OverlapsWithDumpSemIRRange(
    Lex::InclusiveTokenRange range) const -> bool {
  CARBON_CHECK(!dump_sem_ir_ranges_.empty());
  // Ranges are ordered, so we can decide overlap as soon as we find a range
  // that ends at or after `begin`.
  for (auto dump_range : dump_sem_ir_ranges_) {
    if (dump_range.end >= range.begin) {
      return dump_range.begin <= range.end;
    }
  }
  return false;
}

}  // namespace Carbon::Lex