string_literal.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "toolchain/lex/string_literal.h"

#include <climits>
#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <optional>

#include "common/check.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "toolchain/lex/character_set.h"
#include "toolchain/lex/helpers.h"
  14. namespace Carbon::Lex {
// The diagnostic emitter used throughout this file. Diagnostic locations are
// passed as `const char*` pointers into the text being lexed.
using DiagnosticEmitter = Diagnostics::Emitter<const char*>;

// The delimiter that opens and closes a multi-line string literal.
static constexpr char MultiLineIndicator[] = R"(''')";

// An alternative multi-line delimiter that is lexed for error recovery only;
// literals spelled this way are diagnosed when their value is computed.
static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
// Describes the opening delimiter of a string literal, as found immediately
// after any leading '#'s.
struct StringLiteral::Introducer {
  // The kind of string being introduced.
  MultiLineKind kind;

  // The terminator for the string, without any '#' suffixes.
  llvm::StringRef terminator;

  // The length of the introducer, including the file type indicator and
  // newline for a multi-line string literal. This does not count any leading
  // '#'s, which are stripped before the introducer is lexed.
  int prefix_size;

  // Lex the introducer for a string literal, after any '#'s. Returns
  // `std::nullopt` if `source_text` does not begin with an introducer.
  static auto Lex(llvm::StringRef source_text) -> std::optional<Introducer>;
};
  29. // Lex the introducer for a string literal, after any '#'s.
  30. //
  31. // We lex multi-line literals when spelled with either ''' or """ for error
  32. // recovery purposes, and reject """ literals after lexing.
  33. auto StringLiteral::Introducer::Lex(llvm::StringRef source_text)
  34. -> std::optional<Introducer> {
  35. MultiLineKind kind = NotMultiLine;
  36. llvm::StringRef indicator;
  37. if (source_text.starts_with(MultiLineIndicator)) {
  38. kind = MultiLine;
  39. indicator = llvm::StringRef(MultiLineIndicator);
  40. } else if (source_text.starts_with(DoubleQuotedMultiLineIndicator)) {
  41. kind = MultiLineWithDoubleQuotes;
  42. indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
  43. }
  44. if (kind != NotMultiLine) {
  45. // The rest of the line must be a valid file type indicator: a sequence of
  46. // characters containing neither '#' nor '"' followed by a newline.
  47. auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
  48. if (prefix_end != llvm::StringRef::npos &&
  49. source_text[prefix_end] == '\n') {
  50. // Include the newline in the prefix size.
  51. return Introducer{.kind = kind,
  52. .terminator = indicator,
  53. .prefix_size = static_cast<int>(prefix_end + 1)};
  54. }
  55. }
  56. if (!source_text.empty() && source_text[0] == '"') {
  57. return Introducer{
  58. .kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
  59. }
  60. return std::nullopt;
  61. }
  62. namespace {
  63. // A set of 'char' values.
  64. struct alignas(8) CharSet {
  65. bool Elements[UCHAR_MAX + 1];
  66. constexpr CharSet(std::initializer_list<char> chars) : Elements() {
  67. for (char c : chars) {
  68. Elements[static_cast<unsigned char>(c)] = true;
  69. }
  70. }
  71. constexpr auto operator[](char c) const -> bool {
  72. return Elements[static_cast<unsigned char>(c)];
  73. }
  74. };
  75. } // namespace
  76. auto StringLiteral::Lex(llvm::StringRef source_text)
  77. -> std::optional<StringLiteral> {
  78. int64_t cursor = 0;
  79. const int64_t source_text_size = source_text.size();
  80. // Determine the number of hashes prefixing.
  81. while (cursor < source_text_size && source_text[cursor] == '#') {
  82. ++cursor;
  83. }
  84. const int hash_level = cursor;
  85. const std::optional<Introducer> introducer =
  86. Introducer::Lex(source_text.substr(hash_level));
  87. if (!introducer) {
  88. return std::nullopt;
  89. }
  90. cursor += introducer->prefix_size;
  91. const int prefix_len = cursor;
  92. llvm::SmallString<16> terminator(introducer->terminator);
  93. llvm::SmallString<16> escape("\\");
  94. // The terminator and escape sequence marker require a number of '#'s
  95. // matching the leading sequence of '#'s.
  96. terminator.resize(terminator.size() + hash_level, '#');
  97. escape.resize(escape.size() + hash_level, '#');
  98. bool content_needs_validation = false;
  99. // TODO: Detect indent / dedent for multi-line string literals in order to
  100. // stop parsing on dedent before a terminator is found.
  101. for (; cursor < source_text_size; ++cursor) {
  102. // Use a lookup table to allow us to quickly skip uninteresting characters.
  103. static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\'', '\t'};
  104. if (!InterestingChars[source_text[cursor]]) {
  105. continue;
  106. }
  107. // This switch and loop structure relies on multi-character terminators and
  108. // escape sequences starting with a predictable character and not containing
  109. // embedded and unescaped terminators or newlines.
  110. switch (source_text[cursor]) {
  111. case '\t':
  112. // Tabs have extra validation.
  113. content_needs_validation = true;
  114. break;
  115. case '\\':
  116. if (escape.size() == 1 ||
  117. source_text.substr(cursor + 1).starts_with(escape.substr(1))) {
  118. content_needs_validation = true;
  119. cursor += escape.size();
  120. // If there's either not a character following the escape, or it's a
  121. // single-line string and the escaped character is a newline, we
  122. // should stop here.
  123. if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
  124. source_text[cursor] == '\n')) {
  125. llvm::StringRef text = source_text.take_front(cursor);
  126. return StringLiteral(text, text.drop_front(prefix_len),
  127. content_needs_validation, hash_level,
  128. introducer->kind,
  129. /*is_terminated=*/false);
  130. }
  131. }
  132. break;
  133. case '\n':
  134. if (introducer->kind == NotMultiLine) {
  135. llvm::StringRef text = source_text.take_front(cursor);
  136. return StringLiteral(text, text.drop_front(prefix_len),
  137. content_needs_validation, hash_level,
  138. introducer->kind,
  139. /*is_terminated=*/false);
  140. }
  141. break;
  142. case '"':
  143. case '\'':
  144. if (source_text.substr(cursor).starts_with(terminator)) {
  145. llvm::StringRef text =
  146. source_text.substr(0, cursor + terminator.size());
  147. llvm::StringRef content =
  148. source_text.substr(prefix_len, cursor - prefix_len);
  149. return StringLiteral(text, content, content_needs_validation,
  150. hash_level, introducer->kind,
  151. /*is_terminated=*/true);
  152. }
  153. break;
  154. default:
  155. // No action for non-terminators.
  156. break;
  157. }
  158. }
  159. // No terminator was found.
  160. return StringLiteral(source_text, source_text.drop_front(prefix_len),
  161. content_needs_validation, hash_level, introducer->kind,
  162. /*is_terminated=*/false);
  163. }
  164. // Given a string that contains at least one newline, find the indent (the
  165. // leading sequence of horizontal whitespace) of its final line.
  166. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  167. int indent_end = text.size();
  168. for (int i = indent_end - 1; i >= 0; --i) {
  169. if (text[i] == '\n') {
  170. int indent_start = i + 1;
  171. return text.substr(indent_start, indent_end - indent_start);
  172. }
  173. if (!IsSpace(text[i])) {
  174. indent_end = i;
  175. }
  176. }
  177. llvm_unreachable("Given text is required to contain a newline.");
  178. }
  179. // Check the literal is indented properly, if it's a multi-line litera.
  180. // Find the leading whitespace that should be removed from each line of a
  181. // multi-line string literal.
  182. static auto CheckIndent(DiagnosticEmitter& emitter, llvm::StringRef text,
  183. llvm::StringRef content) -> llvm::StringRef {
  184. // Find the leading horizontal whitespace on the final line of this literal.
  185. // Note that for an empty literal, this might not be inside the content.
  186. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  187. // The last line is not permitted to contain any content after its
  188. // indentation.
  189. if (indent.end() != content.end()) {
  190. CARBON_DIAGNOSTIC(
  191. ContentBeforeStringTerminator, Error,
  192. "only whitespace is permitted before the closing `'''` of a "
  193. "multi-line string");
  194. emitter.Emit(indent.end(), ContentBeforeStringTerminator);
  195. }
  196. return indent;
  197. }
  198. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  199. static auto ExpandUnicodeEscapeSequence(DiagnosticEmitter& emitter,
  200. llvm::StringRef digits,
  201. char*& buffer_cursor) -> bool {
  202. unsigned code_point;
  203. if (!CanLexInt(emitter, digits)) {
  204. return false;
  205. }
  206. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  207. CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
  208. "code point specified by `\\u{{...}}` escape is greater "
  209. "than 0x10FFFF");
  210. emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
  211. return false;
  212. }
  213. if (code_point >= 0xD800 && code_point < 0xE000) {
  214. CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
  215. "code point specified by `\\u{{...}}` escape is a "
  216. "surrogate character");
  217. emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
  218. return false;
  219. }
  220. // Convert the code point to a sequence of UTF-8 code units.
  221. // Every code point fits in 6 UTF-8 code units.
  222. const llvm::UTF32 utf32_code_units[1] = {code_point};
  223. const llvm::UTF32* src_pos = utf32_code_units;
  224. auto*& buffer_cursor_as_utf8 = reinterpret_cast<llvm::UTF8*&>(buffer_cursor);
  225. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  226. &src_pos, src_pos + 1, &buffer_cursor_as_utf8, buffer_cursor_as_utf8 + 6,
  227. llvm::strictConversion);
  228. if (conv_result != llvm::conversionOK) {
  229. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  230. }
  231. return true;
  232. }
  233. // Appends a character to the buffer and advances the cursor.
  234. static auto AppendChar(char*& buffer_cursor, char append_char) -> void {
  235. buffer_cursor[0] = append_char;
  236. ++buffer_cursor;
  237. }
  238. // Appends the front of contents to the buffer and advances the cursor.
  239. static auto AppendFrontOfContents(char*& buffer_cursor,
  240. llvm::StringRef contents, size_t len_or_npos)
  241. -> void {
  242. auto len =
  243. len_or_npos == llvm::StringRef::npos ? contents.size() : len_or_npos;
  244. memcpy(buffer_cursor, contents.data(), len);
  245. buffer_cursor += len;
  246. }
  247. // Expand an escape sequence, appending the expanded value to the given
  248. // `result` string. `content` is the string content, starting from the first
  249. // character after the escape sequence introducer (for example, the `n` in
  250. // `\n`), and will be updated to remove the leading escape sequence.
  251. static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
  252. llvm::StringRef& content,
  253. char*& buffer_cursor) -> void {
  254. CARBON_CHECK(!content.empty(), "should have escaped closing delimiter");
  255. char first = content.front();
  256. content = content.drop_front(1);
  257. switch (first) {
  258. case 't':
  259. AppendChar(buffer_cursor, '\t');
  260. return;
  261. case 'n':
  262. AppendChar(buffer_cursor, '\n');
  263. return;
  264. case 'r':
  265. AppendChar(buffer_cursor, '\r');
  266. return;
  267. case '"':
  268. AppendChar(buffer_cursor, '"');
  269. return;
  270. case '\'':
  271. AppendChar(buffer_cursor, '\'');
  272. return;
  273. case '\\':
  274. AppendChar(buffer_cursor, '\\');
  275. return;
  276. case '0':
  277. AppendChar(buffer_cursor, '\0');
  278. if (!content.empty() && IsDecimalDigit(content.front())) {
  279. CARBON_DIAGNOSTIC(
  280. DecimalEscapeSequence, Error,
  281. "decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
  282. "of `\\0` if the next character is a digit");
  283. emitter.Emit(content.begin(), DecimalEscapeSequence);
  284. return;
  285. }
  286. return;
  287. case 'x':
  288. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  289. IsUpperHexDigit(content[1])) {
  290. AppendChar(buffer_cursor, static_cast<char>(llvm::hexFromNibbles(
  291. content[0], content[1])));
  292. content = content.drop_front(2);
  293. return;
  294. }
  295. CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
  296. "escape sequence `\\x` must be followed by two "
  297. "uppercase hexadecimal digits, for example `\\x0F`");
  298. emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
  299. break;
  300. case 'u': {
  301. llvm::StringRef remaining = content;
  302. if (remaining.consume_front("{")) {
  303. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  304. remaining = remaining.drop_front(digits.size());
  305. if (!digits.empty() && remaining.consume_front("}")) {
  306. if (!ExpandUnicodeEscapeSequence(emitter, digits, buffer_cursor)) {
  307. break;
  308. }
  309. content = remaining;
  310. return;
  311. }
  312. }
  313. CARBON_DIAGNOSTIC(
  314. UnicodeEscapeMissingBracedDigits, Error,
  315. "escape sequence `\\u` must be followed by a braced sequence of "
  316. "uppercase hexadecimal digits, for example `\\u{{70AD}}`");
  317. emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
  318. break;
  319. }
  320. default:
  321. CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
  322. "unrecognized escape sequence `{0}`", char);
  323. emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
  324. break;
  325. }
  326. // If we get here, we didn't recognize this escape sequence and have already
  327. // issued a diagnostic. For error recovery purposes, expand this escape
  328. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  329. AppendChar(buffer_cursor, first);
  330. }
// Computes the value of a string literal's contents, writing it into `buffer`
// and returning a view of the written portion.
//
// For each line of `contents` this removes the leading `indent`, expands
// escape sequences (introduced by `\` followed by `hash_level` '#'s), drops
// unescaped trailing whitespace before a newline, and diagnoses mismatched
// indentation and horizontal whitespace other than a plain space that is not
// at the end of a line.
//
// `buffer` must have room for the result; the caller in this file allocates
// as many bytes as the content is long.
static auto ExpandEscapeSequencesAndRemoveIndent(
    DiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
    llvm::StringRef indent, char* buffer) -> llvm::StringRef {
  char* buffer_cursor = buffer;

  // The escape introducer is a backslash followed by `hash_level` '#'s.
  llvm::SmallString<16> escape("\\");
  escape.resize(1 + hash_level, '#');

  // Process each line of the string literal.
  while (true) {
    // Every non-empty line (that contains anything other than horizontal
    // whitespace) is required to start with the string's indent. For error
    // recovery, remove all leading whitespace if the indent doesn't match.
    if (!contents.consume_front(indent)) {
      const char* line_start = contents.begin();
      contents = contents.drop_while(IsHorizontalWhitespace);
      // A line holding only whitespace is allowed to be shorter than the
      // indent, so only diagnose when something else follows.
      if (!contents.starts_with("\n")) {
        CARBON_DIAGNOSTIC(
            MismatchedIndentInString, Error,
            "indentation does not match that of the closing `'''` in "
            "multi-line string literal");
        emitter.Emit(line_start, MismatchedIndentInString);
      }
    }

    // Tracks the position at the last time we expanded an escape to ensure we
    // don't misinterpret it as unescaped when backtracking.
    char* buffer_last_escape = buffer_cursor;

    // Process the contents of the line.
    while (true) {
      // Append the next segment of plain text. The segment stops at a
      // newline, a backslash, or horizontal whitespace other than a plain
      // space.
      auto end_of_regular_text = contents.find_if([](char c) {
        return c == '\n' || c == '\\' ||
               (IsHorizontalWhitespace(c) && c != ' ');
      });
      AppendFrontOfContents(buffer_cursor, contents, end_of_regular_text);
      if (end_of_regular_text == llvm::StringRef::npos) {
        // Nothing special remains, so the whole of the contents has been
        // processed.
        return llvm::StringRef(buffer, buffer_cursor - buffer);
      }
      contents = contents.drop_front(end_of_regular_text);

      if (contents.consume_front("\n")) {
        // Trailing whitespace in the source before a newline doesn't contribute
        // to the string literal value. However, escaped whitespace (like `\t`)
        // and any whitespace just before that does contribute.
        while (buffer_cursor > buffer_last_escape) {
          char back = *(buffer_cursor - 1);
          if (back == '\n' || !IsSpace(back)) {
            break;
          }
          --buffer_cursor;
        }
        AppendChar(buffer_cursor, '\n');
        // Move onto to the next line.
        break;
      }

      if (IsHorizontalWhitespace(contents.front())) {
        // Horizontal whitespace other than ` ` is valid only at the end of a
        // line.
        CARBON_CHECK(contents.front() != ' ',
                     "should not have stopped at a plain space");
        auto after_space = contents.find_if_not(IsHorizontalWhitespace);
        if (after_space == llvm::StringRef::npos ||
            contents[after_space] != '\n') {
          // TODO: Include the source range of the whitespace up to
          // `contents.begin() + after_space` in the diagnostic.
          CARBON_DIAGNOSTIC(
              InvalidHorizontalWhitespaceInString, Error,
              "whitespace other than plain space must be expressed with an "
              "escape sequence in a string literal");
          emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
          // Include the whitespace in the string contents for error recovery.
          AppendFrontOfContents(buffer_cursor, contents, after_space);
        }
        // When the whitespace runs up to a newline it is trailing whitespace
        // and is skipped without being copied.
        contents = contents.substr(after_space);
        continue;
      }

      // The only remaining stop character is a backslash.
      if (!contents.consume_front(escape)) {
        // This is not an escape sequence, just a raw `\`.
        AppendChar(buffer_cursor, contents.front());
        contents = contents.drop_front(1);
        continue;
      }

      if (contents.consume_front("\n")) {
        // An escaped newline ends the line without producing any content and
        // without trimming trailing whitespace.
        break;
      }

      // Handle this escape sequence.
      ExpandAndConsumeEscapeSequence(emitter, contents, buffer_cursor);
      // Anything written so far is protected from trailing-whitespace
      // trimming.
      buffer_last_escape = buffer_cursor;
    }
  }
}
  422. auto StringLiteral::ComputeValue(llvm::BumpPtrAllocator& allocator,
  423. DiagnosticEmitter& emitter) const
  424. -> llvm::StringRef {
  425. if (!is_terminated_) {
  426. return "";
  427. }
  428. if (multi_line_ == MultiLineWithDoubleQuotes) {
  429. CARBON_DIAGNOSTIC(
  430. MultiLineStringWithDoubleQuotes, Error,
  431. "use `'''` delimiters for a multi-line string literal, not `\"\"\"`");
  432. emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
  433. }
  434. llvm::StringRef indent =
  435. multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
  436. if (!content_needs_validation_ && (!multi_line_ || indent.empty())) {
  437. return content_;
  438. }
  439. // "Expanding" escape sequences should only ever shorten content. As a
  440. // consequence, the output string should allows fit within this allocation.
  441. // Although this may waste some space, it avoids a reallocation.
  442. auto result = ExpandEscapeSequencesAndRemoveIndent(
  443. emitter, content_, hash_level_, indent,
  444. allocator.Allocate<char>(content_.size()));
  445. CARBON_CHECK(result.size() <= content_.size(),
  446. "Content grew from {0} to {1}: `{2}`", content_.size(),
  447. result.size(), content_);
  448. return result;
  449. }
  450. } // namespace Carbon::Lex