string_literal.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/string_literal.h"
  5. #include "llvm/ADT/SmallString.h"
  6. #include "llvm/ADT/StringExtras.h"
  7. #include "llvm/Support/ConvertUTF.h"
  8. #include "llvm/Support/ErrorHandling.h"
  9. #include "llvm/Support/FormatVariadic.h"
  10. #include "toolchain/lexer/character_set.h"
  11. namespace Carbon {
  12. using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  13. struct ContentBeforeStringTerminator
  14. : SimpleDiagnostic<ContentBeforeStringTerminator> {
  15. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  16. static constexpr llvm::StringLiteral Message =
  17. "Only whitespace is permitted before the closing `\"\"\"` of a "
  18. "multi-line string.";
  19. };
  20. struct UnicodeEscapeTooLarge : SimpleDiagnostic<UnicodeEscapeTooLarge> {
  21. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  22. static constexpr llvm::StringLiteral Message =
  23. "Code point specified by `\\u{...}` escape is greater than 0x10FFFF.";
  24. };
  25. struct UnicodeEscapeSurrogate : SimpleDiagnostic<UnicodeEscapeSurrogate> {
  26. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  27. static constexpr llvm::StringLiteral Message =
  28. "Code point specified by `\\u{...}` escape is a surrogate character.";
  29. };
  30. struct UnicodeEscapeMissingBracedDigits
  31. : SimpleDiagnostic<UnicodeEscapeMissingBracedDigits> {
  32. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  33. static constexpr llvm::StringLiteral Message =
  34. "Escape sequence `\\u` must be followed by a braced sequence of "
  35. "uppercase hexadecimal digits, for example `\\u{70AD}`.";
  36. };
  37. struct HexadecimalEscapeMissingDigits
  38. : SimpleDiagnostic<HexadecimalEscapeMissingDigits> {
  39. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  40. static constexpr llvm::StringLiteral Message =
  41. "Escape sequence `\\x` must be followed by two "
  42. "uppercase hexadecimal digits, for example `\\x0F`.";
  43. };
  44. struct DecimalEscapeSequence : SimpleDiagnostic<DecimalEscapeSequence> {
  45. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  46. static constexpr llvm::StringLiteral Message =
  47. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead of "
  48. "`\\0` if the next character is a digit.";
  49. };
  50. struct UnknownEscapeSequence {
  51. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  52. static constexpr const char* Message = "Unrecognized escape sequence `{0}`.";
  53. char first;
  54. auto Format() -> std::string { return llvm::formatv(Message, first).str(); }
  55. };
  56. struct MismatchedIndentInString : SimpleDiagnostic<MismatchedIndentInString> {
  57. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  58. static constexpr llvm::StringLiteral Message =
  59. "Indentation does not match that of the closing \"\"\" in multi-line "
  60. "string literal.";
  61. };
  62. struct InvalidHorizontalWhitespaceInString
  63. : SimpleDiagnostic<InvalidHorizontalWhitespaceInString> {
  64. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  65. static constexpr llvm::StringLiteral Message =
  66. "Whitespace other than plain space must be expressed with an escape "
  67. "sequence in a string literal.";
  68. };
  69. // Find and return the opening characters of a multi-line string literal,
  70. // after any '#'s, including the file type indicator and following newline.
  71. static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
  72. -> llvm::StringRef {
  73. llvm::StringRef remaining = source_text;
  74. if (!remaining.consume_front(R"(""")")) {
  75. return llvm::StringRef();
  76. }
  77. // The rest of the line must be a valid file type indicator: a sequence of
  78. // characters containing neither '#' nor '"' followed by a newline.
  79. remaining = remaining.drop_until(
  80. [](char c) { return c == '"' || c == '#' || c == '\n'; });
  81. if (!remaining.consume_front("\n")) {
  82. return llvm::StringRef();
  83. }
  84. return source_text.take_front(remaining.begin() - source_text.begin());
  85. }
  86. // If source_text begins with a string literal token, extract and return
  87. // information on that token.
  88. auto LexedStringLiteral::Lex(llvm::StringRef source_text)
  89. -> llvm::Optional<LexedStringLiteral> {
  90. const char* begin = source_text.begin();
  91. int hash_level = 0;
  92. while (source_text.consume_front("#")) {
  93. ++hash_level;
  94. }
  95. llvm::SmallString<16> terminator("\"");
  96. llvm::SmallString<16> escape("\\");
  97. llvm::StringRef multi_line_prefix =
  98. TakeMultiLineStringLiteralPrefix(source_text);
  99. bool multi_line = !multi_line_prefix.empty();
  100. if (multi_line) {
  101. source_text = source_text.drop_front(multi_line_prefix.size());
  102. terminator = R"(""")";
  103. } else if (!source_text.consume_front("\"")) {
  104. return llvm::None;
  105. }
  106. // The terminator and escape sequence marker require a number of '#'s
  107. // matching the leading sequence of '#'s.
  108. terminator.resize(terminator.size() + hash_level, '#');
  109. escape.resize(escape.size() + hash_level, '#');
  110. const char* content_begin = source_text.begin();
  111. const char* content_end = content_begin;
  112. while (!source_text.consume_front(terminator)) {
  113. // Let LexError figure out how to recover from an unterminated string
  114. // literal.
  115. if (source_text.empty()) {
  116. return llvm::None;
  117. }
  118. // Consume an escape sequence marker if present.
  119. (void)source_text.consume_front(escape);
  120. // Then consume one more character, either of the content or of an
  121. // escape sequence. This can be a newline in a multi-line string literal.
  122. // This relies on multi-character escape sequences not containing an
  123. // embedded and unescaped terminator or newline.
  124. if (!multi_line && source_text.startswith("\n")) {
  125. return llvm::None;
  126. }
  127. source_text = source_text.substr(1);
  128. content_end = source_text.begin();
  129. }
  130. return LexedStringLiteral(
  131. llvm::StringRef(begin, source_text.begin() - begin),
  132. llvm::StringRef(content_begin, content_end - content_begin), hash_level,
  133. multi_line);
  134. }
  135. // Given a string that contains at least one newline, find the indent (the
  136. // leading sequence of horizontal whitespace) of its final line.
  137. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  138. int indent_end = text.size();
  139. for (int i = indent_end - 1; i >= 0; --i) {
  140. if (text[i] == '\n') {
  141. int indent_start = i + 1;
  142. return text.substr(indent_start, indent_end - indent_start);
  143. }
  144. if (!IsSpace(text[i])) {
  145. indent_end = i;
  146. }
  147. }
  148. llvm_unreachable("Given text is required to contain a newline.");
  149. }
  150. // Check the literal is indented properly, if it's a multi-line litera.
  151. // Find the leading whitespace that should be removed from each line of a
  152. // multi-line string literal.
  153. static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
  154. llvm::StringRef content) -> llvm::StringRef {
  155. // Find the leading horizontal whitespace on the final line of this literal.
  156. // Note that for an empty literal, this might not be inside the content.
  157. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  158. // The last line is not permitted to contain any content after its
  159. // indentation.
  160. if (indent.end() != content.end()) {
  161. emitter.EmitError<ContentBeforeStringTerminator>(indent.end());
  162. }
  163. return indent;
  164. }
  165. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  166. static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
  167. llvm::StringRef digits,
  168. std::string& result) -> bool {
  169. unsigned code_point;
  170. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  171. emitter.EmitError<UnicodeEscapeTooLarge>(digits.begin());
  172. return false;
  173. }
  174. if (code_point >= 0xD800 && code_point < 0xE000) {
  175. emitter.EmitError<UnicodeEscapeSurrogate>(digits.begin());
  176. return false;
  177. }
  178. // Convert the code point to a sequence of UTF-8 code units.
  179. // Every code point fits in 6 UTF-8 code units.
  180. const llvm::UTF32 utf32_code_units[1] = {code_point};
  181. llvm::UTF8 utf8_code_units[6];
  182. const llvm::UTF32* src_pos = utf32_code_units;
  183. llvm::UTF8* dest_pos = utf8_code_units;
  184. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  185. &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
  186. if (conv_result != llvm::conversionOK) {
  187. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  188. }
  189. result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
  190. reinterpret_cast<char*>(dest_pos));
  191. return true;
  192. }
  193. // Expand an escape sequence, appending the expanded value to the given
  194. // `result` string. `content` is the string content, starting from the first
  195. // character after the escape sequence introducer (for example, the `n` in
  196. // `\n`), and will be updated to remove the leading escape sequence.
  197. static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
  198. llvm::StringRef& content,
  199. std::string& result) -> void {
  200. assert(!content.empty() && "should have escaped closing delimiter");
  201. char first = content.front();
  202. content = content.drop_front(1);
  203. switch (first) {
  204. case 't':
  205. result += '\t';
  206. return;
  207. case 'n':
  208. result += '\n';
  209. return;
  210. case 'r':
  211. result += '\r';
  212. return;
  213. case '"':
  214. result += '"';
  215. return;
  216. case '\'':
  217. result += '\'';
  218. return;
  219. case '\\':
  220. result += '\\';
  221. return;
  222. case '0':
  223. result += '\0';
  224. if (!content.empty() && IsDecimalDigit(content.front())) {
  225. emitter.EmitError<DecimalEscapeSequence>(content.begin());
  226. return;
  227. }
  228. return;
  229. case 'x':
  230. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  231. IsUpperHexDigit(content[1])) {
  232. result +=
  233. static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
  234. content = content.drop_front(2);
  235. return;
  236. }
  237. emitter.EmitError<HexadecimalEscapeMissingDigits>(content.begin());
  238. break;
  239. case 'u': {
  240. llvm::StringRef remaining = content;
  241. if (remaining.consume_front("{")) {
  242. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  243. remaining = remaining.drop_front(digits.size());
  244. if (!digits.empty() && remaining.consume_front("}")) {
  245. if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
  246. break;
  247. }
  248. content = remaining;
  249. return;
  250. }
  251. }
  252. emitter.EmitError<UnicodeEscapeMissingBracedDigits>(content.begin());
  253. break;
  254. }
  255. default:
  256. emitter.EmitError<UnknownEscapeSequence>(content.begin() - 1,
  257. {.first = first});
  258. break;
  259. }
  260. // If we get here, we didn't recognize this escape sequence and have already
  261. // issued a diagnostic. For error recovery purposes, expand this escape
  262. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  263. result += first;
  264. }
  265. // Expand any escape sequences in the given string literal.
  266. static auto ExpandEscapeSequencesAndRemoveIndent(
  267. LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  268. llvm::StringRef indent) -> std::string {
  269. std::string result;
  270. result.reserve(contents.size());
  271. llvm::SmallString<16> escape("\\");
  272. escape.resize(1 + hash_level, '#');
  273. // Process each line of the string literal.
  274. while (true) {
  275. // Every non-empty line (that contains anything other than horizontal
  276. // whitespace) is required to start with the string's indent. For error
  277. // recovery, remove all leading whitespace if the indent doesn't match.
  278. if (!contents.consume_front(indent)) {
  279. const char* line_start = contents.begin();
  280. contents = contents.drop_while(IsHorizontalWhitespace);
  281. if (!contents.startswith("\n")) {
  282. emitter.EmitError<MismatchedIndentInString>(line_start);
  283. }
  284. }
  285. // Process the contents of the line.
  286. while (true) {
  287. auto end_of_regular_text = contents.find_if([](char c) {
  288. return c == '\n' || c == '\\' ||
  289. (IsHorizontalWhitespace(c) && c != ' ');
  290. });
  291. result += contents.substr(0, end_of_regular_text);
  292. contents = contents.substr(end_of_regular_text);
  293. if (contents.empty()) {
  294. return result;
  295. }
  296. if (contents.consume_front("\n")) {
  297. // Trailing whitespace before a newline doesn't contribute to the string
  298. // literal value.
  299. while (!result.empty() && result.back() != '\n' &&
  300. IsSpace(result.back())) {
  301. result.pop_back();
  302. }
  303. result += '\n';
  304. // Move onto to the next line.
  305. break;
  306. }
  307. if (IsHorizontalWhitespace(contents.front())) {
  308. // Horizontal whitespace other than ` ` is valid only at the end of a
  309. // line.
  310. assert(contents.front() != ' ' &&
  311. "should not have stopped at a plain space");
  312. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  313. if (after_space == llvm::StringRef::npos ||
  314. contents[after_space] != '\n') {
  315. // TODO: Include the source range of the whitespace up to
  316. // `contents.begin() + after_space` in the diagnostic.
  317. emitter.EmitError<InvalidHorizontalWhitespaceInString>(
  318. contents.begin());
  319. // Include the whitespace in the string contents for error recovery.
  320. result += contents.substr(0, after_space);
  321. }
  322. contents = contents.substr(after_space);
  323. continue;
  324. }
  325. if (!contents.consume_front(escape)) {
  326. // This is not an escape sequence, just a raw `\`.
  327. result += contents.front();
  328. contents = contents.drop_front(1);
  329. continue;
  330. }
  331. if (contents.consume_front("\n")) {
  332. // An escaped newline ends the line without producing any content and
  333. // without trimming trailing whitespace.
  334. break;
  335. }
  336. // Handle this escape sequence.
  337. ExpandAndConsumeEscapeSequence(emitter, contents, result);
  338. }
  339. }
  340. }
  341. auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
  342. -> std::string {
  343. llvm::StringRef indent =
  344. multi_line ? CheckIndent(emitter, text, content) : llvm::StringRef();
  345. return ExpandEscapeSequencesAndRemoveIndent(emitter, content, hash_level,
  346. indent);
  347. }
  348. } // namespace Carbon