string_literal.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/string_literal.h"
  5. #include "common/check.h"
  6. #include "llvm/ADT/SmallString.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/ConvertUTF.h"
  9. #include "llvm/Support/ErrorHandling.h"
  10. #include "llvm/Support/FormatVariadic.h"
  11. #include "toolchain/lexer/character_set.h"
  12. #include "toolchain/lexer/lex_helpers.h"
  13. namespace Carbon {
  14. using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  15. static constexpr char MultiLineIndicator[] = R"(""")";
  16. // Return the number of opening characters of a multi-line string literal,
  17. // after any '#'s, including the file type indicator and following newline.
  18. static auto GetMultiLineStringLiteralPrefixSize(llvm::StringRef source_text)
  19. -> int {
  20. if (!source_text.startswith(MultiLineIndicator)) {
  21. return 0;
  22. }
  23. // The rest of the line must be a valid file type indicator: a sequence of
  24. // characters containing neither '#' nor '"' followed by a newline.
  25. auto prefix_end =
  26. source_text.find_first_of("#\n\"", strlen(MultiLineIndicator));
  27. if (prefix_end == llvm::StringRef::npos || source_text[prefix_end] != '\n') {
  28. return 0;
  29. }
  30. // Include the newline on return.
  31. return prefix_end + 1;
  32. }
  33. auto LexedStringLiteral::Lex(llvm::StringRef source_text)
  34. -> llvm::Optional<LexedStringLiteral> {
  35. int64_t cursor = 0;
  36. const int64_t source_text_size = source_text.size();
  37. // Determine the number of hashes prefixing.
  38. while (cursor < source_text_size && source_text[cursor] == '#') {
  39. ++cursor;
  40. }
  41. const int hash_level = cursor;
  42. llvm::SmallString<16> terminator("\"");
  43. llvm::SmallString<16> escape("\\");
  44. const int multi_line_prefix_size =
  45. GetMultiLineStringLiteralPrefixSize(source_text.substr(hash_level));
  46. const bool multi_line = multi_line_prefix_size > 0;
  47. if (multi_line) {
  48. cursor += multi_line_prefix_size;
  49. terminator = MultiLineIndicator;
  50. } else if (cursor < source_text_size && source_text[cursor] == '"') {
  51. ++cursor;
  52. } else {
  53. return llvm::None;
  54. }
  55. const int prefix_len = cursor;
  56. // The terminator and escape sequence marker require a number of '#'s
  57. // matching the leading sequence of '#'s.
  58. terminator.resize(terminator.size() + hash_level, '#');
  59. escape.resize(escape.size() + hash_level, '#');
  60. // TODO: Detect indent / dedent for multi-line string literals in order to
  61. // stop parsing on dedent before a terminator is found.
  62. for (; cursor < source_text_size; ++cursor) {
  63. // This switch and loop structure relies on multi-character terminators and
  64. // escape sequences starting with a predictable character and not containing
  65. // embedded and unescaped terminators or newlines.
  66. switch (source_text[cursor]) {
  67. case '\\':
  68. if (escape.size() == 1 ||
  69. source_text.substr(cursor).startswith(escape)) {
  70. cursor += escape.size();
  71. // If there's either not a character following the escape, or it's a
  72. // single-line string and the escaped character is a newline, we
  73. // should stop here.
  74. if (cursor >= source_text_size ||
  75. (!multi_line && source_text[cursor] == '\n')) {
  76. llvm::StringRef text = source_text.take_front(cursor);
  77. return LexedStringLiteral(text, text.drop_front(prefix_len),
  78. hash_level, multi_line,
  79. /*is_terminated=*/false);
  80. }
  81. }
  82. break;
  83. case '\n':
  84. if (!multi_line) {
  85. llvm::StringRef text = source_text.take_front(cursor);
  86. return LexedStringLiteral(text, text.drop_front(prefix_len),
  87. hash_level, multi_line,
  88. /*is_terminated=*/false);
  89. }
  90. break;
  91. case '\"': {
  92. if (terminator.size() == 1 ||
  93. source_text.substr(cursor).startswith(terminator)) {
  94. llvm::StringRef text =
  95. source_text.substr(0, cursor + terminator.size());
  96. llvm::StringRef content =
  97. source_text.substr(prefix_len, cursor - prefix_len);
  98. return LexedStringLiteral(text, content, hash_level, multi_line,
  99. /*is_terminated=*/true);
  100. }
  101. break;
  102. }
  103. }
  104. }
  105. // No terminator was found.
  106. return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
  107. hash_level, multi_line,
  108. /*is_terminated=*/false);
  109. }
  110. // Given a string that contains at least one newline, find the indent (the
  111. // leading sequence of horizontal whitespace) of its final line.
  112. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  113. int indent_end = text.size();
  114. for (int i = indent_end - 1; i >= 0; --i) {
  115. if (text[i] == '\n') {
  116. int indent_start = i + 1;
  117. return text.substr(indent_start, indent_end - indent_start);
  118. }
  119. if (!IsSpace(text[i])) {
  120. indent_end = i;
  121. }
  122. }
  123. llvm_unreachable("Given text is required to contain a newline.");
  124. }
  125. // Check the literal is indented properly, if it's a multi-line litera.
  126. // Find the leading whitespace that should be removed from each line of a
  127. // multi-line string literal.
  128. static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
  129. llvm::StringRef content) -> llvm::StringRef {
  130. // Find the leading horizontal whitespace on the final line of this literal.
  131. // Note that for an empty literal, this might not be inside the content.
  132. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  133. // The last line is not permitted to contain any content after its
  134. // indentation.
  135. if (indent.end() != content.end()) {
  136. CARBON_DIAGNOSTIC(
  137. ContentBeforeStringTerminator, Error,
  138. "Only whitespace is permitted before the closing `\"\"\"` of a "
  139. "multi-line string.");
  140. emitter.Emit(indent.end(), ContentBeforeStringTerminator);
  141. }
  142. return indent;
  143. }
  144. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  145. static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
  146. llvm::StringRef digits,
  147. std::string& result) -> bool {
  148. unsigned code_point;
  149. if (!CanLexInteger(emitter, digits)) {
  150. return false;
  151. }
  152. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  153. CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
  154. "Code point specified by `\\u{{...}}` escape is greater "
  155. "than 0x10FFFF.");
  156. emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
  157. return false;
  158. }
  159. if (code_point >= 0xD800 && code_point < 0xE000) {
  160. CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
  161. "Code point specified by `\\u{{...}}` escape is a "
  162. "surrogate character.");
  163. emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
  164. return false;
  165. }
  166. // Convert the code point to a sequence of UTF-8 code units.
  167. // Every code point fits in 6 UTF-8 code units.
  168. const llvm::UTF32 utf32_code_units[1] = {code_point};
  169. llvm::UTF8 utf8_code_units[6];
  170. const llvm::UTF32* src_pos = utf32_code_units;
  171. llvm::UTF8* dest_pos = utf8_code_units;
  172. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  173. &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
  174. if (conv_result != llvm::conversionOK) {
  175. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  176. }
  177. result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
  178. reinterpret_cast<char*>(dest_pos));
  179. return true;
  180. }
  181. // Expand an escape sequence, appending the expanded value to the given
  182. // `result` string. `content` is the string content, starting from the first
  183. // character after the escape sequence introducer (for example, the `n` in
  184. // `\n`), and will be updated to remove the leading escape sequence.
  185. static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
  186. llvm::StringRef& content,
  187. std::string& result) -> void {
  188. CARBON_CHECK(!content.empty()) << "should have escaped closing delimiter";
  189. char first = content.front();
  190. content = content.drop_front(1);
  191. switch (first) {
  192. case 't':
  193. result += '\t';
  194. return;
  195. case 'n':
  196. result += '\n';
  197. return;
  198. case 'r':
  199. result += '\r';
  200. return;
  201. case '"':
  202. result += '"';
  203. return;
  204. case '\'':
  205. result += '\'';
  206. return;
  207. case '\\':
  208. result += '\\';
  209. return;
  210. case '0':
  211. result += '\0';
  212. if (!content.empty() && IsDecimalDigit(content.front())) {
  213. CARBON_DIAGNOSTIC(
  214. DecimalEscapeSequence, Error,
  215. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
  216. "of `\\0` if the next character is a digit.");
  217. emitter.Emit(content.begin(), DecimalEscapeSequence);
  218. return;
  219. }
  220. return;
  221. case 'x':
  222. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  223. IsUpperHexDigit(content[1])) {
  224. result +=
  225. static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
  226. content = content.drop_front(2);
  227. return;
  228. }
  229. CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
  230. "Escape sequence `\\x` must be followed by two "
  231. "uppercase hexadecimal digits, for example `\\x0F`.");
  232. emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
  233. break;
  234. case 'u': {
  235. llvm::StringRef remaining = content;
  236. if (remaining.consume_front("{")) {
  237. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  238. remaining = remaining.drop_front(digits.size());
  239. if (!digits.empty() && remaining.consume_front("}")) {
  240. if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
  241. break;
  242. }
  243. content = remaining;
  244. return;
  245. }
  246. }
  247. CARBON_DIAGNOSTIC(
  248. UnicodeEscapeMissingBracedDigits, Error,
  249. "Escape sequence `\\u` must be followed by a braced sequence of "
  250. "uppercase hexadecimal digits, for example `\\u{{70AD}}`.");
  251. emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
  252. break;
  253. }
  254. default:
  255. CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
  256. "Unrecognized escape sequence `{0}`.", char);
  257. emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
  258. break;
  259. }
  260. // If we get here, we didn't recognize this escape sequence and have already
  261. // issued a diagnostic. For error recovery purposes, expand this escape
  262. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  263. result += first;
  264. }
  265. // Expand any escape sequences in the given string literal.
  266. static auto ExpandEscapeSequencesAndRemoveIndent(
  267. LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  268. llvm::StringRef indent) -> std::string {
  269. std::string result;
  270. result.reserve(contents.size());
  271. llvm::SmallString<16> escape("\\");
  272. escape.resize(1 + hash_level, '#');
  273. // Process each line of the string literal.
  274. while (true) {
  275. // Every non-empty line (that contains anything other than horizontal
  276. // whitespace) is required to start with the string's indent. For error
  277. // recovery, remove all leading whitespace if the indent doesn't match.
  278. if (!contents.consume_front(indent)) {
  279. const char* line_start = contents.begin();
  280. contents = contents.drop_while(IsHorizontalWhitespace);
  281. if (!contents.startswith("\n")) {
  282. CARBON_DIAGNOSTIC(
  283. MismatchedIndentInString, Error,
  284. "Indentation does not match that of the closing \"\"\" in "
  285. "multi-line string literal.");
  286. emitter.Emit(line_start, MismatchedIndentInString);
  287. }
  288. }
  289. // Process the contents of the line.
  290. while (true) {
  291. auto end_of_regular_text = contents.find_if([](char c) {
  292. return c == '\n' || c == '\\' ||
  293. (IsHorizontalWhitespace(c) && c != ' ');
  294. });
  295. result += contents.substr(0, end_of_regular_text);
  296. contents = contents.substr(end_of_regular_text);
  297. if (contents.empty()) {
  298. return result;
  299. }
  300. if (contents.consume_front("\n")) {
  301. // Trailing whitespace before a newline doesn't contribute to the string
  302. // literal value.
  303. while (!result.empty() && result.back() != '\n' &&
  304. IsSpace(result.back())) {
  305. result.pop_back();
  306. }
  307. result += '\n';
  308. // Move onto to the next line.
  309. break;
  310. }
  311. if (IsHorizontalWhitespace(contents.front())) {
  312. // Horizontal whitespace other than ` ` is valid only at the end of a
  313. // line.
  314. CARBON_CHECK(contents.front() != ' ')
  315. << "should not have stopped at a plain space";
  316. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  317. if (after_space == llvm::StringRef::npos ||
  318. contents[after_space] != '\n') {
  319. // TODO: Include the source range of the whitespace up to
  320. // `contents.begin() + after_space` in the diagnostic.
  321. CARBON_DIAGNOSTIC(
  322. InvalidHorizontalWhitespaceInString, Error,
  323. "Whitespace other than plain space must be expressed with an "
  324. "escape sequence in a string literal.");
  325. emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
  326. // Include the whitespace in the string contents for error recovery.
  327. result += contents.substr(0, after_space);
  328. }
  329. contents = contents.substr(after_space);
  330. continue;
  331. }
  332. if (!contents.consume_front(escape)) {
  333. // This is not an escape sequence, just a raw `\`.
  334. result += contents.front();
  335. contents = contents.drop_front(1);
  336. continue;
  337. }
  338. if (contents.consume_front("\n")) {
  339. // An escaped newline ends the line without producing any content and
  340. // without trimming trailing whitespace.
  341. break;
  342. }
  343. // Handle this escape sequence.
  344. ExpandAndConsumeEscapeSequence(emitter, contents, result);
  345. }
  346. }
  347. }
  348. auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
  349. -> std::string {
  350. if (!is_terminated_) {
  351. return "";
  352. }
  353. llvm::StringRef indent =
  354. multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
  355. return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,
  356. indent);
  357. }
  358. } // namespace Carbon