tokenized_buffer_test.cpp 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/tokenized_buffer.h"
  5. #include <gmock/gmock.h>
  6. #include <gtest/gtest.h>
  7. #include <forward_list>
  8. #include <iterator>
  9. #include "llvm/ADT/ArrayRef.h"
  10. #include "llvm/ADT/Sequence.h"
  11. #include "llvm/ADT/SmallString.h"
  12. #include "llvm/Support/SourceMgr.h"
  13. #include "llvm/Support/raw_ostream.h"
  14. #include "testing/util/test_raw_ostream.h"
  15. #include "toolchain/common/yaml_test_helpers.h"
  16. #include "toolchain/diagnostics/diagnostic_emitter.h"
  17. #include "toolchain/diagnostics/mocks.h"
  18. #include "toolchain/lexer/tokenized_buffer_test_helpers.h"
  19. namespace Carbon::Testing {
  20. namespace {
  21. using ::testing::_;
  22. using ::testing::ElementsAre;
  23. using ::testing::Eq;
  24. using ::testing::HasSubstr;
  25. using ::testing::StrEq;
  26. class LexerTest : public ::testing::Test {
  27. protected:
  28. auto GetSourceBuffer(llvm::StringRef text) -> SourceBuffer& {
  29. std::string filename = llvm::formatv("test{0}.carbon", ++file_index_);
  30. CARBON_CHECK(fs_.addFile(filename, /*ModificationTime=*/0,
  31. llvm::MemoryBuffer::getMemBuffer(text)));
  32. source_storage_.push_front(
  33. std::move(*SourceBuffer::CreateFromFile(fs_, filename)));
  34. return source_storage_.front();
  35. }
  36. auto Lex(llvm::StringRef text,
  37. DiagnosticConsumer& consumer = ConsoleDiagnosticConsumer())
  38. -> TokenizedBuffer {
  39. return TokenizedBuffer::Lex(GetSourceBuffer(text), consumer);
  40. }
  41. llvm::vfs::InMemoryFileSystem fs_;
  42. int file_index_ = 0;
  43. std::forward_list<SourceBuffer> source_storage_;
  44. };
  45. TEST_F(LexerTest, HandlesEmptyBuffer) {
  46. auto buffer = Lex("");
  47. EXPECT_FALSE(buffer.has_errors());
  48. EXPECT_THAT(buffer,
  49. HasTokens(llvm::ArrayRef<ExpectedToken>{{TokenKind::EndOfFile}}));
  50. }
  51. TEST_F(LexerTest, TracksLinesAndColumns) {
  52. auto buffer = Lex("\n ;;\n ;;;\n x\"foo\" '''baz\n a\n ''' y");
  53. EXPECT_FALSE(buffer.has_errors());
  54. EXPECT_THAT(
  55. buffer,
  56. HasTokens(llvm::ArrayRef<ExpectedToken>{
  57. {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
  58. {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
  59. {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
  60. {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
  61. {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
  62. {.kind = TokenKind::Identifier,
  63. .line = 4,
  64. .column = 4,
  65. .indent_column = 4,
  66. .text = "x"},
  67. {.kind = TokenKind::StringLiteral,
  68. .line = 4,
  69. .column = 5,
  70. .indent_column = 4},
  71. {.kind = TokenKind::StringLiteral,
  72. .line = 4,
  73. .column = 11,
  74. .indent_column = 4},
  75. {.kind = TokenKind::Identifier,
  76. .line = 6,
  77. .column = 6,
  78. .indent_column = 11,
  79. .text = "y"},
  80. {.kind = TokenKind::EndOfFile, .line = 6, .column = 7},
  81. }));
  82. }
  83. TEST_F(LexerTest, HandlesNumericLiteral) {
  84. auto buffer = Lex("12-578\n 1 2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
  85. EXPECT_FALSE(buffer.has_errors());
  86. ASSERT_THAT(buffer,
  87. HasTokens(llvm::ArrayRef<ExpectedToken>{
  88. {.kind = TokenKind::IntegerLiteral,
  89. .line = 1,
  90. .column = 1,
  91. .indent_column = 1,
  92. .text = "12"},
  93. {.kind = TokenKind::Minus,
  94. .line = 1,
  95. .column = 3,
  96. .indent_column = 1},
  97. {.kind = TokenKind::IntegerLiteral,
  98. .line = 1,
  99. .column = 4,
  100. .indent_column = 1,
  101. .text = "578"},
  102. {.kind = TokenKind::IntegerLiteral,
  103. .line = 2,
  104. .column = 3,
  105. .indent_column = 3,
  106. .text = "1"},
  107. {.kind = TokenKind::IntegerLiteral,
  108. .line = 2,
  109. .column = 6,
  110. .indent_column = 3,
  111. .text = "2"},
  112. {.kind = TokenKind::IntegerLiteral,
  113. .line = 3,
  114. .column = 1,
  115. .indent_column = 1,
  116. .text = "0x12_3ABC"},
  117. {.kind = TokenKind::IntegerLiteral,
  118. .line = 4,
  119. .column = 1,
  120. .indent_column = 1,
  121. .text = "0b10_10_11"},
  122. {.kind = TokenKind::IntegerLiteral,
  123. .line = 5,
  124. .column = 1,
  125. .indent_column = 1,
  126. .text = "1_234_567"},
  127. {.kind = TokenKind::RealLiteral,
  128. .line = 6,
  129. .column = 1,
  130. .indent_column = 1,
  131. .text = "1.5e9"},
  132. {.kind = TokenKind::EndOfFile, .line = 6, .column = 6},
  133. }));
  134. auto token_12 = buffer.tokens().begin();
  135. EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
  136. auto token_578 = buffer.tokens().begin() + 2;
  137. EXPECT_EQ(buffer.GetIntegerLiteral(*token_578), 578);
  138. auto token_1 = buffer.tokens().begin() + 3;
  139. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1), 1);
  140. auto token_2 = buffer.tokens().begin() + 4;
  141. EXPECT_EQ(buffer.GetIntegerLiteral(*token_2), 2);
  142. auto token_0x12_3abc = buffer.tokens().begin() + 5;
  143. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0x12_3abc), 0x12'3abc);
  144. auto token_0b10_10_11 = buffer.tokens().begin() + 6;
  145. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
  146. auto token_1_234_567 = buffer.tokens().begin() + 7;
  147. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
  148. auto token_1_5e9 = buffer.tokens().begin() + 8;
  149. auto value_1_5e9 = buffer.GetRealLiteral(*token_1_5e9);
  150. EXPECT_EQ(value_1_5e9.Mantissa().getZExtValue(), 15);
  151. EXPECT_EQ(value_1_5e9.Exponent().getSExtValue(), 8);
  152. EXPECT_EQ(value_1_5e9.IsDecimal(), true);
  153. }
// Invalid numeric literals should become error tokens spanning the whole
// would-be literal, while valid neighbors still lex normally.
TEST_F(LexerTest, HandlesInvalidNumericLiterals) {
  auto buffer = Lex("14x 15_49 0x3.5q 0x3_4.5_6 0ops");
  EXPECT_TRUE(buffer.has_errors());
  ASSERT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  // Trailing alphabetic garbage makes the literal an error.
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 1,
                   .indent_column = 1,
                   .text = "14x"},
                  // Digit separators alone are fine.
                  {.kind = TokenKind::IntegerLiteral,
                   .line = 1,
                   .column = 5,
                   .indent_column = 1,
                   .text = "15_49"},
                  // Hex real with a bad suffix character.
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 11,
                   .indent_column = 1,
                   .text = "0x3.5q"},
                  // Hex real with separators is accepted.
                  {.kind = TokenKind::RealLiteral,
                   .line = 1,
                   .column = 18,
                   .indent_column = 1,
                   .text = "0x3_4.5_6"},
                  // Unknown base prefix.
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 28,
                   .indent_column = 1,
                   .text = "0ops"},
                  {.kind = TokenKind::EndOfFile, .line = 1, .column = 32},
              }));
}
// Exercises the boundary decisions for numeric literals: where a `.`, `e`,
// `+`, or `-` is part of the literal versus where it terminates it and starts
// a new token.
TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
  llvm::StringLiteral source_text = R"(
1.
.2
3.+foo
4.0-bar
5.0e+123+456
6.0e+1e+2
1e7
8..10
9.0.9.5
10.foo
11.0.foo
12e+1
13._
)";
  auto buffer = Lex(source_text);
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          // `1.` with no following digit is integer + period.
                          {.kind = TokenKind::IntegerLiteral, .text = "1"},
                          {.kind = TokenKind::Period},
                          // newline
                          // A leading `.` is never part of a literal.
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntegerLiteral, .text = "2"},
                          // newline
                          {.kind = TokenKind::IntegerLiteral, .text = "3"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::Identifier, .text = "foo"},
                          // newline
                          // `-` after a real literal is a separate operator.
                          {.kind = TokenKind::RealLiteral, .text = "4.0"},
                          {.kind = TokenKind::Minus},
                          {.kind = TokenKind::Identifier, .text = "bar"},
                          // newline
                          // Only the first `+` binds to the exponent.
                          {.kind = TokenKind::RealLiteral, .text = "5.0e+123"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntegerLiteral, .text = "456"},
                          // newline
                          // A second exponent marker makes the literal an
                          // error token.
                          {.kind = TokenKind::Error, .text = "6.0e+1e"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntegerLiteral, .text = "2"},
                          // newline
                          {.kind = TokenKind::Error, .text = "1e7"},
                          // newline
                          // `8..10` splits into integer, two periods,
                          // integer.
                          {.kind = TokenKind::IntegerLiteral, .text = "8"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntegerLiteral, .text = "10"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "9.0"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::RealLiteral, .text = "9.5"},
                          // newline
                          // A non-digit after the `.` is an error for the
                          // whole spelling.
                          {.kind = TokenKind::Error, .text = "10.foo"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "11.0"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Identifier, .text = "foo"},
                          // newline
                          {.kind = TokenKind::Error, .text = "12e"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntegerLiteral, .text = "1"},
                          // newline
                          {.kind = TokenKind::IntegerLiteral, .text = "13"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Underscore},
                          // newline
                          {.kind = TokenKind::EndOfFile},
                      }));
}
// Characters with no token meaning (including an embedded NUL and non-ASCII
// bytes) should be grouped into error tokens without derailing neighbors.
TEST_F(LexerTest, HandlesGarbageCharacters) {
  // The text embeds a NUL byte, so its length must be taken from the array
  // size (minus the implicit terminating NUL), not from strlen.
  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\\\"\\\n\"x";
  auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::Error,
           .line = 1,
           .column = 1,
           // 💩 takes 4 bytes, and we count column as bytes offset.
           .text = llvm::StringRef("$$💩", 6)},
          {.kind = TokenKind::Minus, .line = 1, .column = 7},
          {.kind = TokenKind::Error, .line = 1, .column = 8, .text = "$"},
          // newline
          // The NUL byte is folded into the surrounding error token.
          {.kind = TokenKind::Error,
           .line = 2,
           .column = 1,
           .text = llvm::StringRef("$\0$", 3)},
          {.kind = TokenKind::IntegerLiteral,
           .line = 2,
           .column = 4,
           .text = "12"},
          {.kind = TokenKind::Error, .line = 2, .column = 6, .text = "$"},
          // newline
          // A lone backslash is its own token; the unterminated string that
          // follows becomes an error.
          {.kind = TokenKind::Backslash, .line = 3, .column = 1, .text = "\\"},
          {.kind = TokenKind::Error, .line = 3, .column = 2, .text = "\"\\"},
          // newline
          {.kind = TokenKind::Error, .line = 4, .column = 1, .text = "\"x"},
          {.kind = TokenKind::EndOfFile, .line = 4, .column = 3},
      }));
}
TEST_F(LexerTest, Symbols) {
  // We don't need to exhaustively test symbols here as they're handled with
  // common code, but we want to check specific patterns to verify things like
  // max-munch rule and handling of interesting symbols.

  // Max-munch: `<<<` is `<<` then `<`, not three `<`s.
  auto buffer = Lex("<<<");
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::LessLess},
                          {TokenKind::Less},
                          {TokenKind::EndOfFile},
                      }));

  // `<<=` wins over `<<` + `=`; `>>` stays together.
  buffer = Lex("<<=>>");
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::LessLessEqual},
                          {TokenKind::GreaterGreater},
                          {TokenKind::EndOfFile},
                      }));

  // Whitespace separates what would otherwise merge; `<=>` is one token.
  buffer = Lex("< <=> >");
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::Less},
                          {TokenKind::LessEqualGreater},
                          {TokenKind::Greater},
                          {TokenKind::EndOfFile},
                      }));

  // A sampler of single-character symbol tokens.
  buffer = Lex("\\/?@&^!");
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::Backslash},
                          {TokenKind::Slash},
                          {TokenKind::Question},
                          {TokenKind::At},
                          {TokenKind::Amp},
                          {TokenKind::Caret},
                          {TokenKind::Exclaim},
                          {TokenKind::EndOfFile},
                      }));
}
  328. TEST_F(LexerTest, Parens) {
  329. auto buffer = Lex("()");
  330. EXPECT_FALSE(buffer.has_errors());
  331. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  332. {TokenKind::OpenParen},
  333. {TokenKind::CloseParen},
  334. {TokenKind::EndOfFile},
  335. }));
  336. buffer = Lex("((()()))");
  337. EXPECT_FALSE(buffer.has_errors());
  338. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  339. {TokenKind::OpenParen},
  340. {TokenKind::OpenParen},
  341. {TokenKind::OpenParen},
  342. {TokenKind::CloseParen},
  343. {TokenKind::OpenParen},
  344. {TokenKind::CloseParen},
  345. {TokenKind::CloseParen},
  346. {TokenKind::CloseParen},
  347. {TokenKind::EndOfFile},
  348. }));
  349. }
  350. TEST_F(LexerTest, CurlyBraces) {
  351. auto buffer = Lex("{}");
  352. EXPECT_FALSE(buffer.has_errors());
  353. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  354. {TokenKind::OpenCurlyBrace},
  355. {TokenKind::CloseCurlyBrace},
  356. {TokenKind::EndOfFile},
  357. }));
  358. buffer = Lex("{{{}{}}}");
  359. EXPECT_FALSE(buffer.has_errors());
  360. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  361. {TokenKind::OpenCurlyBrace},
  362. {TokenKind::OpenCurlyBrace},
  363. {TokenKind::OpenCurlyBrace},
  364. {TokenKind::CloseCurlyBrace},
  365. {TokenKind::OpenCurlyBrace},
  366. {TokenKind::CloseCurlyBrace},
  367. {TokenKind::CloseCurlyBrace},
  368. {TokenKind::CloseCurlyBrace},
  369. {TokenKind::EndOfFile},
  370. }));
  371. }
// Verifies that every opening bracket token can be mapped to its matching
// closing token and vice versa, at several nesting depths.
TEST_F(LexerTest, MatchingGroups) {
  {
    // Two sibling groups: `()` then `{}`.
    TokenizedBuffer buffer = Lex("(){}");
    ASSERT_FALSE(buffer.has_errors());
    auto it = buffer.tokens().begin();
    auto open_paren_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    auto open_curly_token = *it++;
    auto close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    auto eof_token = *it++;
    EXPECT_EQ(buffer.GetKind(eof_token), TokenKind::EndOfFile);
    EXPECT_EQ(buffer.tokens().end(), it);
  }
  {
    // Mixed nesting of parens and curlies with identifiers inside.
    TokenizedBuffer buffer = Lex("({x}){(y)} {{((z))}}");
    ASSERT_FALSE(buffer.has_errors());
    auto it = buffer.tokens().begin();

    // `({x})`: curly nested in paren.
    auto open_paren_token = *it++;
    auto open_curly_token = *it++;
    ASSERT_EQ("x", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    auto close_curly_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));

    // `{(y)}`: paren nested in curly.
    open_curly_token = *it++;
    open_paren_token = *it++;
    ASSERT_EQ("y", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    close_paren_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));

    // `{{((z))}}`: doubled nesting of both bracket kinds.
    open_curly_token = *it++;
    auto inner_open_curly_token = *it++;
    open_paren_token = *it++;
    auto inner_open_paren_token = *it++;
    ASSERT_EQ("z", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    auto inner_close_paren_token = *it++;
    close_paren_token = *it++;
    auto inner_close_curly_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(inner_close_curly_token,
              buffer.GetMatchedClosingToken(inner_open_curly_token));
    EXPECT_EQ(inner_open_curly_token,
              buffer.GetMatchedOpeningToken(inner_close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(inner_close_paren_token,
              buffer.GetMatchedClosingToken(inner_open_paren_token));
    EXPECT_EQ(inner_open_paren_token,
              buffer.GetMatchedOpeningToken(inner_close_paren_token));
    auto eof_token = *it++;
    EXPECT_EQ(buffer.GetKind(eof_token), TokenKind::EndOfFile);
    EXPECT_EQ(buffer.tokens().end(), it);
  }
}
// Unbalanced brackets should produce recovery tokens (for missing closers)
// or error tokens (for unmatched closers) while still yielding a usable
// token stream.
TEST_F(LexerTest, MismatchedGroups) {
  // An unclosed `{` gets a synthesized closing brace marked as recovery.
  auto buffer = Lex("{");
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {TokenKind::OpenCurlyBrace},
                  {.kind = TokenKind::CloseCurlyBrace, .recovery = true},
                  {TokenKind::EndOfFile},
              }));

  // A closer with no opener becomes an error token.
  buffer = Lex("}");
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::Error, .text = "}"},
                          {TokenKind::EndOfFile},
                      }));

  // `{(}`: the `(` is closed by a recovery `)` inserted before the `}`.
  buffer = Lex("{(}");
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::OpenCurlyBrace, .column = 1},
          {.kind = TokenKind::OpenParen, .column = 2},
          {.kind = TokenKind::CloseParen, .column = 3, .recovery = true},
          {.kind = TokenKind::CloseCurlyBrace, .column = 3},
          {TokenKind::EndOfFile},
      }));

  // `)({)`: leading unmatched `)` errors; the `{` gets a recovery `}`.
  buffer = Lex(")({)");
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::Error, .column = 1, .text = ")"},
          {.kind = TokenKind::OpenParen, .column = 2},
          {.kind = TokenKind::OpenCurlyBrace, .column = 3},
          {.kind = TokenKind::CloseCurlyBrace, .column = 4, .recovery = true},
          {.kind = TokenKind::CloseParen, .column = 4},
          {TokenKind::EndOfFile},
      }));
}
// Checks leading/trailing whitespace tracking, including around recovery
// tokens. `space` holds one entry per token *boundary* (N tokens -> N + 1
// boundaries), since token i's trailing whitespace is token i+1's leading
// whitespace.
TEST_F(LexerTest, Whitespace) {
  auto buffer = Lex("{( } {(");

  // Whether there should be whitespace before/after each token.
  bool space[] = {true,
                  // {
                  false,
                  // (
                  true,
                  // inserted )
                  true,
                  // }
                  true,
                  // {
                  false,
                  // (
                  true,
                  // inserted )
                  true,
                  // inserted }
                  true,
                  // EOF
                  false};
  int pos = 0;
  // Each iteration advances `pos` by one: the trailing check for this token
  // reuses the slot that the next token's leading check will read.
  for (TokenizedBuffer::Token token : buffer.tokens()) {
    ASSERT_LT(pos, std::size(space));
    EXPECT_THAT(buffer.HasLeadingWhitespace(token), Eq(space[pos]));
    ++pos;
    ASSERT_LT(pos, std::size(space));
    EXPECT_THAT(buffer.HasTrailingWhitespace(token), Eq(space[pos]));
  }
  // All boundaries must have been consumed: pos ends at N for N+1 entries.
  ASSERT_EQ(pos + 1, std::size(space));
}
  524. TEST_F(LexerTest, Keywords) {
  525. auto buffer = Lex(" fn");
  526. EXPECT_FALSE(buffer.has_errors());
  527. EXPECT_THAT(buffer,
  528. HasTokens(llvm::ArrayRef<ExpectedToken>{
  529. {.kind = TokenKind::Fn, .column = 4, .indent_column = 4},
  530. {TokenKind::EndOfFile},
  531. }));
  532. buffer = Lex("and or not if else for return var break continue _");
  533. EXPECT_FALSE(buffer.has_errors());
  534. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  535. {TokenKind::And},
  536. {TokenKind::Or},
  537. {TokenKind::Not},
  538. {TokenKind::If},
  539. {TokenKind::Else},
  540. {TokenKind::For},
  541. {TokenKind::Return},
  542. {TokenKind::Var},
  543. {TokenKind::Break},
  544. {TokenKind::Continue},
  545. {TokenKind::Underscore},
  546. {TokenKind::EndOfFile},
  547. }));
  548. }
  549. TEST_F(LexerTest, Comments) {
  550. auto buffer = Lex(" ;\n // foo\n ;\n");
  551. EXPECT_FALSE(buffer.has_errors());
  552. EXPECT_THAT(
  553. buffer,
  554. HasTokens(llvm::ArrayRef<ExpectedToken>{
  555. {.kind = TokenKind::Semi, .line = 1, .column = 2, .indent_column = 2},
  556. {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 3},
  557. {.kind = TokenKind::EndOfFile, .line = 3, .column = 4},
  558. }));
  559. buffer = Lex("// foo\n//\n// bar");
  560. EXPECT_FALSE(buffer.has_errors());
  561. EXPECT_THAT(buffer,
  562. HasTokens(llvm::ArrayRef<ExpectedToken>{{TokenKind::EndOfFile}}));
  563. // Make sure weird characters aren't a problem.
  564. buffer = Lex(" // foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  565. EXPECT_FALSE(buffer.has_errors());
  566. EXPECT_THAT(buffer,
  567. HasTokens(llvm::ArrayRef<ExpectedToken>{{TokenKind::EndOfFile}}));
  568. // Make sure we can lex a comment at the end of the input.
  569. buffer = Lex("//");
  570. EXPECT_FALSE(buffer.has_errors());
  571. EXPECT_THAT(buffer,
  572. HasTokens(llvm::ArrayRef<ExpectedToken>{{TokenKind::EndOfFile}}));
  573. }
  574. TEST_F(LexerTest, InvalidComments) {
  575. llvm::StringLiteral testcases[] = {
  576. " /// foo\n",
  577. "foo // bar\n",
  578. "//! hello",
  579. " //world",
  580. };
  581. for (llvm::StringLiteral testcase : testcases) {
  582. auto buffer = Lex(testcase);
  583. EXPECT_TRUE(buffer.has_errors());
  584. }
  585. }
  586. TEST_F(LexerTest, Identifiers) {
  587. auto buffer = Lex(" foobar");
  588. EXPECT_FALSE(buffer.has_errors());
  589. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  590. {.kind = TokenKind::Identifier,
  591. .column = 4,
  592. .indent_column = 4,
  593. .text = "foobar"},
  594. {TokenKind::EndOfFile},
  595. }));
  596. // Check different kinds of identifier character sequences.
  597. buffer = Lex("_foo_bar");
  598. EXPECT_FALSE(buffer.has_errors());
  599. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  600. {.kind = TokenKind::Identifier, .text = "_foo_bar"},
  601. {TokenKind::EndOfFile},
  602. }));
  603. buffer = Lex("foo2bar00");
  604. EXPECT_FALSE(buffer.has_errors());
  605. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  606. {.kind = TokenKind::Identifier, .text = "foo2bar00"},
  607. {TokenKind::EndOfFile},
  608. }));
  609. // Check that we can parse identifiers that start with a keyword.
  610. buffer = Lex("fnord");
  611. EXPECT_FALSE(buffer.has_errors());
  612. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  613. {.kind = TokenKind::Identifier, .text = "fnord"},
  614. {TokenKind::EndOfFile},
  615. }));
  616. // Check multiple identifiers with indent and interning.
  617. buffer = Lex(" foo;bar\nbar \n foo\tfoo");
  618. EXPECT_FALSE(buffer.has_errors());
  619. EXPECT_THAT(buffer,
  620. HasTokens(llvm::ArrayRef<ExpectedToken>{
  621. {.kind = TokenKind::Identifier,
  622. .line = 1,
  623. .column = 4,
  624. .indent_column = 4,
  625. .text = "foo"},
  626. {.kind = TokenKind::Semi},
  627. {.kind = TokenKind::Identifier,
  628. .line = 1,
  629. .column = 8,
  630. .indent_column = 4,
  631. .text = "bar"},
  632. {.kind = TokenKind::Identifier,
  633. .line = 2,
  634. .column = 1,
  635. .indent_column = 1,
  636. .text = "bar"},
  637. {.kind = TokenKind::Identifier,
  638. .line = 3,
  639. .column = 3,
  640. .indent_column = 3,
  641. .text = "foo"},
  642. {.kind = TokenKind::Identifier,
  643. .line = 3,
  644. .column = 7,
  645. .indent_column = 3,
  646. .text = "foo"},
  647. {.kind = TokenKind::EndOfFile, .line = 3, .column = 10},
  648. }));
  649. }
// Valid string literal forms: simple, block (`'''`), raw (`#"..."#`), and
// adjacent literals, with escape processing checked via `string_contents`.
TEST_F(LexerTest, StringLiterals) {
  // NOTE(review): the expected line/column values below imply this raw
  // string originally contained blank separator lines and leading
  // indentation that appear to have been lost in this copy — verify against
  // the upstream source before relying on this test passing.
  llvm::StringLiteral testcase = R"(
"hello world\n"
'''foo
test \
\xAB
''' trailing
#"""#
"\0"
#"\0"foo"\1"#
"""x"""
)";
  auto buffer = Lex(testcase);
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  // Simple literal with an escape.
                  {.kind = TokenKind::StringLiteral,
                   .line = 2,
                   .column = 5,
                   .indent_column = 5,
                   .string_contents = {"hello world\n"}},
                  // Block literal: indentation is stripped, `\` joins lines,
                  // and `\xAB` produces the raw byte.
                  {.kind = TokenKind::StringLiteral,
                   .line = 4,
                   .column = 5,
                   .indent_column = 5,
                   .string_contents = {" test \xAB\n"}},
                  // Identifier following the block terminator.
                  {.kind = TokenKind::Identifier,
                   .line = 7,
                   .column = 10,
                   .indent_column = 5,
                   .text = "trailing"},
                  // Raw literal whose contents are a single double quote.
                  {.kind = TokenKind::StringLiteral,
                   .line = 9,
                   .column = 7,
                   .indent_column = 7,
                   .string_contents = {"\""}},
                  // `\0` escape yields an embedded NUL byte.
                  {.kind = TokenKind::StringLiteral,
                   .line = 11,
                   .column = 5,
                   .indent_column = 5,
                   .string_contents = llvm::StringLiteral::withInnerNUL("\0")},
                  // In raw literals, backslash escapes are left verbatim.
                  {.kind = TokenKind::StringLiteral,
                   .line = 13,
                   .column = 5,
                   .indent_column = 5,
                   .string_contents = {"\\0\"foo\"\\1"}},
                  // """x""" is three string literals, not one invalid
                  // attempt at a block string literal.
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 5,
                   .indent_column = 5,
                   .string_contents = {""}},
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 7,
                   .indent_column = 5,
                   .string_contents = {"x"}},
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 10,
                   .indent_column = 5,
                   .string_contents = {""}},
                  {.kind = TokenKind::EndOfFile, .line = 16, .column = 3},
              }));
}
  716. TEST_F(LexerTest, InvalidStringLiterals) {
  717. llvm::StringLiteral invalid[] = {
  718. // clang-format off
  719. R"(")",
  720. R"('''
  721. '')",
  722. R"("\)",
  723. R"("\")",
  724. R"("\\)",
  725. R"("\\\")",
  726. R"(''')",
  727. R"('''
  728. )",
  729. R"('''\)",
  730. R"(#'''
  731. ''')",
  732. // clang-format on
  733. };
  734. for (llvm::StringLiteral test : invalid) {
  735. SCOPED_TRACE(test);
  736. auto buffer = Lex(test);
  737. EXPECT_TRUE(buffer.has_errors());
  738. // We should have formed at least one error token.
  739. bool found_error = false;
  740. for (TokenizedBuffer::Token token : buffer.tokens()) {
  741. if (buffer.GetKind(token) == TokenKind::Error) {
  742. found_error = true;
  743. break;
  744. }
  745. }
  746. EXPECT_TRUE(found_error);
  747. }
  748. }
  749. TEST_F(LexerTest, TypeLiterals) {
  750. llvm::StringLiteral testcase = R"(
  751. i0 i1 i20 i999999999999 i0x1
  752. u0 u1 u64 u64b
  753. f32 f80 f1 fi
  754. s1
  755. )";
  756. auto buffer = Lex(testcase);
  757. EXPECT_FALSE(buffer.has_errors());
  758. ASSERT_THAT(buffer,
  759. HasTokens(llvm::ArrayRef<ExpectedToken>{
  760. {.kind = TokenKind::Identifier,
  761. .line = 2,
  762. .column = 5,
  763. .indent_column = 5,
  764. .text = {"i0"}},
  765. {.kind = TokenKind::IntegerTypeLiteral,
  766. .line = 2,
  767. .column = 8,
  768. .indent_column = 5,
  769. .text = {"i1"}},
  770. {.kind = TokenKind::IntegerTypeLiteral,
  771. .line = 2,
  772. .column = 11,
  773. .indent_column = 5,
  774. .text = {"i20"}},
  775. {.kind = TokenKind::IntegerTypeLiteral,
  776. .line = 2,
  777. .column = 15,
  778. .indent_column = 5,
  779. .text = {"i999999999999"}},
  780. {.kind = TokenKind::Identifier,
  781. .line = 2,
  782. .column = 29,
  783. .indent_column = 5,
  784. .text = {"i0x1"}},
  785. {.kind = TokenKind::Identifier,
  786. .line = 3,
  787. .column = 5,
  788. .indent_column = 5,
  789. .text = {"u0"}},
  790. {.kind = TokenKind::UnsignedIntegerTypeLiteral,
  791. .line = 3,
  792. .column = 8,
  793. .indent_column = 5,
  794. .text = {"u1"}},
  795. {.kind = TokenKind::UnsignedIntegerTypeLiteral,
  796. .line = 3,
  797. .column = 11,
  798. .indent_column = 5,
  799. .text = {"u64"}},
  800. {.kind = TokenKind::Identifier,
  801. .line = 3,
  802. .column = 15,
  803. .indent_column = 5,
  804. .text = {"u64b"}},
  805. {.kind = TokenKind::FloatingPointTypeLiteral,
  806. .line = 4,
  807. .column = 5,
  808. .indent_column = 5,
  809. .text = {"f32"}},
  810. {.kind = TokenKind::FloatingPointTypeLiteral,
  811. .line = 4,
  812. .column = 9,
  813. .indent_column = 5,
  814. .text = {"f80"}},
  815. {.kind = TokenKind::FloatingPointTypeLiteral,
  816. .line = 4,
  817. .column = 13,
  818. .indent_column = 5,
  819. .text = {"f1"}},
  820. {.kind = TokenKind::Identifier,
  821. .line = 4,
  822. .column = 16,
  823. .indent_column = 5,
  824. .text = {"fi"}},
  825. {.kind = TokenKind::Identifier,
  826. .line = 5,
  827. .column = 5,
  828. .indent_column = 5,
  829. .text = {"s1"}},
  830. {.kind = TokenKind::EndOfFile, .line = 6, .column = 3},
  831. }));
  832. auto token_i1 = buffer.tokens().begin() + 1;
  833. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_i1), 1);
  834. auto token_i20 = buffer.tokens().begin() + 2;
  835. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_i20), 20);
  836. auto token_i999999999999 = buffer.tokens().begin() + 3;
  837. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_i999999999999), 999999999999ULL);
  838. auto token_u1 = buffer.tokens().begin() + 6;
  839. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_u1), 1);
  840. auto token_u64 = buffer.tokens().begin() + 7;
  841. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_u64), 64);
  842. auto token_f32 = buffer.tokens().begin() + 9;
  843. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f32), 32);
  844. auto token_f80 = buffer.tokens().begin() + 10;
  845. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f80), 80);
  846. auto token_f1 = buffer.tokens().begin() + 11;
  847. EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f1), 1);
  848. }
  849. TEST_F(LexerTest, TypeLiteralTooManyDigits) {
  850. std::string code = "i";
  851. constexpr int Count = 10000;
  852. code.append(Count, '9');
  853. Testing::MockDiagnosticConsumer consumer;
  854. EXPECT_CALL(consumer,
  855. HandleDiagnostic(IsDiagnostic(
  856. DiagnosticKind::TooManyDigits, DiagnosticLevel::Error, 1, 2,
  857. HasSubstr(llvm::formatv(" {0} ", Count)))));
  858. auto buffer = Lex(code, consumer);
  859. EXPECT_TRUE(buffer.has_errors());
  860. ASSERT_THAT(
  861. buffer,
  862. HasTokens(llvm::ArrayRef<ExpectedToken>{
  863. {.kind = TokenKind::Error,
  864. .line = 1,
  865. .column = 1,
  866. .indent_column = 1,
  867. .text = {code}},
  868. {.kind = TokenKind::EndOfFile, .line = 1, .column = Count + 2},
  869. }));
  870. }
  871. TEST_F(LexerTest, DiagnosticTrailingComment) {
  872. llvm::StringLiteral testcase = R"(
  873. // Hello!
  874. var String x; // trailing comment
  875. )";
  876. Testing::MockDiagnosticConsumer consumer;
  877. EXPECT_CALL(consumer,
  878. HandleDiagnostic(IsDiagnostic(DiagnosticKind::TrailingComment,
  879. DiagnosticLevel::Error, 3, 19, _)));
  880. Lex(testcase, consumer);
  881. }
  882. TEST_F(LexerTest, DiagnosticWhitespace) {
  883. Testing::MockDiagnosticConsumer consumer;
  884. EXPECT_CALL(consumer, HandleDiagnostic(IsDiagnostic(
  885. DiagnosticKind::NoWhitespaceAfterCommentIntroducer,
  886. DiagnosticLevel::Error, 1, 3, _)));
  887. Lex("//no space after comment", consumer);
  888. }
  889. TEST_F(LexerTest, DiagnosticUnrecognizedEscape) {
  890. Testing::MockDiagnosticConsumer consumer;
  891. EXPECT_CALL(consumer, HandleDiagnostic(IsDiagnostic(
  892. DiagnosticKind::UnknownEscapeSequence,
  893. DiagnosticLevel::Error, 1, 8, HasSubstr("`b`"))));
  894. Lex(R"("hello\bworld")", consumer);
  895. }
  896. TEST_F(LexerTest, DiagnosticBadHex) {
  897. Testing::MockDiagnosticConsumer consumer;
  898. EXPECT_CALL(consumer, HandleDiagnostic(IsDiagnostic(
  899. DiagnosticKind::HexadecimalEscapeMissingDigits,
  900. DiagnosticLevel::Error, 1, 9, _)));
  901. Lex(R"("hello\xabworld")", consumer);
  902. }
  903. TEST_F(LexerTest, DiagnosticInvalidDigit) {
  904. Testing::MockDiagnosticConsumer consumer;
  905. EXPECT_CALL(consumer, HandleDiagnostic(IsDiagnostic(
  906. DiagnosticKind::InvalidDigit,
  907. DiagnosticLevel::Error, 1, 6, HasSubstr("'a'"))));
  908. Lex("0x123abc", consumer);
  909. }
  910. TEST_F(LexerTest, DiagnosticMissingTerminator) {
  911. Testing::MockDiagnosticConsumer consumer;
  912. EXPECT_CALL(consumer,
  913. HandleDiagnostic(IsDiagnostic(DiagnosticKind::UnterminatedString,
  914. DiagnosticLevel::Error, 1, 1, _)));
  915. Lex(R"(#" ")", consumer);
  916. }
  917. TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
  918. Testing::MockDiagnosticConsumer consumer;
  919. EXPECT_CALL(consumer, HandleDiagnostic(
  920. IsDiagnostic(DiagnosticKind::UnrecognizedCharacters,
  921. DiagnosticLevel::Error, 1, 1, _)));
  922. Lex("\b", consumer);
  923. }
TEST_F(LexerTest, PrintingAsYaml) {
  // Test that we can parse this into YAML and verify line and indent data.
  // The input exercises location tracking: a leading newline, an indented
  // `;`, two `;` tokens on one line, and a long run of trailing blank lines.
  auto buffer = Lex("\n ;\n\n\n; ;\n\n\n\n\n\n\n\n\n\n\n");
  ASSERT_FALSE(buffer.has_errors());
  TestRawOstream print_stream;
  buffer.Print(print_stream);
  // Each token prints as one YAML mapping carrying its index, kind, location
  // (line/column/indent), spelling, and trailing-space flag; the sequence
  // ends with the EndOfFile token, which here lands on line 15 with an empty
  // spelling and no trailing-space entry.
  EXPECT_THAT(Yaml::Value::FromText(print_stream.TakeStr()),
              ElementsAre(Yaml::SequenceValue{
                  Yaml::MappingValue{{"index", "0"},
                                     {"kind", "Semi"},
                                     {"line", "2"},
                                     {"column", "2"},
                                     {"indent", "2"},
                                     {"spelling", ";"},
                                     {"has_trailing_space", "true"}},
                  Yaml::MappingValue{{"index", "1"},
                                     {"kind", "Semi"},
                                     {"line", "5"},
                                     {"column", "1"},
                                     {"indent", "1"},
                                     {"spelling", ";"},
                                     {"has_trailing_space", "true"}},
                  Yaml::MappingValue{{"index", "2"},
                                     {"kind", "Semi"},
                                     {"line", "5"},
                                     {"column", "3"},
                                     {"indent", "1"},
                                     {"spelling", ";"},
                                     {"has_trailing_space", "true"}},
                  Yaml::MappingValue{{"index", "3"},
                                     {"kind", "EndOfFile"},
                                     {"line", "15"},
                                     {"column", "1"},
                                     {"indent", "1"},
                                     {"spelling", ""}}}));
}
  960. } // namespace
  961. } // namespace Carbon::Testing