// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "lexer/tokenized_buffer.h"

#include <forward_list>
#include <iterator>

#include "diagnostics/diagnostic_emitter.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "lexer/tokenized_buffer_test_helpers.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
  18. namespace Carbon {
  19. namespace {
  20. using ::Carbon::Testing::ExpectedToken;
  21. using ::Carbon::Testing::HasTokens;
  22. using ::Carbon::Testing::IsKeyValueScalars;
  23. using ::testing::Eq;
  24. using ::testing::NotNull;
  25. using ::testing::StrEq;
  26. struct LexerTest : ::testing::Test {
  27. llvm::SmallVector<SourceBuffer, 16> source_storage;
  28. auto GetSourceBuffer(llvm::Twine text) -> SourceBuffer& {
  29. source_storage.push_back(SourceBuffer::CreateFromText(text.str()));
  30. return source_storage.back();
  31. }
  32. auto Lex(llvm::Twine text) -> TokenizedBuffer {
  33. // TODO: build a full mock for this.
  34. return TokenizedBuffer::Lex(GetSourceBuffer(text), NullDiagnosticEmitter());
  35. }
  36. };
  37. TEST_F(LexerTest, HandlesEmptyBuffer) {
  38. auto buffer = Lex("");
  39. EXPECT_FALSE(buffer.HasErrors());
  40. EXPECT_EQ(buffer.Tokens().begin(), buffer.Tokens().end());
  41. }
  42. TEST_F(LexerTest, TracksLinesAndColumns) {
  43. auto buffer = Lex("\n ;;\n ;;;\n");
  44. EXPECT_FALSE(buffer.HasErrors());
  45. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  46. {.kind = TokenKind::Semi(),
  47. .line = 2,
  48. .column = 3,
  49. .indent_column = 3},
  50. {.kind = TokenKind::Semi(),
  51. .line = 2,
  52. .column = 4,
  53. .indent_column = 3},
  54. {.kind = TokenKind::Semi(),
  55. .line = 3,
  56. .column = 4,
  57. .indent_column = 4},
  58. {.kind = TokenKind::Semi(),
  59. .line = 3,
  60. .column = 5,
  61. .indent_column = 4},
  62. {.kind = TokenKind::Semi(),
  63. .line = 3,
  64. .column = 6,
  65. .indent_column = 4},
  66. }));
  67. }
  68. TEST_F(LexerTest, HandlesIntegerLiteral) {
  69. auto buffer = Lex("12-578\n 1 2\n0x12_3ABC\n0b10_10_11\n1_234_567");
  70. EXPECT_FALSE(buffer.HasErrors());
  71. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  72. {.kind = TokenKind::IntegerLiteral(),
  73. .line = 1,
  74. .column = 1,
  75. .indent_column = 1,
  76. .text = "12"},
  77. {.kind = TokenKind::Minus(),
  78. .line = 1,
  79. .column = 3,
  80. .indent_column = 1},
  81. {.kind = TokenKind::IntegerLiteral(),
  82. .line = 1,
  83. .column = 4,
  84. .indent_column = 1,
  85. .text = "578"},
  86. {.kind = TokenKind::IntegerLiteral(),
  87. .line = 2,
  88. .column = 3,
  89. .indent_column = 3,
  90. .text = "1"},
  91. {.kind = TokenKind::IntegerLiteral(),
  92. .line = 2,
  93. .column = 6,
  94. .indent_column = 3,
  95. .text = "2"},
  96. {.kind = TokenKind::IntegerLiteral(),
  97. .line = 3,
  98. .column = 1,
  99. .indent_column = 1,
  100. .text = "0x12_3ABC"},
  101. {.kind = TokenKind::IntegerLiteral(),
  102. .line = 4,
  103. .column = 1,
  104. .indent_column = 1,
  105. .text = "0b10_10_11"},
  106. {.kind = TokenKind::IntegerLiteral(),
  107. .line = 5,
  108. .column = 1,
  109. .indent_column = 1,
  110. .text = "1_234_567"},
  111. }));
  112. auto token_12 = buffer.Tokens().begin();
  113. EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
  114. auto token_578 = buffer.Tokens().begin() + 2;
  115. EXPECT_EQ(buffer.GetIntegerLiteral(*token_578), 578);
  116. auto token_1 = buffer.Tokens().begin() + 3;
  117. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1), 1);
  118. auto token_2 = buffer.Tokens().begin() + 4;
  119. EXPECT_EQ(buffer.GetIntegerLiteral(*token_2), 2);
  120. auto token_0x12_3abc = buffer.Tokens().begin() + 5;
  121. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0x12_3abc), 0x12'3abc);
  122. auto token_0b10_10_11 = buffer.Tokens().begin() + 6;
  123. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
  124. auto token_1_234_567 = buffer.Tokens().begin() + 7;
  125. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
  126. }
  127. TEST_F(LexerTest, ValidatesBaseSpecifier) {
  128. llvm::StringLiteral valid[] = {
  129. // Decimal integer literals.
  130. "0",
  131. "1",
  132. "123456789000000000000000000000000000000000000",
  133. // Hexadecimal integer literals.
  134. "0x0123456789ABCDEF",
  135. "0x0000000000000000000000000000000",
  136. // Binary integer literals.
  137. "0b10110100101001010",
  138. "0b0000000",
  139. };
  140. for (llvm::StringLiteral literal : valid) {
  141. auto buffer = Lex(literal);
  142. EXPECT_FALSE(buffer.HasErrors()) << literal;
  143. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  144. {.kind = TokenKind::IntegerLiteral(),
  145. .line = 1,
  146. .column = 1,
  147. .indent_column = 1,
  148. .text = literal}}));
  149. }
  150. llvm::StringLiteral invalid[] = {
  151. "00", "0X123", "0o123", "0B1",
  152. "007", "123L", "123456789A", "0x",
  153. "0b", "0x123abc", "0b011101201001", "0b10A",
  154. };
  155. for (llvm::StringLiteral literal : invalid) {
  156. auto buffer = Lex(literal);
  157. EXPECT_TRUE(buffer.HasErrors()) << literal;
  158. ASSERT_THAT(
  159. buffer,
  160. HasTokens(llvm::ArrayRef<ExpectedToken>{{.kind = TokenKind::Error(),
  161. .line = 1,
  162. .column = 1,
  163. .indent_column = 1,
  164. .text = literal}}));
  165. }
  166. }
  167. TEST_F(LexerTest, ValidatesIntegerDigitSeparators) {
  168. llvm::StringLiteral valid[] = {
  169. // Decimal literals optionally have digit separators every 3 places.
  170. "1_234",
  171. "123_456",
  172. "1_234_567",
  173. // Hexadecimal literals optionally have digit separators every 4 places.
  174. "0x1_0000",
  175. "0x1000_0000",
  176. "0x1_0000_0000",
  177. // Binary integer literals can have digit separators anywhere..
  178. "0b1_0_1_0_1_0",
  179. "0b111_0000",
  180. };
  181. for (llvm::StringLiteral literal : valid) {
  182. auto buffer = Lex(literal);
  183. EXPECT_FALSE(buffer.HasErrors()) << literal;
  184. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  185. {.kind = TokenKind::IntegerLiteral(),
  186. .line = 1,
  187. .column = 1,
  188. .indent_column = 1,
  189. .text = literal}}));
  190. }
  191. llvm::StringLiteral invalid[] = {
  192. // Decimal literals.
  193. "12_34",
  194. "123_4_6_789",
  195. "12_3456_789",
  196. "12__345",
  197. "1_",
  198. // Hexadecimal literals.
  199. "0x_1234",
  200. "0x123_",
  201. "0x12_3",
  202. "0x_234_5678",
  203. "0x1234_567",
  204. // Binary literals.
  205. "0b_10101",
  206. "0b1__01",
  207. "0b1011_",
  208. "0b1_01_01_",
  209. };
  210. for (llvm::StringLiteral literal : invalid) {
  211. auto buffer = Lex(literal);
  212. EXPECT_TRUE(buffer.HasErrors()) << literal;
  213. // We expect to produce a token even for a literal containing invalid digit
  214. // separators, for better error recovery.
  215. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  216. {.kind = TokenKind::IntegerLiteral(),
  217. .line = 1,
  218. .column = 1,
  219. .indent_column = 1,
  220. .text = literal}}));
  221. }
  222. }
// Unlexable characters (including an embedded NUL) are grouped into Error
// tokens; lexing resumes at the next recognizable token.
TEST_F(LexerTest, HandlesGarbageCharacters) {
  // sizeof-1 keeps the embedded '\0' but drops the terminating NUL.
  constexpr char GarbageText[] = "$$💩-$\n$\0$12$";
  auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          // Adjacent garbage bytes collapse into one Error token ("$$" plus
          // the 4-byte UTF-8 emoji => 6 bytes).
          {.kind = TokenKind::Error(),
           .line = 1,
           .column = 1,
           .text = llvm::StringRef("$$💩", 6)},
          // 💩 takes 4 bytes, and we count column as bytes offset.
          {.kind = TokenKind::Minus(), .line = 1, .column = 7},
          {.kind = TokenKind::Error(), .line = 1, .column = 8, .text = "$"},
          // newline
          {.kind = TokenKind::Error(),
           .line = 2,
           .column = 1,
           .text = llvm::StringRef("$\0$", 3)},
          // A valid literal is still recovered between the garbage runs.
          {.kind = TokenKind::IntegerLiteral(),
           .line = 2,
           .column = 4,
           .text = "12"},
          {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
      }));
}
TEST_F(LexerTest, Symbols) {
  // We don't need to exhaustively test symbols here as they're handled with
  // common code, but we want to check specific patterns to verify things like
  // max-munch rule and handling of interesting symbols.

  // Max-munch: "<<<" is "<<" then "<", not three "<".
  auto buffer = Lex("<<<");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::LessLess()},
                          {TokenKind::Less()},
                      }));

  // "<<=" wins over "<<" followed by "=".
  buffer = Lex("<<=>>");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::LessLessEqual()},
                          {TokenKind::GreaterGreater()},
                      }));

  // Whitespace splits what would otherwise munch together.
  buffer = Lex("< <=> >");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::Less()},
                          {TokenKind::LessEqualGreater()},
                          {TokenKind::Greater()},
                      }));

  // Single-character symbol tokens.
  buffer = Lex("\\/?#@&^!");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::Backslash()},
                          {TokenKind::Slash()},
                          {TokenKind::Question()},
                          {TokenKind::Hash()},
                          {TokenKind::At()},
                          {TokenKind::Amp()},
                          {TokenKind::Caret()},
                          {TokenKind::Exclaim()},
                      }));
}
// Balanced parentheses, flat and nested, lex without errors.
TEST_F(LexerTest, Parens) {
  auto buffer = Lex("()");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::OpenParen()},
                          {TokenKind::CloseParen()},
                      }));

  // Nested and sibling groups.
  buffer = Lex("((()()))");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::OpenParen()},
                          {TokenKind::OpenParen()},
                          {TokenKind::OpenParen()},
                          {TokenKind::CloseParen()},
                          {TokenKind::OpenParen()},
                          {TokenKind::CloseParen()},
                          {TokenKind::CloseParen()},
                          {TokenKind::CloseParen()},
                      }));
}
// Balanced curly braces, flat and nested, lex without errors.
TEST_F(LexerTest, CurlyBraces) {
  auto buffer = Lex("{}");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::OpenCurlyBrace()},
                          {TokenKind::CloseCurlyBrace()},
                      }));

  // Nested and sibling groups.
  buffer = Lex("{{{}{}}}");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::OpenCurlyBrace()},
                          {TokenKind::OpenCurlyBrace()},
                          {TokenKind::OpenCurlyBrace()},
                          {TokenKind::CloseCurlyBrace()},
                          {TokenKind::OpenCurlyBrace()},
                          {TokenKind::CloseCurlyBrace()},
                          {TokenKind::CloseCurlyBrace()},
                          {TokenKind::CloseCurlyBrace()},
                      }));
}
// Verifies that each opening bracket token is linked to its matching closing
// token (and vice versa) via GetMatchedClosingToken/GetMatchedOpeningToken.
TEST_F(LexerTest, MatchingGroups) {
  {
    // Simple case: "()" followed by "{}", no nesting.
    TokenizedBuffer buffer = Lex("(){}");
    ASSERT_FALSE(buffer.HasErrors());
    auto it = buffer.Tokens().begin();
    auto open_paren_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    auto open_curly_token = *it++;
    auto close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    // All four tokens consumed.
    EXPECT_EQ(buffer.Tokens().end(), it);
  }

  {
    // Mixed nesting of parens and curlies with identifiers inside.
    TokenizedBuffer buffer = Lex("({x}){(y)} {{((z))}}");
    ASSERT_FALSE(buffer.HasErrors());
    auto it = buffer.Tokens().begin();

    // First group: "({x})" — curly nested inside paren.
    auto open_paren_token = *it++;
    auto open_curly_token = *it++;
    ASSERT_EQ("x", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    auto close_curly_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));

    // Second group: "{(y)}" — paren nested inside curly.
    open_curly_token = *it++;
    open_paren_token = *it++;
    ASSERT_EQ("y", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    close_paren_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));

    // Third group: "{{((z))}}" — two levels of each bracket kind.
    open_curly_token = *it++;
    auto inner_open_curly_token = *it++;
    open_paren_token = *it++;
    auto inner_open_paren_token = *it++;
    ASSERT_EQ("z", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    auto inner_close_paren_token = *it++;
    close_paren_token = *it++;
    auto inner_close_curly_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(inner_close_curly_token,
              buffer.GetMatchedClosingToken(inner_open_curly_token));
    EXPECT_EQ(inner_open_curly_token,
              buffer.GetMatchedOpeningToken(inner_close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(inner_close_paren_token,
              buffer.GetMatchedClosingToken(inner_open_paren_token));
    EXPECT_EQ(inner_open_paren_token,
              buffer.GetMatchedOpeningToken(inner_close_paren_token));
    EXPECT_EQ(buffer.Tokens().end(), it);
  }
}
// Unbalanced brackets produce errors; the lexer recovers by synthesizing
// closing tokens (marked .recovery) or turning stray closers into Error.
TEST_F(LexerTest, MismatchedGroups) {
  // Unclosed open brace: a recovery close brace is synthesized.
  auto buffer = Lex("{");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {TokenKind::OpenCurlyBrace()},
                  {.kind = TokenKind::CloseCurlyBrace(), .recovery = true},
              }));

  // A close brace with no opener becomes an Error token.
  buffer = Lex("}");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::Error(), .text = "}"},
                      }));

  // "(" left open inside "{...}": a recovery ")" is inserted before "}".
  buffer = Lex("{(}");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::OpenCurlyBrace(), .column = 1},
          {.kind = TokenKind::OpenParen(), .column = 2},
          {.kind = TokenKind::CloseParen(), .column = 3, .recovery = true},
          {.kind = TokenKind::CloseCurlyBrace(), .column = 3},
      }));

  // Leading stray ")" errors; "{" inside "(...)" gets a recovery "}".
  buffer = Lex(")({)");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::Error(), .column = 1, .text = ")"},
          {.kind = TokenKind::OpenParen(), .column = 2},
          {.kind = TokenKind::OpenCurlyBrace(), .column = 3},
          {.kind = TokenKind::CloseCurlyBrace(), .column = 4, .recovery = true},
          {.kind = TokenKind::CloseParen(), .column = 4},
      }));
}
  437. TEST_F(LexerTest, Keywords) {
  438. auto buffer = Lex(" fn");
  439. EXPECT_FALSE(buffer.HasErrors());
  440. EXPECT_THAT(
  441. buffer,
  442. HasTokens(llvm::ArrayRef<ExpectedToken>{
  443. {.kind = TokenKind::FnKeyword(), .column = 4, .indent_column = 4},
  444. }));
  445. buffer = Lex("and or not if else for loop return var break continue _");
  446. EXPECT_FALSE(buffer.HasErrors());
  447. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  448. {TokenKind::AndKeyword()},
  449. {TokenKind::OrKeyword()},
  450. {TokenKind::NotKeyword()},
  451. {TokenKind::IfKeyword()},
  452. {TokenKind::ElseKeyword()},
  453. {TokenKind::ForKeyword()},
  454. {TokenKind::LoopKeyword()},
  455. {TokenKind::ReturnKeyword()},
  456. {TokenKind::VarKeyword()},
  457. {TokenKind::BreakKeyword()},
  458. {TokenKind::ContinueKeyword()},
  459. {TokenKind::UnderscoreKeyword()},
  460. }));
  461. }
  462. TEST_F(LexerTest, Comments) {
  463. auto buffer = Lex(" ;\n // foo\n ;");
  464. EXPECT_FALSE(buffer.HasErrors());
  465. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  466. {.kind = TokenKind::Semi(),
  467. .line = 1,
  468. .column = 2,
  469. .indent_column = 2},
  470. {.kind = TokenKind::Semi(),
  471. .line = 3,
  472. .column = 3,
  473. .indent_column = 3},
  474. }));
  475. buffer = Lex("// foo\n//\n// bar");
  476. EXPECT_FALSE(buffer.HasErrors());
  477. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  478. // Make sure weird characters aren't a problem.
  479. buffer = Lex(" // foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  480. EXPECT_FALSE(buffer.HasErrors());
  481. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  482. // Make sure we can lex a comment at the end of the input.
  483. buffer = Lex("//");
  484. EXPECT_FALSE(buffer.HasErrors());
  485. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  486. }
  487. TEST_F(LexerTest, InvalidComments) {
  488. llvm::StringLiteral testcases[] = {
  489. " /// foo\n",
  490. "foo // bar\n",
  491. "//! hello",
  492. " //world",
  493. };
  494. for (llvm::StringLiteral testcase : testcases) {
  495. auto buffer = Lex(testcase);
  496. EXPECT_TRUE(buffer.HasErrors());
  497. }
  498. }
  499. TEST_F(LexerTest, Identifiers) {
  500. auto buffer = Lex(" foobar");
  501. EXPECT_FALSE(buffer.HasErrors());
  502. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  503. {.kind = TokenKind::Identifier(),
  504. .column = 4,
  505. .indent_column = 4,
  506. .text = "foobar"},
  507. }));
  508. // Check different kinds of identifier character sequences.
  509. buffer = Lex("_foo_bar");
  510. EXPECT_FALSE(buffer.HasErrors());
  511. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  512. {.kind = TokenKind::Identifier(), .text = "_foo_bar"},
  513. }));
  514. buffer = Lex("foo2bar00");
  515. EXPECT_FALSE(buffer.HasErrors());
  516. EXPECT_THAT(buffer,
  517. HasTokens(llvm::ArrayRef<ExpectedToken>{
  518. {.kind = TokenKind::Identifier(), .text = "foo2bar00"},
  519. }));
  520. // Check that we can parse identifiers that start with a keyword.
  521. buffer = Lex("fnord");
  522. EXPECT_FALSE(buffer.HasErrors());
  523. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  524. {.kind = TokenKind::Identifier(), .text = "fnord"},
  525. }));
  526. // Check multiple identifiers with indent and interning.
  527. buffer = Lex(" foo;bar\nbar \n foo\tfoo");
  528. EXPECT_FALSE(buffer.HasErrors());
  529. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  530. {.kind = TokenKind::Identifier(),
  531. .line = 1,
  532. .column = 4,
  533. .indent_column = 4,
  534. .text = "foo"},
  535. {.kind = TokenKind::Semi()},
  536. {.kind = TokenKind::Identifier(),
  537. .line = 1,
  538. .column = 8,
  539. .indent_column = 4,
  540. .text = "bar"},
  541. {.kind = TokenKind::Identifier(),
  542. .line = 2,
  543. .column = 1,
  544. .indent_column = 1,
  545. .text = "bar"},
  546. {.kind = TokenKind::Identifier(),
  547. .line = 3,
  548. .column = 3,
  549. .indent_column = 3,
  550. .text = "foo"},
  551. {.kind = TokenKind::Identifier(),
  552. .line = 3,
  553. .column = 7,
  554. .indent_column = 3,
  555. .text = "foo"},
  556. }));
  557. }
  558. auto GetAndDropLine(llvm::StringRef& text) -> std::string {
  559. auto newline_offset = text.find_first_of('\n');
  560. llvm::StringRef line = text.slice(0, newline_offset);
  561. if (newline_offset != llvm::StringRef::npos) {
  562. text = text.substr(newline_offset + 1);
  563. } else {
  564. text = "";
  565. }
  566. return line.str();
  567. }
  568. TEST_F(LexerTest, Printing) {
  569. auto buffer = Lex(";");
  570. ASSERT_FALSE(buffer.HasErrors());
  571. std::string print_storage;
  572. llvm::raw_string_ostream print_stream(print_storage);
  573. buffer.Print(print_stream);
  574. llvm::StringRef print = print_stream.str();
  575. EXPECT_THAT(GetAndDropLine(print),
  576. StrEq("token: { index: 0, kind: 'Semi', line: 1, column: 1, "
  577. "indent: 1, spelling: ';' }"));
  578. EXPECT_TRUE(print.empty()) << print;
  579. // Test kind padding.
  580. buffer = Lex("(;foo;)");
  581. ASSERT_FALSE(buffer.HasErrors());
  582. print_storage.clear();
  583. buffer.Print(print_stream);
  584. print = print_stream.str();
  585. EXPECT_THAT(GetAndDropLine(print),
  586. StrEq("token: { index: 0, kind: 'OpenParen', line: 1, column: "
  587. "1, indent: 1, spelling: '(', closing_token: 4 }"));
  588. EXPECT_THAT(GetAndDropLine(print),
  589. StrEq("token: { index: 1, kind: 'Semi', line: 1, column: "
  590. "2, indent: 1, spelling: ';' }"));
  591. EXPECT_THAT(GetAndDropLine(print),
  592. StrEq("token: { index: 2, kind: 'Identifier', line: 1, column: "
  593. "3, indent: 1, spelling: 'foo', identifier: 0 }"));
  594. EXPECT_THAT(GetAndDropLine(print),
  595. StrEq("token: { index: 3, kind: 'Semi', line: 1, column: "
  596. "6, indent: 1, spelling: ';' }"));
  597. EXPECT_THAT(GetAndDropLine(print),
  598. StrEq("token: { index: 4, kind: 'CloseParen', line: 1, column: "
  599. "7, indent: 1, spelling: ')', opening_token: 0 }"));
  600. EXPECT_TRUE(print.empty()) << print;
  601. // Test digit padding with max values of 9, 10, and 11.
  602. buffer = Lex(";\n\n\n\n\n\n\n\n\n\n ;;");
  603. ASSERT_FALSE(buffer.HasErrors());
  604. print_storage.clear();
  605. buffer.Print(print_stream);
  606. print = print_stream.str();
  607. EXPECT_THAT(GetAndDropLine(print),
  608. StrEq("token: { index: 0, kind: 'Semi', line: 1, column: 1, "
  609. "indent: 1, spelling: ';' }"));
  610. EXPECT_THAT(GetAndDropLine(print),
  611. StrEq("token: { index: 1, kind: 'Semi', line: 11, column: 9, "
  612. "indent: 9, spelling: ';' }"));
  613. EXPECT_THAT(GetAndDropLine(print),
  614. StrEq("token: { index: 2, kind: 'Semi', line: 11, column: 10, "
  615. "indent: 9, spelling: ';' }"));
  616. EXPECT_TRUE(print.empty()) << print;
  617. }
TEST_F(LexerTest, PrintingAsYaml) {
  // Test that we can parse this into YAML and verify line and indent data.
  auto buffer = Lex("\n ;\n\n\n; ;\n\n\n\n\n\n\n\n\n\n\n");
  ASSERT_FALSE(buffer.HasErrors());
  std::string print_output;
  llvm::raw_string_ostream print_stream(print_output);
  buffer.Print(print_stream);
  print_stream.flush();

  // Parse the output into a YAML stream. This will print errors to stderr.
  llvm::SourceMgr source_manager;
  llvm::yaml::Stream yaml_stream(print_output, source_manager);
  auto yaml_it = yaml_stream.begin();
  auto* root_node = llvm::dyn_cast<llvm::yaml::MappingNode>(yaml_it->getRoot());
  ASSERT_THAT(root_node, NotNull());

  // Walk the top-level mapping of tokens, dig out the sub-mapping of data for
  // each taken, and then verify those entries.

  // First token: the ';' on line 2, column 2 (one leading space).
  auto mapping_it = llvm::cast<llvm::yaml::MappingNode>(root_node)->begin();
  auto* token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  ASSERT_THAT(token_node, NotNull());
  auto* token_key_node =
      llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  ASSERT_THAT(token_key_node, NotNull());
  EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  auto* token_value_node =
      llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  ASSERT_THAT(token_value_node, NotNull());
  auto token_it = token_value_node->begin();
  EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "0"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  EXPECT_THAT(++token_it, Eq(token_value_node->end()));

  // Second token: the first ';' of "; ;" on line 5, column 1.
  ++mapping_it;
  token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  ASSERT_THAT(token_node, NotNull());
  token_key_node = llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  ASSERT_THAT(token_key_node, NotNull());
  EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  token_value_node =
      llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  ASSERT_THAT(token_value_node, NotNull());
  token_it = token_value_node->begin();
  EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "5"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  EXPECT_THAT(++token_it, Eq(token_value_node->end()));

  // Third token: the second ';' of "; ;" on line 5, column 3.
  ++mapping_it;
  token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  ASSERT_THAT(token_node, NotNull());
  token_key_node = llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  ASSERT_THAT(token_key_node, NotNull());
  EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  token_value_node =
      llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  ASSERT_THAT(token_value_node, NotNull());
  token_it = token_value_node->begin();
  EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "5"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "3"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  EXPECT_THAT(++token_it, Eq(token_value_node->end()));

  // Exactly three tokens, and a single YAML document.
  ASSERT_THAT(++mapping_it, Eq(root_node->end()));
  ASSERT_THAT(++yaml_it, Eq(yaml_stream.end()));
}

}  // namespace
}  // namespace Carbon