  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "lexer/tokenized_buffer.h"
  5. #include <iterator>
  6. #include "diagnostics/diagnostic_emitter.h"
  7. #include "gmock/gmock.h"
  8. #include "gtest/gtest.h"
  9. #include "lexer/tokenized_buffer_test_helpers.h"
  10. #include "llvm/ADT/ArrayRef.h"
  11. #include "llvm/ADT/None.h"
  12. #include "llvm/ADT/Sequence.h"
  13. #include "llvm/ADT/SmallString.h"
  14. #include "llvm/ADT/Twine.h"
  15. #include "llvm/Support/SourceMgr.h"
  16. #include "llvm/Support/YAMLParser.h"
  17. #include "llvm/Support/raw_ostream.h"
  18. namespace Carbon {
  19. namespace {
  20. using ::Carbon::Testing::ExpectedToken;
  21. using ::Carbon::Testing::HasTokens;
  22. using ::Carbon::Testing::IsKeyValueScalars;
  23. using ::testing::Eq;
  24. using ::testing::NotNull;
  25. using ::testing::StrEq;
  26. struct LexerTest : ::testing::Test {
  27. llvm::SmallVector<SourceBuffer, 16> source_storage;
  28. auto GetSourceBuffer(llvm::Twine text) -> SourceBuffer& {
  29. source_storage.push_back(SourceBuffer::CreateFromText(text.str()));
  30. return source_storage.back();
  31. }
  32. auto Lex(llvm::Twine text) -> TokenizedBuffer {
  33. // TODO: build a full mock for this.
  34. return TokenizedBuffer::Lex(GetSourceBuffer(text),
  35. ConsoleDiagnosticEmitter());
  36. }
  37. };
  38. TEST_F(LexerTest, HandlesEmptyBuffer) {
  39. auto buffer = Lex("");
  40. EXPECT_FALSE(buffer.HasErrors());
  41. EXPECT_EQ(buffer.Tokens().begin(), buffer.Tokens().end());
  42. }
  43. TEST_F(LexerTest, TracksLinesAndColumns) {
  44. auto buffer = Lex("\n ;;\n ;;;\n");
  45. EXPECT_FALSE(buffer.HasErrors());
  46. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  47. {.kind = TokenKind::Semi(),
  48. .line = 2,
  49. .column = 3,
  50. .indent_column = 3},
  51. {.kind = TokenKind::Semi(),
  52. .line = 2,
  53. .column = 4,
  54. .indent_column = 3},
  55. {.kind = TokenKind::Semi(),
  56. .line = 3,
  57. .column = 4,
  58. .indent_column = 4},
  59. {.kind = TokenKind::Semi(),
  60. .line = 3,
  61. .column = 5,
  62. .indent_column = 4},
  63. {.kind = TokenKind::Semi(),
  64. .line = 3,
  65. .column = 6,
  66. .indent_column = 4},
  67. }));
  68. }
  69. TEST_F(LexerTest, HandlesNumericLiteral) {
  70. auto buffer = Lex("12-578\n 1 2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
  71. EXPECT_FALSE(buffer.HasErrors());
  72. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  73. {.kind = TokenKind::IntegerLiteral(),
  74. .line = 1,
  75. .column = 1,
  76. .indent_column = 1,
  77. .text = "12"},
  78. {.kind = TokenKind::Minus(),
  79. .line = 1,
  80. .column = 3,
  81. .indent_column = 1},
  82. {.kind = TokenKind::IntegerLiteral(),
  83. .line = 1,
  84. .column = 4,
  85. .indent_column = 1,
  86. .text = "578"},
  87. {.kind = TokenKind::IntegerLiteral(),
  88. .line = 2,
  89. .column = 3,
  90. .indent_column = 3,
  91. .text = "1"},
  92. {.kind = TokenKind::IntegerLiteral(),
  93. .line = 2,
  94. .column = 6,
  95. .indent_column = 3,
  96. .text = "2"},
  97. {.kind = TokenKind::IntegerLiteral(),
  98. .line = 3,
  99. .column = 1,
  100. .indent_column = 1,
  101. .text = "0x12_3ABC"},
  102. {.kind = TokenKind::IntegerLiteral(),
  103. .line = 4,
  104. .column = 1,
  105. .indent_column = 1,
  106. .text = "0b10_10_11"},
  107. {.kind = TokenKind::IntegerLiteral(),
  108. .line = 5,
  109. .column = 1,
  110. .indent_column = 1,
  111. .text = "1_234_567"},
  112. {.kind = TokenKind::RealLiteral(),
  113. .line = 6,
  114. .column = 1,
  115. .indent_column = 1,
  116. .text = "1.5e9"},
  117. }));
  118. auto token_12 = buffer.Tokens().begin();
  119. EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
  120. auto token_578 = buffer.Tokens().begin() + 2;
  121. EXPECT_EQ(buffer.GetIntegerLiteral(*token_578), 578);
  122. auto token_1 = buffer.Tokens().begin() + 3;
  123. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1), 1);
  124. auto token_2 = buffer.Tokens().begin() + 4;
  125. EXPECT_EQ(buffer.GetIntegerLiteral(*token_2), 2);
  126. auto token_0x12_3abc = buffer.Tokens().begin() + 5;
  127. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0x12_3abc), 0x12'3abc);
  128. auto token_0b10_10_11 = buffer.Tokens().begin() + 6;
  129. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
  130. auto token_1_234_567 = buffer.Tokens().begin() + 7;
  131. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
  132. auto token_1_5e9 = buffer.Tokens().begin() + 8;
  133. auto value_1_5e9 = buffer.GetRealLiteral(*token_1_5e9);
  134. EXPECT_EQ(value_1_5e9.Mantissa().getZExtValue(), 15);
  135. EXPECT_EQ(value_1_5e9.Exponent().getSExtValue(), 8);
  136. EXPECT_EQ(value_1_5e9.IsDecimal(), true);
  137. }
// Malformed numeric literals become single Error tokens spanning the whole
// word, rather than being split; well-formed neighbors still lex normally.
TEST_F(LexerTest, HandlesInvalidNumericLiterals) {
  auto buffer = Lex("14x 15_49 0x3.5q 0x3_4.5_6 0ops");
  EXPECT_TRUE(buffer.HasErrors());
  ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          // Trailing alpha after digits: one error token.
                          {.kind = TokenKind::Error(),
                           .line = 1,
                           .column = 1,
                           .indent_column = 1,
                           .text = "14x"},
                          // Digit separators are legal in integers.
                          {.kind = TokenKind::IntegerLiteral(),
                           .line = 1,
                           .column = 5,
                           .indent_column = 1,
                           .text = "15_49"},
                          // Junk suffix on a hex real: error.
                          {.kind = TokenKind::Error(),
                           .line = 1,
                           .column = 11,
                           .indent_column = 1,
                           .text = "0x3.5q"},
                          // Separators in a hex real are accepted.
                          {.kind = TokenKind::RealLiteral(),
                           .line = 1,
                           .column = 18,
                           .indent_column = 1,
                           .text = "0x3_4.5_6"},
                          // Unknown base prefix: error.
                          {.kind = TokenKind::Error(),
                           .line = 1,
                           .column = 28,
                           .indent_column = 1,
                           .text = "0ops"},
                      }));
}
// Exercises where a numeric literal ends and adjacent tokens begin: '.' and
// exponent boundaries, '+'/'-' following an exponent, and which malformed
// shapes collapse into a single Error token. Only kinds/text are asserted,
// not positions.
TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
  llvm::StringLiteral source_text = R"(
1.
.2
3.+foo
4.0-bar
5.0e+123+456
6.0e+1e+2
1e7
8..10
9.0.9.5
10.foo
11.0.foo
12e+1
13._
)";
  auto buffer = Lex(source_text);
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  // "1." is an integer then a period, not a real literal.
                  {.kind = TokenKind::IntegerLiteral(), .text = "1"},
                  {.kind = TokenKind::Period()},
                  // newline
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::IntegerLiteral(), .text = "2"},
                  // newline
                  {.kind = TokenKind::IntegerLiteral(), .text = "3"},
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::Plus()},
                  {.kind = TokenKind::Identifier(), .text = "foo"},
                  // newline
                  {.kind = TokenKind::RealLiteral(), .text = "4.0"},
                  {.kind = TokenKind::Minus()},
                  {.kind = TokenKind::Identifier(), .text = "bar"},
                  // newline
                  // The exponent consumes "+123"; the second '+' splits off.
                  {.kind = TokenKind::RealLiteral(), .text = "5.0e+123"},
                  {.kind = TokenKind::Plus()},
                  {.kind = TokenKind::IntegerLiteral(), .text = "456"},
                  // newline
                  // A second 'e' inside the literal makes it an error.
                  {.kind = TokenKind::Error(), .text = "6.0e+1e"},
                  {.kind = TokenKind::Plus()},
                  {.kind = TokenKind::IntegerLiteral(), .text = "2"},
                  // newline
                  {.kind = TokenKind::Error(), .text = "1e7"},
                  // newline
                  {.kind = TokenKind::IntegerLiteral(), .text = "8"},
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::IntegerLiteral(), .text = "10"},
                  // newline
                  {.kind = TokenKind::RealLiteral(), .text = "9.0"},
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::RealLiteral(), .text = "9.5"},
                  // newline
                  {.kind = TokenKind::Error(), .text = "10.foo"},
                  // newline
                  {.kind = TokenKind::RealLiteral(), .text = "11.0"},
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::Identifier(), .text = "foo"},
                  // newline
                  {.kind = TokenKind::Error(), .text = "12e"},
                  {.kind = TokenKind::Plus()},
                  {.kind = TokenKind::IntegerLiteral(), .text = "1"},
                  // newline
                  {.kind = TokenKind::IntegerLiteral(), .text = "13"},
                  {.kind = TokenKind::Period()},
                  {.kind = TokenKind::UnderscoreKeyword()},
              }));
}
// Unrecognized characters (including multi-byte UTF-8 and an embedded NUL)
// are grouped into Error tokens; columns count bytes, not code points. The
// explicit StringRef lengths preserve the embedded '\0'.
TEST_F(LexerTest, HandlesGarbageCharacters) {
  constexpr char GarbageText[] = "$$💩-$\n$\0$12$";
  // sizeof - 1 drops the trailing NUL terminator but keeps the embedded one.
  auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::Error(),
           .line = 1,
           .column = 1,
           .text = llvm::StringRef("$$💩", 6)},
          // 💩 takes 4 bytes, and we count column as bytes offset.
          {.kind = TokenKind::Minus(), .line = 1, .column = 7},
          {.kind = TokenKind::Error(), .line = 1, .column = 8, .text = "$"},
          // newline
          // The embedded NUL byte is garbage too and joins its neighbors.
          {.kind = TokenKind::Error(),
           .line = 2,
           .column = 1,
           .text = llvm::StringRef("$\0$", 3)},
          {.kind = TokenKind::IntegerLiteral(),
           .line = 2,
           .column = 4,
           .text = "12"},
          {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
      }));
}
// Spot-checks symbol lexing: max-munch behavior and a sampling of
// single-character symbols.
TEST_F(LexerTest, Symbols) {
  // We don't need to exhaustively test symbols here as they're handled with
  // common code, but we want to check specific patterns to verify things like
  // max-munch rule and handling of interesting symbols.
  auto buffer = Lex("<<<");
  EXPECT_FALSE(buffer.HasErrors());
  // Max-munch: "<<" first, then the remaining "<".
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::LessLess()},
                          {TokenKind::Less()},
                      }));
  buffer = Lex("<<=>>");
  EXPECT_FALSE(buffer.HasErrors());
  // "<<=" wins over "<<" + "=".
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::LessLessEqual()},
                          {TokenKind::GreaterGreater()},
                      }));
  buffer = Lex("< <=> >");
  EXPECT_FALSE(buffer.HasErrors());
  // Whitespace separates what would otherwise merge.
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::Less()},
                          {TokenKind::LessEqualGreater()},
                          {TokenKind::Greater()},
                      }));
  buffer = Lex("\\/?#@&^!");
  EXPECT_FALSE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {TokenKind::Backslash()},
                          {TokenKind::Slash()},
                          {TokenKind::Question()},
                          {TokenKind::Hash()},
                          {TokenKind::At()},
                          {TokenKind::Amp()},
                          {TokenKind::Caret()},
                          {TokenKind::Exclaim()},
                      }));
}
  300. TEST_F(LexerTest, Parens) {
  301. auto buffer = Lex("()");
  302. EXPECT_FALSE(buffer.HasErrors());
  303. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  304. {TokenKind::OpenParen()},
  305. {TokenKind::CloseParen()},
  306. }));
  307. buffer = Lex("((()()))");
  308. EXPECT_FALSE(buffer.HasErrors());
  309. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  310. {TokenKind::OpenParen()},
  311. {TokenKind::OpenParen()},
  312. {TokenKind::OpenParen()},
  313. {TokenKind::CloseParen()},
  314. {TokenKind::OpenParen()},
  315. {TokenKind::CloseParen()},
  316. {TokenKind::CloseParen()},
  317. {TokenKind::CloseParen()},
  318. }));
  319. }
  320. TEST_F(LexerTest, CurlyBraces) {
  321. auto buffer = Lex("{}");
  322. EXPECT_FALSE(buffer.HasErrors());
  323. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  324. {TokenKind::OpenCurlyBrace()},
  325. {TokenKind::CloseCurlyBrace()},
  326. }));
  327. buffer = Lex("{{{}{}}}");
  328. EXPECT_FALSE(buffer.HasErrors());
  329. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  330. {TokenKind::OpenCurlyBrace()},
  331. {TokenKind::OpenCurlyBrace()},
  332. {TokenKind::OpenCurlyBrace()},
  333. {TokenKind::CloseCurlyBrace()},
  334. {TokenKind::OpenCurlyBrace()},
  335. {TokenKind::CloseCurlyBrace()},
  336. {TokenKind::CloseCurlyBrace()},
  337. {TokenKind::CloseCurlyBrace()},
  338. }));
  339. }
// Verifies that every opening grouping token is linked to its matching
// closing token (and vice versa) via GetMatchedClosingToken /
// GetMatchedOpeningToken, across sibling and nested groups.
TEST_F(LexerTest, MatchingGroups) {
  {
    // Two adjacent sibling groups: () then {}.
    TokenizedBuffer buffer = Lex("(){}");
    ASSERT_FALSE(buffer.HasErrors());
    auto it = buffer.Tokens().begin();
    auto open_paren_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    auto open_curly_token = *it++;
    auto close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(buffer.Tokens().end(), it);
  }
  {
    // Mixed nesting with identifiers inside: ({x}) {(y)} {{((z))}}.
    TokenizedBuffer buffer = Lex("({x}){(y)} {{((z))}}");
    ASSERT_FALSE(buffer.HasErrors());
    auto it = buffer.Tokens().begin();
    // First group: curlies nested inside parens, around "x".
    auto open_paren_token = *it++;
    auto open_curly_token = *it++;
    ASSERT_EQ("x", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    auto close_curly_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    // Second group: parens nested inside curlies, around "y".
    open_curly_token = *it++;
    open_paren_token = *it++;
    ASSERT_EQ("y", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    close_paren_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    // Third group: doubled curlies and doubled parens around "z"; each
    // level must match its own partner.
    open_curly_token = *it++;
    auto inner_open_curly_token = *it++;
    open_paren_token = *it++;
    auto inner_open_paren_token = *it++;
    ASSERT_EQ("z", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
    auto inner_close_paren_token = *it++;
    close_paren_token = *it++;
    auto inner_close_curly_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(inner_close_curly_token,
              buffer.GetMatchedClosingToken(inner_open_curly_token));
    EXPECT_EQ(inner_open_curly_token,
              buffer.GetMatchedOpeningToken(inner_close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(inner_close_paren_token,
              buffer.GetMatchedClosingToken(inner_open_paren_token));
    EXPECT_EQ(inner_open_paren_token,
              buffer.GetMatchedOpeningToken(inner_close_paren_token));
    EXPECT_EQ(buffer.Tokens().end(), it);
  }
}
// Unbalanced grouping tokens set the error flag; the lexer recovers by
// inserting synthetic closing tokens (marked .recovery) or turning stray
// closers into Error tokens.
TEST_F(LexerTest, MismatchedGroups) {
  // Unclosed open brace: a recovery close brace is synthesized.
  auto buffer = Lex("{");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {TokenKind::OpenCurlyBrace()},
                  {.kind = TokenKind::CloseCurlyBrace(), .recovery = true},
              }));
  // A closer with no opener becomes a plain Error token.
  buffer = Lex("}");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::Error(), .text = "}"},
                      }));
  // Cross-nested: the unclosed paren gets a recovery close at the '}'.
  buffer = Lex("{(}");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::OpenCurlyBrace(), .column = 1},
          {.kind = TokenKind::OpenParen(), .column = 2},
          {.kind = TokenKind::CloseParen(), .column = 3, .recovery = true},
          {.kind = TokenKind::CloseCurlyBrace(), .column = 3},
      }));
  // Leading stray ')' is an error; the '{' gets a recovery close before
  // the real ')' can match its '('.
  buffer = Lex(")({)");
  EXPECT_TRUE(buffer.HasErrors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::Error(), .column = 1, .text = ")"},
          {.kind = TokenKind::OpenParen(), .column = 2},
          {.kind = TokenKind::OpenCurlyBrace(), .column = 3},
          {.kind = TokenKind::CloseCurlyBrace(), .column = 4, .recovery = true},
          {.kind = TokenKind::CloseParen(), .column = 4},
      }));
}
  452. TEST_F(LexerTest, Keywords) {
  453. auto buffer = Lex(" fn");
  454. EXPECT_FALSE(buffer.HasErrors());
  455. EXPECT_THAT(
  456. buffer,
  457. HasTokens(llvm::ArrayRef<ExpectedToken>{
  458. {.kind = TokenKind::FnKeyword(), .column = 4, .indent_column = 4},
  459. }));
  460. buffer = Lex("and or not if else for loop return var break continue _");
  461. EXPECT_FALSE(buffer.HasErrors());
  462. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  463. {TokenKind::AndKeyword()},
  464. {TokenKind::OrKeyword()},
  465. {TokenKind::NotKeyword()},
  466. {TokenKind::IfKeyword()},
  467. {TokenKind::ElseKeyword()},
  468. {TokenKind::ForKeyword()},
  469. {TokenKind::LoopKeyword()},
  470. {TokenKind::ReturnKeyword()},
  471. {TokenKind::VarKeyword()},
  472. {TokenKind::BreakKeyword()},
  473. {TokenKind::ContinueKeyword()},
  474. {TokenKind::UnderscoreKeyword()},
  475. }));
  476. }
  477. TEST_F(LexerTest, Comments) {
  478. auto buffer = Lex(" ;\n // foo\n ;");
  479. EXPECT_FALSE(buffer.HasErrors());
  480. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  481. {.kind = TokenKind::Semi(),
  482. .line = 1,
  483. .column = 2,
  484. .indent_column = 2},
  485. {.kind = TokenKind::Semi(),
  486. .line = 3,
  487. .column = 3,
  488. .indent_column = 3},
  489. }));
  490. buffer = Lex("// foo\n//\n// bar");
  491. EXPECT_FALSE(buffer.HasErrors());
  492. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  493. // Make sure weird characters aren't a problem.
  494. buffer = Lex(" // foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  495. EXPECT_FALSE(buffer.HasErrors());
  496. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  497. // Make sure we can lex a comment at the end of the input.
  498. buffer = Lex("//");
  499. EXPECT_FALSE(buffer.HasErrors());
  500. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  501. }
  502. TEST_F(LexerTest, InvalidComments) {
  503. llvm::StringLiteral testcases[] = {
  504. " /// foo\n",
  505. "foo // bar\n",
  506. "//! hello",
  507. " //world",
  508. };
  509. for (llvm::StringLiteral testcase : testcases) {
  510. auto buffer = Lex(testcase);
  511. EXPECT_TRUE(buffer.HasErrors());
  512. }
  513. }
  514. TEST_F(LexerTest, Identifiers) {
  515. auto buffer = Lex(" foobar");
  516. EXPECT_FALSE(buffer.HasErrors());
  517. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  518. {.kind = TokenKind::Identifier(),
  519. .column = 4,
  520. .indent_column = 4,
  521. .text = "foobar"},
  522. }));
  523. // Check different kinds of identifier character sequences.
  524. buffer = Lex("_foo_bar");
  525. EXPECT_FALSE(buffer.HasErrors());
  526. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  527. {.kind = TokenKind::Identifier(), .text = "_foo_bar"},
  528. }));
  529. buffer = Lex("foo2bar00");
  530. EXPECT_FALSE(buffer.HasErrors());
  531. EXPECT_THAT(buffer,
  532. HasTokens(llvm::ArrayRef<ExpectedToken>{
  533. {.kind = TokenKind::Identifier(), .text = "foo2bar00"},
  534. }));
  535. // Check that we can parse identifiers that start with a keyword.
  536. buffer = Lex("fnord");
  537. EXPECT_FALSE(buffer.HasErrors());
  538. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  539. {.kind = TokenKind::Identifier(), .text = "fnord"},
  540. }));
  541. // Check multiple identifiers with indent and interning.
  542. buffer = Lex(" foo;bar\nbar \n foo\tfoo");
  543. EXPECT_FALSE(buffer.HasErrors());
  544. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  545. {.kind = TokenKind::Identifier(),
  546. .line = 1,
  547. .column = 4,
  548. .indent_column = 4,
  549. .text = "foo"},
  550. {.kind = TokenKind::Semi()},
  551. {.kind = TokenKind::Identifier(),
  552. .line = 1,
  553. .column = 8,
  554. .indent_column = 4,
  555. .text = "bar"},
  556. {.kind = TokenKind::Identifier(),
  557. .line = 2,
  558. .column = 1,
  559. .indent_column = 1,
  560. .text = "bar"},
  561. {.kind = TokenKind::Identifier(),
  562. .line = 3,
  563. .column = 3,
  564. .indent_column = 3,
  565. .text = "foo"},
  566. {.kind = TokenKind::Identifier(),
  567. .line = 3,
  568. .column = 7,
  569. .indent_column = 3,
  570. .text = "foo"},
  571. }));
  572. }
  573. auto GetAndDropLine(llvm::StringRef& text) -> std::string {
  574. auto newline_offset = text.find_first_of('\n');
  575. llvm::StringRef line = text.slice(0, newline_offset);
  576. if (newline_offset != llvm::StringRef::npos) {
  577. text = text.substr(newline_offset + 1);
  578. } else {
  579. text = "";
  580. }
  581. return line.str();
  582. }
  583. TEST_F(LexerTest, Printing) {
  584. auto buffer = Lex(";");
  585. ASSERT_FALSE(buffer.HasErrors());
  586. std::string print_storage;
  587. llvm::raw_string_ostream print_stream(print_storage);
  588. buffer.Print(print_stream);
  589. llvm::StringRef print = print_stream.str();
  590. EXPECT_THAT(GetAndDropLine(print),
  591. StrEq("token: { index: 0, kind: 'Semi', line: 1, column: 1, "
  592. "indent: 1, spelling: ';' }"));
  593. EXPECT_TRUE(print.empty()) << print;
  594. // Test kind padding.
  595. buffer = Lex("(;foo;)");
  596. ASSERT_FALSE(buffer.HasErrors());
  597. print_storage.clear();
  598. buffer.Print(print_stream);
  599. print = print_stream.str();
  600. EXPECT_THAT(GetAndDropLine(print),
  601. StrEq("token: { index: 0, kind: 'OpenParen', line: 1, column: "
  602. "1, indent: 1, spelling: '(', closing_token: 4 }"));
  603. EXPECT_THAT(GetAndDropLine(print),
  604. StrEq("token: { index: 1, kind: 'Semi', line: 1, column: "
  605. "2, indent: 1, spelling: ';' }"));
  606. EXPECT_THAT(GetAndDropLine(print),
  607. StrEq("token: { index: 2, kind: 'Identifier', line: 1, column: "
  608. "3, indent: 1, spelling: 'foo', identifier: 0 }"));
  609. EXPECT_THAT(GetAndDropLine(print),
  610. StrEq("token: { index: 3, kind: 'Semi', line: 1, column: "
  611. "6, indent: 1, spelling: ';' }"));
  612. EXPECT_THAT(GetAndDropLine(print),
  613. StrEq("token: { index: 4, kind: 'CloseParen', line: 1, column: "
  614. "7, indent: 1, spelling: ')', opening_token: 0 }"));
  615. EXPECT_TRUE(print.empty()) << print;
  616. // Test digit padding with max values of 9, 10, and 11.
  617. buffer = Lex(";\n\n\n\n\n\n\n\n\n\n ;;");
  618. ASSERT_FALSE(buffer.HasErrors());
  619. print_storage.clear();
  620. buffer.Print(print_stream);
  621. print = print_stream.str();
  622. EXPECT_THAT(GetAndDropLine(print),
  623. StrEq("token: { index: 0, kind: 'Semi', line: 1, column: 1, "
  624. "indent: 1, spelling: ';' }"));
  625. EXPECT_THAT(GetAndDropLine(print),
  626. StrEq("token: { index: 1, kind: 'Semi', line: 11, column: 9, "
  627. "indent: 9, spelling: ';' }"));
  628. EXPECT_THAT(GetAndDropLine(print),
  629. StrEq("token: { index: 2, kind: 'Semi', line: 11, column: 10, "
  630. "indent: 9, spelling: ';' }"));
  631. EXPECT_TRUE(print.empty()) << print;
  632. }
TEST_F(LexerTest, PrintingAsYaml) {
  // Test that we can parse this into YAML and verify line and indent data.
  auto buffer = Lex("\n ;\n\n\n; ;\n\n\n\n\n\n\n\n\n\n\n");
  ASSERT_FALSE(buffer.HasErrors());
  std::string print_output;
  llvm::raw_string_ostream print_stream(print_output);
  buffer.Print(print_stream);
  print_stream.flush();
  // Parse the output into a YAML stream. This will print errors to stderr.
  llvm::SourceMgr source_manager;
  llvm::yaml::Stream yaml_stream(print_output, source_manager);
  auto yaml_it = yaml_stream.begin();
  auto* root_node = llvm::dyn_cast<llvm::yaml::MappingNode>(yaml_it->getRoot());
  ASSERT_THAT(root_node, NotNull());
  // Walk the top-level mapping of tokens, dig out the sub-mapping of data for
  // each taken, and then verify those entries.
  auto mapping_it = llvm::cast<llvm::yaml::MappingNode>(root_node)->begin();
  auto* token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  ASSERT_THAT(token_node, NotNull());
  // First token: the semi on line 2, one space in (column/indent 2).
  auto* token_key_node =
      llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  ASSERT_THAT(token_key_node, NotNull());
  EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  auto* token_value_node =
      llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  ASSERT_THAT(token_value_node, NotNull());
  auto token_it = token_value_node->begin();
  EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "0"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  EXPECT_THAT(++token_it, Eq(token_value_node->end()));
  // Second token: first semi of "; ;" on line 5, column 1.
  ++mapping_it;
  token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  ASSERT_THAT(token_node, NotNull());
  token_key_node = llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  ASSERT_THAT(token_key_node, NotNull());
  EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  token_value_node =
      llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  ASSERT_THAT(token_value_node, NotNull());
  token_it = token_value_node->begin();
  EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "5"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  EXPECT_THAT(++token_it, Eq(token_value_node->end()));
  // Third token: second semi of "; ;" on line 5, column 3, same indent.
  ++mapping_it;
  token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  ASSERT_THAT(token_node, NotNull());
  token_key_node = llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  ASSERT_THAT(token_key_node, NotNull());
  EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  token_value_node =
      llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  ASSERT_THAT(token_value_node, NotNull());
  token_it = token_value_node->begin();
  EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "2"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "5"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "3"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "1"));
  ++token_it;
  EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  EXPECT_THAT(++token_it, Eq(token_value_node->end()));
  // Exactly three tokens and a single YAML document.
  ASSERT_THAT(++mapping_it, Eq(root_node->end()));
  ASSERT_THAT(++yaml_it, Eq(yaml_stream.end()));
}
  719. } // namespace
  720. } // namespace Carbon