  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "parser/parser_impl.h"

#include <algorithm>
#include <cassert>
#include <cstdlib>

#include "lexer/token_kind.h"
#include "lexer/tokenized_buffer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/Support/raw_ostream.h"
#include "parser/parse_node_kind.h"
#include "parser/parse_tree.h"
  12. namespace Carbon {
  13. auto ParseTree::Parser::Parse(TokenizedBuffer& tokens,
  14. TokenDiagnosticEmitter& /*unused*/) -> ParseTree {
  15. ParseTree tree(tokens);
  16. // We expect to have a 1:1 correspondence between tokens and tree nodes, so
  17. // reserve the space we expect to need here to avoid allocation and copying
  18. // overhead.
  19. tree.node_impls.reserve(tokens.Size());
  20. Parser parser(tree, tokens);
  21. while (!parser.AtEndOfFile()) {
  22. parser.ParseDeclaration();
  23. }
  24. parser.AddLeafNode(ParseNodeKind::FileEnd(), *parser.position);
  25. assert(tree.Verify() && "Parse tree built but does not verify!");
  26. return tree;
  27. }
  28. ParseTree::Parser::Parser(ParseTree& tree_arg, TokenizedBuffer& tokens_arg)
  29. : tree(tree_arg),
  30. tokens(tokens_arg),
  31. position(tokens.Tokens().begin()),
  32. end(tokens.Tokens().end()) {
  33. assert(std::find_if(position, end,
  34. [&](TokenizedBuffer::Token t) {
  35. return tokens.GetKind(t) == TokenKind::EndOfFile();
  36. }) != end &&
  37. "No EndOfFileToken in token buffer.");
  38. }
  39. auto ParseTree::Parser::Consume(TokenKind kind) -> TokenizedBuffer::Token {
  40. TokenizedBuffer::Token t = *position;
  41. assert(kind != TokenKind::EndOfFile() && "Cannot consume the EOF token!");
  42. assert(tokens.GetKind(t) == kind && "The current token is the wrong kind!");
  43. ++position;
  44. assert(position != end && "Reached end of tokens without finding EOF token.");
  45. return t;
  46. }
  47. auto ParseTree::Parser::ConsumeIf(TokenKind kind)
  48. -> llvm::Optional<TokenizedBuffer::Token> {
  49. if (tokens.GetKind(*position) != kind) {
  50. return {};
  51. }
  52. return Consume(kind);
  53. }
  54. auto ParseTree::Parser::AddLeafNode(ParseNodeKind kind,
  55. TokenizedBuffer::Token token) -> Node {
  56. Node n(tree.node_impls.size());
  57. tree.node_impls.push_back(NodeImpl(kind, token, /*subtree_size_arg=*/1));
  58. return n;
  59. }
  60. auto ParseTree::Parser::ConsumeAndAddLeafNodeIf(TokenKind t_kind,
  61. ParseNodeKind n_kind)
  62. -> llvm::Optional<Node> {
  63. auto t = ConsumeIf(t_kind);
  64. if (!t) {
  65. return {};
  66. }
  67. return AddLeafNode(n_kind, *t);
  68. }
  69. auto ParseTree::Parser::MarkNodeError(Node n) -> void {
  70. tree.node_impls[n.index].has_error = true;
  71. tree.has_errors = true;
  72. }
// A marker for the start of a node's subtree.
//
// This is used to track the size of the node's subtree and ensure at least one
// parse node is added. It can be used repeatedly if multiple subtrees start at
// the same position.
struct ParseTree::Parser::SubtreeStart {
  // The tree size recorded when the subtree began; AddNode subtracts this
  // from the tree size at completion to compute the subtree's node count.
  int tree_size;
  // Set by AddNode when a node completes this subtree. The destructor asserts
  // it, so a started subtree can never be silently abandoned without
  // producing a parse node.
  bool node_added = false;
  ~SubtreeStart() {
    assert(node_added && "Never added a node for a subtree region!");
  }
};
  85. auto ParseTree::Parser::StartSubtree() -> SubtreeStart {
  86. return {static_cast<int>(tree.node_impls.size())};
  87. }
  88. auto ParseTree::Parser::AddNode(ParseNodeKind n_kind, TokenizedBuffer::Token t,
  89. SubtreeStart& start, bool has_error) -> Node {
  90. // The size of the subtree is the change in size from when we started this
  91. // subtree to now, but including the node we're about to add.
  92. int tree_stop_size = static_cast<int>(tree.node_impls.size()) + 1;
  93. int subtree_size = tree_stop_size - start.tree_size;
  94. Node n(tree.node_impls.size());
  95. tree.node_impls.push_back(NodeImpl(n_kind, t, subtree_size));
  96. if (has_error) {
  97. MarkNodeError(n);
  98. }
  99. start.node_added = true;
  100. return n;
  101. }
  102. auto ParseTree::Parser::SkipMatchingGroup() -> bool {
  103. TokenizedBuffer::Token t = *position;
  104. TokenKind t_kind = tokens.GetKind(t);
  105. if (!t_kind.IsOpeningSymbol()) {
  106. return false;
  107. }
  108. SkipTo(tokens.GetMatchedClosingToken(t));
  109. Consume(t_kind.GetClosingSymbol());
  110. return true;
  111. }
// Skips forward so that `t` becomes the current token. Skips may only move
// forward, and must never move past the EOF token.
auto ParseTree::Parser::SkipTo(TokenizedBuffer::Token t) -> void {
  assert(t >= *position && "Tried to skip backwards.");
  position = TokenizedBuffer::TokenIterator(t);
  assert(position != end && "Skipped past EOF.");
}
// Error recovery: skips forward from `skip_root` looking for a likely end of
// the current declaration. Stops on one of four conditions: a semicolon (a
// node is added for it and returned), an unmatched close curly brace (pops
// back up a level of grouping), a token on a new line at the root's indent or
// less, or end of file. Returns the semicolon's node only when one was found;
// otherwise an empty optional.
auto ParseTree::Parser::SkipPastLikelyDeclarationEnd(
    TokenizedBuffer::Token skip_root, bool is_inside_declaration)
    -> llvm::Optional<Node> {
  if (AtEndOfFile()) {
    return {};
  }
  TokenizedBuffer::Line root_line = tokens.GetLine(skip_root);
  int root_line_indent = tokens.GetIndentColumnNumber(root_line);
  // We will keep scanning through tokens on the same line as the root or
  // lines with greater indentation than root's line.
  auto is_same_line_or_indent_greater_than_root =
      [&](TokenizedBuffer::Token t) {
        TokenizedBuffer::Line l = tokens.GetLine(t);
        if (l == root_line) {
          return true;
        }
        return tokens.GetIndentColumnNumber(l) > root_line_indent;
      };
  do {
    TokenKind current_kind = tokens.GetKind(*position);
    if (current_kind == TokenKind::CloseCurlyBrace()) {
      // Immediately bail out if we hit an unmatched close curly, this will
      // pop us up a level of the syntax grouping.
      return {};
    }
    // If we find a semicolon, parse it and add a corresponding node. If we're
    // inside of a declaration, this is a declaration ending semicolon,
    // otherwise it simply forms an empty declaration.
    if (auto end_node = ConsumeAndAddLeafNodeIf(
            TokenKind::Semi(), is_inside_declaration
                                   ? ParseNodeKind::DeclarationEnd()
                                   : ParseNodeKind::EmptyDeclaration())) {
      return end_node;
    }
    // Skip over any matching group of tokens.
    if (SkipMatchingGroup()) {
      continue;
    }
    // Otherwise just step forward one token.
    Consume(current_kind);
  } while (!AtEndOfFile() &&
           is_same_line_or_indent_greater_than_root(*position));
  return {};
}
  161. auto ParseTree::Parser::ParseFunctionSignature() -> Node {
  162. TokenizedBuffer::Token open_paren = Consume(TokenKind::OpenParen());
  163. auto start = StartSubtree();
  164. // FIXME: Add support for parsing parameters.
  165. bool has_errors = false;
  166. if (tokens.GetKind(*position) != TokenKind::CloseParen()) {
  167. llvm::errs() << "ERROR: unexpected token before the close of the "
  168. "parameters on line "
  169. << tokens.GetLineNumber(*position) << "!\n";
  170. has_errors = true;
  171. // We can trivially skip to the actual close parenthesis from here.
  172. SkipTo(tokens.GetMatchedClosingToken(open_paren));
  173. }
  174. AddLeafNode(ParseNodeKind::ParameterListEnd(),
  175. Consume(TokenKind::CloseParen()));
  176. // FIXME: Implement parsing of a return type.
  177. return AddNode(ParseNodeKind::ParameterList(), open_paren, start, has_errors);
  178. }
// Parses a curly-brace-delimited code block, recursing into nested blocks.
// FIXME: statements and expressions within the block are not parsed yet; any
// unexpected token is diagnosed and the remainder of the block is skipped.
auto ParseTree::Parser::ParseCodeBlock() -> Node {
  TokenizedBuffer::Token open_curly = Consume(TokenKind::OpenCurlyBrace());
  auto start = StartSubtree();
  bool has_errors = false;
  // Loop over all the different possibly nested elements in the code block.
  for (;;) {
    switch (tokens.GetKind(*position)) {
      default:
        // FIXME: Add support for parsing more expressions & statements.
        llvm::errs() << "ERROR: unexpected token before the close of the "
                        "function definition on line "
                     << tokens.GetLineNumber(*position) << "!\n";
        has_errors = true;
        // We can trivially skip to the actual close curly brace from here.
        SkipTo(tokens.GetMatchedClosingToken(open_curly));
        // Now fall through to the close curly brace handling code.
        LLVM_FALLTHROUGH;
      case TokenKind::CloseCurlyBrace():
        // Leaves the switch; the `break` below then exits the loop.
        break;
      case TokenKind::OpenCurlyBrace():
        // FIXME: We should consider avoiding recursion here with some side
        // stack.
        ParseCodeBlock();
        continue;
    }
    // We only continue looping with `continue` above.
    break;
  }
  // We always reach here having set our position in the token stream to the
  // close curly brace.
  AddLeafNode(ParseNodeKind::CodeBlockEnd(),
              Consume(TokenKind::CloseCurlyBrace()));
  return AddNode(ParseNodeKind::CodeBlock(), open_curly, start, has_errors);
}
// Parses a function declaration beginning at the `fn` keyword: the name, the
// parenthesized signature, and then either a `{...}` definition body or a
// terminating semicolon. On any error, emits a diagnostic, skips to a likely
// end of the declaration, and returns a FunctionDeclaration node marked as
// having an error.
auto ParseTree::Parser::ParseFunctionDeclaration() -> Node {
  TokenizedBuffer::Token function_intro_token = Consume(TokenKind::FnKeyword());
  auto start = StartSubtree();
  // Finishes the declaration as an error node once recovery has positioned us
  // past the broken declaration.
  auto add_error_function_node = [&] {
    return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
                   start, /*has_error=*/true);
  };
  auto name_n = ConsumeAndAddLeafNodeIf(TokenKind::Identifier(),
                                        ParseNodeKind::Identifier());
  if (!name_n) {
    llvm::errs() << "ERROR: Function declaration with no name on line "
                 << tokens.GetLineNumber(function_intro_token) << "!\n";
    // FIXME: We could change the lexer to allow us to synthesize certain
    // kinds of tokens and try to "recover" here, but unclear that this is
    // really useful.
    // NOTE(review): this call relies on a default for the second
    // (`is_inside_declaration`) argument declared in the header --
    // presumably `true`; confirm against parser_impl.h.
    SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }
  TokenizedBuffer::Token open_paren = *position;
  if (tokens.GetKind(open_paren) != TokenKind::OpenParen()) {
    llvm::errs()
        << "ERROR: Missing open parentheses in declaration of function '"
        << tokens.GetTokenText(tree.GetNodeToken(*name_n)) << "' on line "
        << tokens.GetLineNumber(function_intro_token) << "!\n";
    SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }
  // Record the close paren before parsing the signature, so we can verify the
  // signature parse consumed exactly through it.
  TokenizedBuffer::Token close_paren =
      tokens.GetMatchedClosingToken(open_paren);
  Node signature_n = ParseFunctionSignature();
  assert(*std::prev(position) == close_paren &&
         "Should have parsed through the close paren, whether successfully "
         "or with errors.");
  if (tree.node_impls[signature_n.index].has_error) {
    // Don't try to parse more of the function declaration, but consume a
    // declaration ending semicolon if found (without going to a new line).
    SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }
  // See if we should parse a definition which is represented as a code block.
  if (tokens.GetKind(*position) == TokenKind::OpenCurlyBrace()) {
    ParseCodeBlock();
  } else if (!ConsumeAndAddLeafNodeIf(TokenKind::Semi(),
                                      ParseNodeKind::DeclarationEnd())) {
    llvm::errs() << "ERROR: Function declaration not terminated by a "
                    "semicolon on line "
                 << tokens.GetLineNumber(close_paren) << "!\n";
    if (tokens.GetLine(*position) == tokens.GetLine(close_paren)) {
      // Only need to skip if we've not already found a new line.
      SkipPastLikelyDeclarationEnd(function_intro_token);
    }
    return add_error_function_node();
  }
  // Successfully parsed the function, add that node.
  return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
                 start);
}
  270. auto ParseTree::Parser::ParseEmptyDeclaration() -> Node {
  271. return AddLeafNode(ParseNodeKind::EmptyDeclaration(),
  272. Consume(TokenKind::Semi()));
  273. }
  274. auto ParseTree::Parser::ParseDeclaration() -> llvm::Optional<Node> {
  275. TokenizedBuffer::Token t = *position;
  276. switch (tokens.GetKind(t)) {
  277. case TokenKind::FnKeyword():
  278. return ParseFunctionDeclaration();
  279. case TokenKind::Semi():
  280. return ParseEmptyDeclaration();
  281. case TokenKind::EndOfFile():
  282. return llvm::None;
  283. default:
  284. // Errors are handled outside the switch.
  285. break;
  286. }
  287. // We didn't recognize an introducer for a valid declaration.
  288. llvm::errs() << "ERROR: Unrecognized declaration introducer '"
  289. << tokens.GetTokenText(t) << "' on line "
  290. << tokens.GetLineNumber(t) << "!\n";
  291. // Skip forward past any end of a declaration we simply didn't understand so
  292. // that we can find the start of the next declaration or the end of a scope.
  293. if (auto found_semi_n =
  294. SkipPastLikelyDeclarationEnd(t, /*is_inside_declaration=*/false)) {
  295. MarkNodeError(*found_semi_n);
  296. return *found_semi_n;
  297. }
  298. // Nothing, not even a semicolon found. We still need to mark that an error
  299. // occurred though.
  300. tree.has_errors = true;
  301. return {};
  302. }
  303. } // namespace Carbon