Pārlūkot izejas kodu

Update comment lexing to match #198. (#292)

Require a space after a `//`, and recognize but reject a `//` that follows non-whitespace text on the same line. Remove the DocComment token kind, which ended up not being part of the design.
Richard Smith 5 gadi atpakaļ
vecāks
revīzija
bbaa0e788c
3 mainītis faili ar 58 papildinājumiem un 80 dzēšanām
  1. 0 1
      lexer/token_registry.def
  2. 42 23
      lexer/tokenized_buffer.cpp
  3. 16 56
      lexer/tokenized_buffer_test.cpp

+ 0 - 1
lexer/token_registry.def

@@ -156,7 +156,6 @@ CARBON_KEYWORD_TOKEN(XorKeyword,        "xor")
 
 CARBON_TOKEN(Identifier)
 CARBON_TOKEN(IntegerLiteral)
-CARBON_TOKEN(DocComment)
 CARBON_TOKEN(Error)
 
 #undef CARBON_TOKEN

+ 42 - 23
lexer/tokenized_buffer.cpp

@@ -35,6 +35,28 @@ static auto TakeLeadingIntegerLiteral(llvm::StringRef source_text)
       [](char c) { return llvm::isAlnum(c) || c == '_'; });
 }
 
+struct TrailingComment {
+  static constexpr llvm::StringLiteral ShortName = "syntax-comments";
+  static constexpr llvm::StringLiteral Message =
+      "Trailing comments are not permitted.";
+
+  struct Substitutions {};
+  static auto Format(const Substitutions&) -> std::string {
+    return Message.str();
+  }
+};
+
+struct NoWhitespaceAfterCommentIntroducer {
+  static constexpr llvm::StringLiteral ShortName = "syntax-comments";
+  static constexpr llvm::StringLiteral Message =
+      "Whitespace is required after '//'.";
+
+  struct Substitutions {};
+  static auto Format(const Substitutions&) -> std::string {
+    return Message.str();
+  }
+};
+
 struct UnmatchedClosing {
   static constexpr llvm::StringLiteral ShortName = "syntax-balanced-delimiters";
   static constexpr llvm::StringLiteral Message =
@@ -138,6 +160,12 @@ struct UnrecognizedCharacters {
   }
 };
 
+// TODO(zygoloid): Update this to match whatever we decide qualifies as
+// acceptable whitespace.
+static bool isSpace(char c) {
+  return c == ' ' || c == '\n' || c == '\t';
+}
+
 // Implementation of the lexer logic itself.
 //
 // The design is that lexing can loop over the source buffer, consuming it into
@@ -166,21 +194,19 @@ class TokenizedBuffer::Lexer {
   auto SkipWhitespace(llvm::StringRef& source_text) -> bool {
     while (!source_text.empty()) {
       // We only support line-oriented commenting and lex comments as-if they
-      // were whitespace. Any comment must be the only non-whitespace on the
-      // line.
-      if (source_text.startswith("//") && !set_indent) {
-        // Check if the comment has a special starting sequence of three slashes
-        // followed by a space. This represents a documentation comment that is
-        // preserved as a token in the buffer. When parsing, these comments will
-        // only be accepted in specific parts of the grammar and will be
-        // associated with the parsed constructs as structure documentation. All
-        // other comments are simply treated as whitespace.
-        if (source_text.startswith("///")) {
-          current_line_info->indent = current_column;
-          set_indent = true;
-          buffer.AddToken({.kind = TokenKind::DocComment(),
-                           .token_line = current_line,
-                           .column = current_column});
+      // were whitespace.
+      if (source_text.startswith("//")) {
+        // Any comment must be the only non-whitespace on the line.
+        if (set_indent) {
+          emitter.EmitError<TrailingComment>(
+              [](TrailingComment::Substitutions&) {});
+          buffer.has_errors = true;
+        }
+        // The introducer '//' must be followed by whitespace or EOF.
+        if (source_text.size() > 2 && !isSpace(source_text[2])) {
+          emitter.EmitError<NoWhitespaceAfterCommentIntroducer>(
+              [](NoWhitespaceAfterCommentIntroducer::Substitutions&) {});
+          buffer.has_errors = true;
         }
         while (!source_text.empty() && source_text.front() != '\n') {
           ++current_column;
@@ -195,6 +221,7 @@ class TokenizedBuffer::Lexer {
         default:
           // If we find a non-whitespace character without exhausting the
           // buffer, return true to continue lexing.
+          assert(!isSpace(source_text.front()));
           return true;
 
         case '\n':
@@ -631,14 +658,6 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
     return source->Text().substr(token_start, token_info.error_length);
   }
 
-  // Documentation comment tokens refer back to the source text.
-  if (token_info.kind == TokenKind::DocComment()) {
-    auto& line_info = GetLineInfo(token_info.token_line);
-    int64_t token_start = line_info.start + token_info.column;
-    int64_t token_stop = line_info.start + line_info.length;
-    return source->Text().slice(token_start, token_stop);
-  }
-
   // Refer back to the source text to preserve oddities like radix or digit
   // separators the author included.
   if (token_info.kind == TokenKind::IntegerLiteral()) {

+ 16 - 56
lexer/tokenized_buffer_test.cpp

@@ -520,67 +520,27 @@ TEST_F(LexerTest, Comments) {
   EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
 
   // Make sure weird characters aren't a problem.
-  buffer = Lex("  //foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
+  buffer = Lex("  // foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
   EXPECT_FALSE(buffer.HasErrors());
   EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
-}
-
-TEST_F(LexerTest, DocComments) {
-  auto buffer = Lex("  /// foo");
-  EXPECT_FALSE(buffer.HasErrors());
-  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                          {.kind = TokenKind::DocComment(),
-                           .line = 1,
-                           .column = 3,
-                           .indent_column = 3,
-                           .text = "/// foo"},
-                      }));
-
-  buffer = Lex("/// foo\n//\n/// bar");
-  EXPECT_FALSE(buffer.HasErrors());
-  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                          {.kind = TokenKind::DocComment(),
-                           .line = 1,
-                           .column = 1,
-                           .indent_column = 1,
-                           .text = "/// foo"},
-                          {.kind = TokenKind::DocComment(),
-                           .line = 3,
-                           .column = 1,
-                           .indent_column = 1,
-                           .text = "/// bar"},
-                      }));
 
-  buffer = Lex("/// foo\n///\n/// bar");
+  // Make sure we can lex a comment at the end of the input.
+  buffer = Lex("//");
   EXPECT_FALSE(buffer.HasErrors());
-  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                          {.kind = TokenKind::DocComment(),
-                           .line = 1,
-                           .column = 1,
-                           .indent_column = 1,
-                           .text = "/// foo"},
-                          {.kind = TokenKind::DocComment(),
-                           .line = 2,
-                           .column = 1,
-                           .indent_column = 1,
-                           .text = "///"},
-                          {.kind = TokenKind::DocComment(),
-                           .line = 3,
-                           .column = 1,
-                           .indent_column = 1,
-                           .text = "/// bar"},
-                      }));
+  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
+}
 
-  // Make sure weird characters aren't a problem.
-  buffer = Lex("  ///foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
-  EXPECT_FALSE(buffer.HasErrors());
-  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                          {.kind = TokenKind::DocComment(),
-                           .line = 1,
-                           .column = 3,
-                           .indent_column = 3,
-                           .text = "///foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]"},
-                      }));
+TEST_F(LexerTest, InvalidComments) {
+  llvm::StringLiteral testcases[] = {
+      "  /// foo\n",
+      "foo // bar\n",
+      "//! hello",
+      " //world",
+  };
+  for (llvm::StringLiteral testcase : testcases) {
+    auto buffer = Lex(testcase);
+    EXPECT_TRUE(buffer.HasErrors());
+  }
 }
 
 TEST_F(LexerTest, Identifiers) {