Jelajahi Sumber

Factor out character set handling into a separate file. (#352)

Richard Smith 5 tahun lalu
induk
melakukan
97bb6f4e80
5 file diubah dengan 108 penambahan dan 38 penghapusan
  1. 10 1
      lexer/BUILD
  2. 75 0
      lexer/character_set.h
  3. 6 8
      lexer/numeric_literal.cpp
  4. 8 17
      lexer/string_literal.cpp
  5. 9 12
      lexer/tokenized_buffer.cpp

+ 10 - 1
lexer/BUILD

@@ -26,11 +26,18 @@ cc_test(
     ],
     ],
 )
 )
 
 
+cc_library(
+    name = "character_set",
+    hdrs = ["character_set.h"],
+    deps = ["@llvm-project//llvm:Support"],
+)
+
 cc_library(
 cc_library(
     name = "numeric_literal",
     name = "numeric_literal",
     srcs = ["numeric_literal.cpp"],
     srcs = ["numeric_literal.cpp"],
     hdrs = ["numeric_literal.h"],
     hdrs = ["numeric_literal.h"],
     deps = [
     deps = [
+        ":character_set",
         "//diagnostics:diagnostic_emitter",
         "//diagnostics:diagnostic_emitter",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Support",
     ],
     ],
@@ -54,6 +61,7 @@ cc_library(
     srcs = ["string_literal.cpp"],
     srcs = ["string_literal.cpp"],
     hdrs = ["string_literal.h"],
     hdrs = ["string_literal.h"],
     deps = [
     deps = [
+        ":character_set",
         "//diagnostics:diagnostic_emitter",
         "//diagnostics:diagnostic_emitter",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Support",
     ],
     ],
@@ -77,9 +85,10 @@ cc_library(
     srcs = ["tokenized_buffer.cpp"],
     srcs = ["tokenized_buffer.cpp"],
     hdrs = ["tokenized_buffer.h"],
     hdrs = ["tokenized_buffer.h"],
     deps = [
     deps = [
+        ":character_set",
+        ":numeric_literal",
         ":string_literal",
         ":string_literal",
         ":token_kind",
         ":token_kind",
-        ":numeric_literal",
         "//diagnostics:diagnostic_emitter",
         "//diagnostics:diagnostic_emitter",
         "//source:source_buffer",
         "//source:source_buffer",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Support",

+ 75 - 0
lexer/character_set.h

@@ -0,0 +1,75 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef LEXER_CHARACTER_SET_H_
+#define LEXER_CHARACTER_SET_H_
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon {
+
+// TODO: These definitions need to be updated to match whatever Unicode lexical
+// rules we pick. The function interfaces will need to change to accommodate
+// multi-byte characters.
+
+// Is this an alphabetical character according to Carbon's lexical rules?
+//
+// Alphabetical characters are permitted at the start of identifiers. This
+// currently includes 'A'..'Z' and 'a'..'z'.
+inline bool IsAlpha(char c) { return llvm::isAlpha(c); }
+
+// Is this a decimal digit according to Carbon's lexical rules?
+//
+// This currently includes '0'..'9'.
+inline bool IsDecimalDigit(char c) { return llvm::isDigit(c); }
+
+// Is this an alphanumeric character according to Carbon's lexical rules?
+//
+// Alphanumeric characters are permitted as trailing characters in identifiers
+// and numeric literals. This includes alphabetical characters plus decimal
+// digits.
+//
+// Note that '_' is not considered alphanumeric, even though it is usually a
+// valid continuation character of an identifier or numeric literal.
+inline bool IsAlnum(char c) { return llvm::isAlnum(c); }
+
+// Is this an uppercase hexadecimal digit according to Carbon's lexical rules?
+//
+// Hexadecimal digits are permitted in `0x`-prefixed literals, as well as after
+// a `\x` escape sequence.
+//
+// Note that lowercase 'a'..'f' are currently not considered hexadecimal digits
+// in any context.
+inline bool IsUpperHexDigit(char c) {
+  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F');
+}
+
+// Is this a lowercase letter ('a'..'z')?
+//
+// Lowercase letters in numeric literals can be followed by `+` or `-` to
+// extend the literal.
+inline bool IsLower(char c) { return 'a' <= c && c <= 'z'; }
+
+// Is this character considered to be horizontal whitespace?
+//
+// Such characters can appear in the indentation of a line.
+inline bool IsHorizontalWhitespace(char c) { return c == ' ' || c == '\t'; }
+
+// Is this character considered to be vertical whitespace?
+//
+// Such characters are considered to terminate lines.
+inline bool IsVerticalWhitespace(char c) { return c == '\n'; }
+
+// Is this character considered to be whitespace?
+//
+// Changes here will need matching changes in
+// `TokenizedBuffer::Lexer::SkipWhitespace`.
+inline bool IsSpace(char c) {
+  return IsHorizontalWhitespace(c) || IsVerticalWhitespace(c);
+}
+
+}  // namespace Carbon
+
+#endif  // LEXER_CHARACTER_SET_H_

+ 6 - 8
lexer/numeric_literal.cpp

@@ -6,6 +6,7 @@
 
 
 #include <bitset>
 #include <bitset>
 
 
+#include "lexer/character_set.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/FormatVariadic.h"
 
 
@@ -79,13 +80,11 @@ struct WrongRealLiteralExponent {
 };
 };
 }  // namespace
 }  // namespace
 
 
-static bool isLower(char c) { return 'a' <= c && c <= 'z'; }
-
 auto NumericLiteralToken::Lex(llvm::StringRef source_text)
 auto NumericLiteralToken::Lex(llvm::StringRef source_text)
     -> llvm::Optional<NumericLiteralToken> {
     -> llvm::Optional<NumericLiteralToken> {
   NumericLiteralToken result;
   NumericLiteralToken result;
 
 
-  if (source_text.empty() || !llvm::isDigit(source_text.front())) {
+  if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
     return llvm::None;
     return llvm::None;
   }
   }
 
 
@@ -101,8 +100,8 @@ auto NumericLiteralToken::Lex(llvm::StringRef source_text)
   int i = 1, n = source_text.size();
   int i = 1, n = source_text.size();
   for (; i != n; ++i) {
   for (; i != n; ++i) {
     char c = source_text[i];
     char c = source_text[i];
-    if (llvm::isAlnum(c) || c == '_') {
-      if (isLower(c) && seen_radix_point && !seen_plus_minus) {
+    if (IsAlnum(c) || c == '_') {
+      if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
         result.exponent = i;
         result.exponent = i;
         seen_potential_exponent = true;
         seen_potential_exponent = true;
       }
       }
@@ -111,7 +110,7 @@ auto NumericLiteralToken::Lex(llvm::StringRef source_text)
 
 
     // Exactly one `.` can be part of the literal, but only if it's followed by
     // Exactly one `.` can be part of the literal, but only if it's followed by
     // an alphanumeric character.
     // an alphanumeric character.
-    if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
+    if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
         !seen_radix_point) {
         !seen_radix_point) {
       result.radix_point = i;
       result.radix_point = i;
       seen_radix_point = true;
       seen_radix_point = true;
@@ -123,8 +122,7 @@ auto NumericLiteralToken::Lex(llvm::StringRef source_text)
     // followed by an alphanumeric character. This '+' or '-' cannot be an
     // followed by an alphanumeric character. This '+' or '-' cannot be an
     // operator because a literal cannot end in a lowercase letter.
     // operator because a literal cannot end in a lowercase letter.
     if ((c == '+' || c == '-') && seen_potential_exponent &&
     if ((c == '+' || c == '-') && seen_potential_exponent &&
-        result.exponent == i - 1 && i + 1 != n &&
-        llvm::isAlnum(source_text[i + 1])) {
+        result.exponent == i - 1 && i + 1 != n && IsAlnum(source_text[i + 1])) {
       // This is not possible because we don't update result.exponent after we
       // This is not possible because we don't update result.exponent after we
       // see a '+' or '-'.
       // see a '+' or '-'.
       assert(!seen_plus_minus && "should only consume one + or -");
       assert(!seen_plus_minus && "should only consume one + or -");

+ 8 - 17
lexer/string_literal.cpp

@@ -4,6 +4,7 @@
 
 
 #include "lexer/string_literal.h"
 #include "lexer/string_literal.h"
 
 
+#include "lexer/character_set.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ConvertUTF.h"
@@ -71,16 +72,6 @@ struct MismatchedIndentInString : SimpleDiagnostic<MismatchedIndentInString> {
       "string literal.";
       "string literal.";
 };
 };
 
 
-// TODO(zygoloid): Update this to match whatever we decide qualifies as
-// acceptable whitespace.
-static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
-
-static constexpr llvm::StringLiteral HorizontalWhitespace = " \t";
-
-static bool isUpperHexDigit(char c) {
-  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F');
-}
-
 // Find and return the opening characters of a multi-line string literal,
 // Find and return the opening characters of a multi-line string literal,
 // after any '#'s, including the file type indicator and following newline.
 // after any '#'s, including the file type indicator and following newline.
 static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
 static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
@@ -166,7 +157,7 @@ static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
       int indent_start = i + 1;
       int indent_start = i + 1;
       return text.substr(indent_start, indent_end - indent_start);
       return text.substr(indent_start, indent_end - indent_start);
     }
     }
-    if (!isSpace(text[i])) {
+    if (!IsSpace(text[i])) {
       indent_end = i;
       indent_end = i;
     }
     }
   }
   }
@@ -264,14 +255,14 @@ static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
       return true;
       return true;
     case '0':
     case '0':
       result += '\0';
       result += '\0';
-      if (!content.empty() && llvm::isDigit(content.front())) {
+      if (!content.empty() && IsDecimalDigit(content.front())) {
         emitter.EmitError<DecimalEscapeSequence>();
         emitter.EmitError<DecimalEscapeSequence>();
         return false;
         return false;
       }
       }
       return true;
       return true;
     case 'x':
     case 'x':
-      if (content.size() >= 2 && isUpperHexDigit(content[0]) &&
-          isUpperHexDigit(content[1])) {
+      if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
+          IsUpperHexDigit(content[1])) {
         result +=
         result +=
             static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
             static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
         content = content.drop_front(2);
         content = content.drop_front(2);
@@ -282,7 +273,7 @@ static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
     case 'u': {
     case 'u': {
       llvm::StringRef remaining = content;
       llvm::StringRef remaining = content;
       if (remaining.consume_front("{")) {
       if (remaining.consume_front("{")) {
-        llvm::StringRef digits = remaining.take_while(isUpperHexDigit);
+        llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
         remaining = remaining.drop_front(digits.size());
         remaining = remaining.drop_front(digits.size());
         if (!digits.empty() && remaining.consume_front("}")) {
         if (!digits.empty() && remaining.consume_front("}")) {
           if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
           if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
@@ -326,7 +317,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
     // whitespace) is required to start with the string's indent. For error
     // whitespace) is required to start with the string's indent. For error
     // recovery, remove all leading whitespace if the indent doesn't match.
     // recovery, remove all leading whitespace if the indent doesn't match.
     if (!contents.consume_front(indent)) {
     if (!contents.consume_front(indent)) {
-      contents = contents.ltrim(HorizontalWhitespace);
+      contents = contents.drop_while(IsHorizontalWhitespace);
       if (!contents.startswith("\n")) {
       if (!contents.startswith("\n")) {
         emitter.EmitError<MismatchedIndentInString>();
         emitter.EmitError<MismatchedIndentInString>();
         has_errors = true;
         has_errors = true;
@@ -347,7 +338,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
         // Trailing whitespace before a newline doesn't contribute to the string
         // Trailing whitespace before a newline doesn't contribute to the string
         // literal value.
         // literal value.
         while (!result.empty() && result.back() != '\n' &&
         while (!result.empty() && result.back() != '\n' &&
-               isSpace(result.back())) {
+               IsSpace(result.back())) {
           result.pop_back();
           result.pop_back();
         }
         }
         result += '\n';
         result += '\n';

+ 9 - 12
lexer/tokenized_buffer.cpp

@@ -9,6 +9,7 @@
 #include <iterator>
 #include <iterator>
 #include <string>
 #include <string>
 
 
+#include "lexer/character_set.h"
 #include "lexer/numeric_literal.h"
 #include "lexer/numeric_literal.h"
 #include "lexer/string_literal.h"
 #include "lexer/string_literal.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -53,10 +54,6 @@ struct UnrecognizedCharacters : SimpleDiagnostic<UnrecognizedCharacters> {
       "Encountered unrecognized characters while parsing.";
       "Encountered unrecognized characters while parsing.";
 };
 };
 
 
-// TODO(zygoloid): Update this to match whatever we decide qualifies as
-// acceptable whitespace.
-static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
-
 // Implementation of the lexer logic itself.
 // Implementation of the lexer logic itself.
 //
 //
 // The design is that lexing can loop over the source buffer, consuming it into
 // The design is that lexing can loop over the source buffer, consuming it into
@@ -110,8 +107,8 @@ class TokenizedBuffer::Lexer {
   auto HandleNewline() -> void {
   auto HandleNewline() -> void {
     current_line_info->length = current_column;
     current_line_info->length = current_column;
 
 
-    current_line = buffer.AddLine(
-        {current_line_info->start + current_column + 1, 0, 0});
+    current_line =
+        buffer.AddLine({current_line_info->start + current_column + 1, 0, 0});
     current_line_info = &buffer.GetLineInfo(current_line);
     current_line_info = &buffer.GetLineInfo(current_line);
     current_column = 0;
     current_column = 0;
     set_indent = false;
     set_indent = false;
@@ -128,7 +125,7 @@ class TokenizedBuffer::Lexer {
           buffer.has_errors = true;
           buffer.has_errors = true;
         }
         }
         // The introducer '//' must be followed by whitespace or EOF.
         // The introducer '//' must be followed by whitespace or EOF.
-        if (source_text.size() > 2 && !isSpace(source_text[2])) {
+        if (source_text.size() > 2 && !IsSpace(source_text[2])) {
           emitter.EmitError<NoWhitespaceAfterCommentIntroducer>();
           emitter.EmitError<NoWhitespaceAfterCommentIntroducer>();
           buffer.has_errors = true;
           buffer.has_errors = true;
         }
         }
@@ -145,7 +142,7 @@ class TokenizedBuffer::Lexer {
         default:
         default:
           // If we find a non-whitespace character without exhausting the
           // If we find a non-whitespace character without exhausting the
           // buffer, return true to continue lexing.
           // buffer, return true to continue lexing.
-          assert(!isSpace(source_text.front()));
+          assert(!IsSpace(source_text.front()));
           return true;
           return true;
 
 
         case '\n':
         case '\n':
@@ -383,7 +380,7 @@ class TokenizedBuffer::Lexer {
   }
   }
 
 
   auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
   auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
-    if (!llvm::isAlpha(source_text.front()) && source_text.front() != '_') {
+    if (!IsAlpha(source_text.front()) && source_text.front() != '_') {
       return LexResult::NoMatch();
       return LexResult::NoMatch();
     }
     }
 
 
@@ -393,8 +390,8 @@ class TokenizedBuffer::Lexer {
     }
     }
 
 
     // Take the valid characters off the front of the source buffer.
     // Take the valid characters off the front of the source buffer.
-    llvm::StringRef identifier_text = source_text.take_while(
-        [](char c) { return llvm::isAlnum(c) || c == '_'; });
+    llvm::StringRef identifier_text =
+        source_text.take_while([](char c) { return IsAlnum(c) || c == '_'; });
     assert(!identifier_text.empty() && "Must have at least one character!");
     assert(!identifier_text.empty() && "Must have at least one character!");
     int identifier_column = current_column;
     int identifier_column = current_column;
     current_column += identifier_text.size();
     current_column += identifier_text.size();
@@ -420,7 +417,7 @@ class TokenizedBuffer::Lexer {
 
 
   auto LexError(llvm::StringRef& source_text) -> LexResult {
   auto LexError(llvm::StringRef& source_text) -> LexResult {
     llvm::StringRef error_text = source_text.take_while([](char c) {
     llvm::StringRef error_text = source_text.take_while([](char c) {
-      if (llvm::isAlnum(c)) {
+      if (IsAlnum(c)) {
         return false;
         return false;
       }
       }
       switch (c) {
       switch (c) {