5 tahun lalu · 97bb6f4e80
--- a/lexer/BUILD
+++ b/lexer/BUILD
@@ -26,11 +26,18 @@ cc_test(
 
															     ],
														
 
															 )
														
 
															+cc_library(
														
 
															+    name = "character_set",
														
 
															+    hdrs = ["character_set.h"],
														
 
															+    deps = ["@llvm-project//llvm:Support"],
														
 
															+)
														
 
															+
														
 
															 cc_library(
														
 
															     name = "numeric_literal",
														
 
															     srcs = ["numeric_literal.cpp"],
														
 
															     hdrs = ["numeric_literal.h"],
														
 
															     deps = [
														
 
															+        ":character_set",
														
 
															         "//diagnostics:diagnostic_emitter",
														
 
															         "@llvm-project//llvm:Support",
														
 
															     ],
														
@@ -54,6 +61,7 @@ cc_library(
 
															     srcs = ["string_literal.cpp"],
														
 
															     hdrs = ["string_literal.h"],
														
 
															     deps = [
														
 
															+        ":character_set",
														
 
															         "//diagnostics:diagnostic_emitter",
														
 
															         "@llvm-project//llvm:Support",
														
 
															     ],
														
@@ -77,9 +85,10 @@ cc_library(
 
															     srcs = ["tokenized_buffer.cpp"],
														
 
															     hdrs = ["tokenized_buffer.h"],
														
 
															     deps = [
														
 
															+        ":character_set",
														
 
															+        ":numeric_literal",
														
 
															         ":string_literal",
														
 
															         ":token_kind",
														
 
															-        ":numeric_literal",
														
 
															         "//diagnostics:diagnostic_emitter",
														
 
															         "//source:source_buffer",
														
 
															         "@llvm-project//llvm:Support",
														
--- a/lexer/character_set.h
+++ b/lexer/character_set.h
@@ -0,0 +1,75 @@
 
															+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
														
 
															+// Exceptions. See /LICENSE for license information.
														
 
															+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
														
 
															+
														
 
															+#ifndef LEXER_CHARACTER_SET_H_
														
 
															+#define LEXER_CHARACTER_SET_H_
														
 
															+
														
 
															+#include "llvm/ADT/StringExtras.h"
														
 
															+#include "llvm/ADT/StringRef.h"
														
 
															+
														
 
															+namespace Carbon {
														
 
															+
														
 
															+// TODO: These definitions need to be updated to match whatever Unicode lexical
														
 
															+// rules we pick. The function interfaces will need to change to accommodate
														
 
															+// multi-byte characters.
														
 
															+
														
 
															+// Returns whether `c` is an alphabetical character under Carbon's lexical
														
 
															+// rules, currently the letters 'A'..'Z' and 'a'..'z' (via `llvm::isAlpha`).
														
 
															+// Alphabetical characters are permitted at the start of identifiers; '_' is
														
 
															+// not alphabetical, so callers that accept it must test for it separately.
														
 
															+inline bool IsAlpha(char c) { return llvm::isAlpha(c); }
														
 
															+
														
 
															+// Returns whether `c` is a decimal digit under Carbon's lexical rules.
														
 
															+//
														
 
															+// This currently includes '0'..'9', as determined by `llvm::isDigit`.
														
 
															+inline bool IsDecimalDigit(char c) { return llvm::isDigit(c); }
														
 
															+
														
 
															+// Returns whether `c` is alphanumeric under Carbon's lexical rules.
														
 
															+//
														
 
															+// Alphanumeric characters are permitted as trailing characters in identifiers
														
 
															+// and numeric literals. This includes alphabetical characters plus decimal
														
 
															+// digits, as determined by `llvm::isAlnum`.
														
 
															+//
														
 
															+// Note that '_' is not considered alphanumeric, despite in most circumstances
														
 
															+// being a valid continuation character of an identifier or numeric literal.
														
 
															+inline bool IsAlnum(char c) { return llvm::isAlnum(c); }
														
 
															+
														
 
															+// Returns whether `c` is a hexadecimal digit under Carbon's lexical rules,
														
 
															+// that is, one of '0'..'9' or uppercase 'A'..'F'.
														
 
															+// Hexadecimal digits are permitted in `0x`-prefixed literals, as well as in
														
 
															+// `\x` and `\u{...}` escape sequences.
														
 
															+//
														
 
															+// Note that lowercase 'a'..'f' are currently not considered hexadecimal digits
														
 
															+// in any context.
														
 
															+inline bool IsUpperHexDigit(char c) {
														
 
															+  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F');
														
 
															+}
														
 
															+
														
 
															+// Returns whether `c` is a lowercase letter in the range 'a'..'z'.
														
 
															+//
														
 
															+// Lowercase letters in numeric literals can be followed by `+` or `-` to
														
 
															+// extend the literal.
														
 
															+inline bool IsLower(char c) { return 'a' <= c && c <= 'z'; }
														
 
															+
														
 
															+// Returns whether `c` is horizontal whitespace: a space (' ') or a tab ('\t').
														
 
															+//
														
 
															+// Such characters can appear in the indentation of a line.
														
 
															+inline bool IsHorizontalWhitespace(char c) { return c == ' ' || c == '\t'; }
														
 
															+
														
 
															+// Returns whether `c` is vertical whitespace; currently only '\n' qualifies.
														
 
															+//
														
 
															+// Such characters are considered to terminate lines.
														
 
															+inline bool IsVerticalWhitespace(char c) { return c == '\n'; }
														
 
															+
														
 
															+// Returns whether `c` is whitespace: either horizontal or vertical whitespace.
														
 
															+//
														
 
															+// Changes here will need matching changes in
														
 
															+// `TokenizedBuffer::Lexer::SkipWhitespace`.
														
 
															+inline bool IsSpace(char c) {
														
 
															+  return IsHorizontalWhitespace(c) || IsVerticalWhitespace(c);
														
 
															+}
														
 
															+
														
 
															+}  // namespace Carbon
														
 
															+
														
 
															+#endif  // LEXER_CHARACTER_SET_H_
														
--- a/lexer/numeric_literal.cpp
+++ b/lexer/numeric_literal.cpp
@@ -6,6 +6,7 @@
 
															 #include <bitset>
														
 
															+#include "lexer/character_set.h"
														
 
															 #include "llvm/ADT/StringExtras.h"
														
 
															 #include "llvm/Support/FormatVariadic.h"
														
@@ -79,13 +80,11 @@ struct WrongRealLiteralExponent {
 
															 };
														
 
															 }  // namespace
														
 
															-static bool isLower(char c) { return 'a' <= c && c <= 'z'; }
														
 
															-
														
 
															 auto NumericLiteralToken::Lex(llvm::StringRef source_text)
														
 
															     -> llvm::Optional<NumericLiteralToken> {
														
 
															   NumericLiteralToken result;
														
 
															-  if (source_text.empty() || !llvm::isDigit(source_text.front())) {
														
 
															+  if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
														
 
															     return llvm::None;
														
 
															   }
														
@@ -101,8 +100,8 @@ auto NumericLiteralToken::Lex(llvm::StringRef source_text)
 
															   int i = 1, n = source_text.size();
														
 
															   for (; i != n; ++i) {
														
 
															     char c = source_text[i];
														
 
															-    if (llvm::isAlnum(c) || c == '_') {
														
 
															-      if (isLower(c) && seen_radix_point && !seen_plus_minus) {
														
 
															+    if (IsAlnum(c) || c == '_') {
														
 
															+      if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
														
 
															         result.exponent = i;
														
 
															         seen_potential_exponent = true;
														
 
															       }
														
@@ -111,7 +110,7 @@ auto NumericLiteralToken::Lex(llvm::StringRef source_text)
 
															     // Exactly one `.` can be part of the literal, but only if it's followed by
														
 
															     // an alphanumeric character.
														
 
															-    if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
														
 
															+    if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
														
 
															         !seen_radix_point) {
														
 
															       result.radix_point = i;
														
 
															       seen_radix_point = true;
														
@@ -123,8 +122,7 @@ auto NumericLiteralToken::Lex(llvm::StringRef source_text)
 
															     // followed by an alphanumeric character. This '+' or '-' cannot be an
														
 
															     // operator because a literal cannot end in a lowercase letter.
														
 
															     if ((c == '+' || c == '-') && seen_potential_exponent &&
														
 
															-        result.exponent == i - 1 && i + 1 != n &&
														
 
															-        llvm::isAlnum(source_text[i + 1])) {
														
 
															+        result.exponent == i - 1 && i + 1 != n && IsAlnum(source_text[i + 1])) {
														
 
															       // This is not possible because we don't update result.exponent after we
														
 
															       // see a '+' or '-'.
														
 
															       assert(!seen_plus_minus && "should only consume one + or -");
														
--- a/lexer/string_literal.cpp
+++ b/lexer/string_literal.cpp
@@ -4,6 +4,7 @@
 
															 #include "lexer/string_literal.h"
														
 
															+#include "lexer/character_set.h"
														
 
															 #include "llvm/ADT/SmallString.h"
														
 
															 #include "llvm/ADT/StringExtras.h"
														
 
															 #include "llvm/Support/ConvertUTF.h"
														
@@ -71,16 +72,6 @@ struct MismatchedIndentInString : SimpleDiagnostic<MismatchedIndentInString> {
 
															       "string literal.";
														
 
															 };
														
 
															-// TODO(zygoloid): Update this to match whatever we decide qualifies as
														
 
															-// acceptable whitespace.
														
 
															-static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
														
 
															-
														
 
															-static constexpr llvm::StringLiteral HorizontalWhitespace = " \t";
														
 
															-
														
 
															-static bool isUpperHexDigit(char c) {
														
 
															-  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F');
														
 
															-}
														
 
															-
														
 
															 // Find and return the opening characters of a multi-line string literal,
														
 
															 // after any '#'s, including the file type indicator and following newline.
														
 
															 static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
														
@@ -166,7 +157,7 @@ static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
 
															       int indent_start = i + 1;
														
 
															       return text.substr(indent_start, indent_end - indent_start);
														
 
															     }
														
 
															-    if (!isSpace(text[i])) {
														
 
															+    if (!IsSpace(text[i])) {
														
 
															       indent_end = i;
														
 
															     }
														
 
															   }
														
@@ -264,14 +255,14 @@ static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
 
															       return true;
														
 
															     case '0':
														
 
															       result += '\0';
														
 
															-      if (!content.empty() && llvm::isDigit(content.front())) {
														
 
															+      if (!content.empty() && IsDecimalDigit(content.front())) {
														
 
															         emitter.EmitError<DecimalEscapeSequence>();
														
 
															         return false;
														
 
															       }
														
 
															       return true;
														
 
															     case 'x':
														
 
															-      if (content.size() >= 2 && isUpperHexDigit(content[0]) &&
														
 
															-          isUpperHexDigit(content[1])) {
														
 
															+      if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
														
 
															+          IsUpperHexDigit(content[1])) {
														
 
															         result +=
														
 
															             static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
														
 
															         content = content.drop_front(2);
														
@@ -282,7 +273,7 @@ static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
 
															     case 'u': {
														
 
															       llvm::StringRef remaining = content;
														
 
															       if (remaining.consume_front("{")) {
														
 
															-        llvm::StringRef digits = remaining.take_while(isUpperHexDigit);
														
 
															+        llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
														
 
															         remaining = remaining.drop_front(digits.size());
														
 
															         if (!digits.empty() && remaining.consume_front("}")) {
														
 
															           if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
														
@@ -326,7 +317,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
 
															     // whitespace) is required to start with the string's indent. For error
														
 
															     // recovery, remove all leading whitespace if the indent doesn't match.
														
 
															     if (!contents.consume_front(indent)) {
														
 
															-      contents = contents.ltrim(HorizontalWhitespace);
														
 
															+      contents = contents.drop_while(IsHorizontalWhitespace);
														
 
															       if (!contents.startswith("\n")) {
														
 
															         emitter.EmitError<MismatchedIndentInString>();
														
 
															         has_errors = true;
														
@@ -347,7 +338,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
 
															         // Trailing whitespace before a newline doesn't contribute to the string
														
 
															         // literal value.
														
 
															         while (!result.empty() && result.back() != '\n' &&
														
 
															-               isSpace(result.back())) {
														
 
															+               IsSpace(result.back())) {
														
 
															           result.pop_back();
														
 
															         }
														
 
															         result += '\n';
														
--- a/lexer/tokenized_buffer.cpp
+++ b/lexer/tokenized_buffer.cpp
@@ -9,6 +9,7 @@
 
															 #include <iterator>
														
 
															 #include <string>
														
 
															+#include "lexer/character_set.h"
														
 
															 #include "lexer/numeric_literal.h"
														
 
															 #include "lexer/string_literal.h"
														
 
															 #include "llvm/ADT/StringRef.h"
														
@@ -53,10 +54,6 @@ struct UnrecognizedCharacters : SimpleDiagnostic<UnrecognizedCharacters> {
 
															       "Encountered unrecognized characters while parsing.";
														
 
															 };
														
 
															-// TODO(zygoloid): Update this to match whatever we decide qualifies as
														
 
															-// acceptable whitespace.
														
 
															-static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
														
 
															-
														
 
															 // Implementation of the lexer logic itself.
														
 
															 //
														
 
															 // The design is that lexing can loop over the source buffer, consuming it into
														
@@ -110,8 +107,8 @@ class TokenizedBuffer::Lexer {
 
															   auto HandleNewline() -> void {
														
 
															     current_line_info->length = current_column;
														
 
															-    current_line = buffer.AddLine(
														
 
															-        {current_line_info->start + current_column + 1, 0, 0});
														
 
															+    current_line =
														
 
															+        buffer.AddLine({current_line_info->start + current_column + 1, 0, 0});
														
 
															     current_line_info = &buffer.GetLineInfo(current_line);
														
 
															     current_column = 0;
														
 
															     set_indent = false;
														
@@ -128,7 +125,7 @@ class TokenizedBuffer::Lexer {
 
															           buffer.has_errors = true;
														
 
															         }
														
 
															         // The introducer '//' must be followed by whitespace or EOF.
														
 
															-        if (source_text.size() > 2 && !isSpace(source_text[2])) {
														
 
															+        if (source_text.size() > 2 && !IsSpace(source_text[2])) {
														
 
															           emitter.EmitError<NoWhitespaceAfterCommentIntroducer>();
														
 
															           buffer.has_errors = true;
														
 
															         }
														
@@ -145,7 +142,7 @@ class TokenizedBuffer::Lexer {
 
															         default:
														
 
															           // If we find a non-whitespace character without exhausting the
														
 
															           // buffer, return true to continue lexing.
														
 
															-          assert(!isSpace(source_text.front()));
														
 
															+          assert(!IsSpace(source_text.front()));
														
 
															           return true;
														
 
															         case '\n':
														
@@ -383,7 +380,7 @@ class TokenizedBuffer::Lexer {
 
															   }
														
 
															   auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
														
 
															-    if (!llvm::isAlpha(source_text.front()) && source_text.front() != '_') {
														
 
															+    if (!IsAlpha(source_text.front()) && source_text.front() != '_') {
														
 
															       return LexResult::NoMatch();
														
 
															     }
														
@@ -393,8 +390,8 @@ class TokenizedBuffer::Lexer {
 
															     }
														
 
															     // Take the valid characters off the front of the source buffer.
														
 
															-    llvm::StringRef identifier_text = source_text.take_while(
														
 
															-        [](char c) { return llvm::isAlnum(c) || c == '_'; });
														
 
															+    llvm::StringRef identifier_text =
														
 
															+        source_text.take_while([](char c) { return IsAlnum(c) || c == '_'; });
														
 
															     assert(!identifier_text.empty() && "Must have at least one character!");
														
 
															     int identifier_column = current_column;
														
 
															     current_column += identifier_text.size();
														
@@ -420,7 +417,7 @@ class TokenizedBuffer::Lexer {
 
															   auto LexError(llvm::StringRef& source_text) -> LexResult {
														
 
															     llvm::StringRef error_text = source_text.take_while([](char c) {
														
 
															-      if (llvm::isAlnum(c)) {
														
 
															+      if (IsAlnum(c)) {
														
 
															         return false;
														
 
															       }
														
 
															       switch (c) {