4 years ago · f5f02babdd
--- a/toolchain/lexer/BUILD
+++ b/toolchain/lexer/BUILD
@@ -34,6 +34,16 @@ cc_library(
 
				     deps = ["@llvm-project//llvm:Support"],
			
 
				 )
			
 
				 
			
 
				+cc_library(
			
 
				+    name = "lex_helpers",
			
 
				+    srcs = ["lex_helpers.cpp"],
			
 
				+    hdrs = ["lex_helpers.h"],
			
 
				+    deps = [
			
 
				+        "//toolchain/diagnostics:diagnostic_emitter",
			
 
				+        "@llvm-project//llvm:Support",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				 cc_library(
			
 
				     name = "test_helpers",
			
 
				     testonly = 1,
			
@@ -53,6 +63,7 @@ cc_library(
 
				     hdrs = ["numeric_literal.h"],
			
 
				     deps = [
			
 
				         ":character_set",
			
 
				+        ":lex_helpers",
			
 
				         "//common:check",
			
 
				         "//toolchain/diagnostics:diagnostic_emitter",
			
 
				         "@llvm-project//llvm:Support",
			
@@ -105,6 +116,7 @@ cc_library(
 
				     hdrs = ["string_literal.h"],
			
 
				     deps = [
			
 
				         ":character_set",
			
 
				+        ":lex_helpers",
			
 
				         "//common:check",
			
 
				         "//toolchain/diagnostics:diagnostic_emitter",
			
 
				         "@llvm-project//llvm:Support",
			
@@ -156,6 +168,7 @@ cc_library(
 
				     hdrs = ["tokenized_buffer.h"],
			
 
				     deps = [
			
 
				         ":character_set",
			
 
				+        ":lex_helpers",
			
 
				         ":numeric_literal",
			
 
				         ":string_literal",
			
 
				         ":token_kind",
			
--- a/toolchain/lexer/lex_helpers.cpp
+++ b/toolchain/lexer/lex_helpers.cpp
@@ -0,0 +1,47 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "toolchain/lexer/lex_helpers.h"
			
 
				+
			
 
				+#include "llvm/Support/FormatVariadic.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+
			
 
				+namespace {
			
 
				+struct TooManyDigits : DiagnosticBase<TooManyDigits> {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+
			
 
				+  auto Format() -> std::string {
			
 
				+    return llvm::formatv(
			
 
				+               "Found a sequence of {0} digits, which is greater than the "
			
 
				+               "limit of {1}.",
			
 
				+               count, limit)
			
 
				+        .str();
			
 
				+  }
			
 
				+
			
 
				+  size_t count;
			
 
				+  size_t limit;
			
 
				+};
			
 
				+}  // namespace
			
 
				+
			
 
				+auto CanLexInteger(DiagnosticEmitter<const char*>& emitter,
			
 
				+                   llvm::StringRef text) -> bool {
			
 
				+  // llvm::getAsInteger is used for parsing, but it's quadratic and visibly slow
			
 
				+  // on large integer values. This limit exists to avoid hitting those limits.
			
 
				+  // Per https://github.com/carbon-language/carbon-lang/issues/980, it may be
			
 
				+  // feasible to optimize integer parsing in order to address performance if
			
 
				+  // this limit becomes an issue.
			
 
				+  //
			
 
				+  // 2^128 would be 39 decimal digits or 128 binary. In either case, this limit
			
 
				+  // is far above the threshold for normal integers.
			
 
				+  constexpr size_t DigitLimit = 1000;
			
 
				+  if (text.size() > DigitLimit) {
			
 
				+    emitter.EmitError<TooManyDigits>(
			
 
				+        text.begin(), {.count = text.size(), .limit = DigitLimit});
			
 
				+    return false;
			
 
				+  }
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+}  // namespace Carbon
			
--- a/toolchain/lexer/lex_helpers.h
+++ b/toolchain/lexer/lex_helpers.h
@@ -0,0 +1,21 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#ifndef TOOLCHAIN_LEXER_LEX_HELPERS_H_
			
 
				+#define TOOLCHAIN_LEXER_LEX_HELPERS_H_
			
 
				+
			
 
				+#include <optional>
			
 
				+
			
 
				+#include "toolchain/diagnostics/diagnostic_emitter.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+
			
 
				+// Should guard calls to getAsInteger due to performance issues with large
			
 
				+// integers. Emits an error if the text cannot be lexed.
			
 
				+auto CanLexInteger(DiagnosticEmitter<const char*>& emitter,
			
 
				+                   llvm::StringRef text) -> bool;
			
 
				+
			
 
				+}  // namespace Carbon
			
 
				+
			
 
				+#endif  // TOOLCHAIN_LEXER_LEX_HELPERS_H_
			
--- a/toolchain/lexer/numeric_literal.cpp
+++ b/toolchain/lexer/numeric_literal.cpp
@@ -10,6 +10,7 @@
 
				 #include "llvm/ADT/StringExtras.h"
			
 
				 #include "llvm/Support/FormatVariadic.h"
			
 
				 #include "toolchain/lexer/character_set.h"
			
 
				+#include "toolchain/lexer/lex_helpers.h"
			
 
				 
			
 
				 namespace Carbon {
			
 
				 
			
@@ -81,20 +82,6 @@ struct WrongRealLiteralExponent : DiagnosticBase<WrongRealLiteralExponent> {
 
				   char expected;
			
 
				 };
			
 
				 
			
 
				-struct TooManyDigits : DiagnosticBase<TooManyDigits> {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-
			
 
				-  auto Format() -> std::string {
			
 
				-    return llvm::formatv(
			
 
				-               "Found a sequence of {0} digits, which is greater than the "
			
 
				-               "limit of {1}.",
			
 
				-               count, limit)
			
 
				-        .str();
			
 
				-  }
			
 
				-
			
 
				-  size_t count;
			
 
				-  size_t limit;
			
 
				-};
			
 
				 }  // namespace
			
 
				 
			
 
				 auto LexedNumericLiteral::Lex(llvm::StringRef source_text)
			
@@ -381,18 +368,7 @@ auto LexedNumericLiteral::Parser::CheckDigitSequence(
 
				     CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
			
 
				   }
			
 
				 
			
 
				-  // llvm::getAsInteger is used for parsing, but it's quadratic and visibly slow
			
 
				-  // on large integer values. This limit exists to avoid hitting those limits.
			
 
				-  // Per https://github.com/carbon-language/carbon-lang/issues/980, it may be
			
 
				-  // feasible to optimize integer parsing in order to address performance if
			
 
				-  // this limit becomes an issue.
			
 
				-  //
			
 
				-  // 2^128 would be 39 decimal digits or 128 binary. In either case, this limit
			
 
				-  // is far above the threshold for normal integers.
			
 
				-  constexpr size_t DigitLimit = 1000;
			
 
				-  if (text.size() > DigitLimit) {
			
 
				-    emitter_.EmitError<TooManyDigits>(
			
 
				-        text.begin(), {.count = text.size(), .limit = DigitLimit});
			
 
				+  if (!CanLexInteger(emitter_, text)) {
			
 
				     return {.ok = false};
			
 
				   }
			
 
				 
			
--- a/toolchain/lexer/string_literal.cpp
+++ b/toolchain/lexer/string_literal.cpp
@@ -11,6 +11,7 @@
 
				 #include "llvm/Support/ErrorHandling.h"
			
 
				 #include "llvm/Support/FormatVariadic.h"
			
 
				 #include "toolchain/lexer/character_set.h"
			
 
				+#include "toolchain/lexer/lex_helpers.h"
			
 
				 
			
 
				 namespace Carbon {
			
 
				 
			
@@ -228,6 +229,9 @@ static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
 
				                                         llvm::StringRef digits,
			
 
				                                         std::string& result) -> bool {
			
 
				   unsigned code_point;
			
 
				+  if (!CanLexInteger(emitter, digits)) {
			
 
				+    return false;
			
 
				+  }
			
 
				   if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
			
 
				     emitter.EmitError<UnicodeEscapeTooLarge>(digits.begin());
			
 
				     return false;
			
--- a/toolchain/lexer/string_literal_test.cpp
+++ b/toolchain/lexer/string_literal_test.cpp
@@ -298,5 +298,14 @@ TEST_F(StringLiteralTest, TabInBlockString) {
 
				   EXPECT_EQ(value, "x\ty\n");
			
 
				 }
			
 
				 
			
 
				+TEST_F(StringLiteralTest, UnicodeTooManyDigits) {
			
 
				+  std::string text = "u{";
			
 
				+  text.append(10000, '9');
			
 
				+  text.append("}");
			
 
				+  auto value = Parse("\"\\" + text + "\"");
			
 
				+  EXPECT_TRUE(error_tracker.SeenError());
			
 
				+  EXPECT_EQ(value, text);
			
 
				+}
			
 
				+
			
 
				 }  // namespace
			
 
				 }  // namespace Carbon::Testing
			
--- a/toolchain/lexer/tokenized_buffer.cpp
+++ b/toolchain/lexer/tokenized_buffer.cpp
@@ -21,6 +21,7 @@
 
				 #include "llvm/Support/FormatVariadic.h"
			
 
				 #include "llvm/Support/raw_ostream.h"
			
 
				 #include "toolchain/lexer/character_set.h"
			
 
				+#include "toolchain/lexer/lex_helpers.h"
			
 
				 #include "toolchain/lexer/numeric_literal.h"
			
 
				 #include "toolchain/lexer/string_literal.h"
			
 
				 
			
@@ -402,6 +403,13 @@ class TokenizedBuffer::Lexer {
 
				     };
			
 
				 
			
 
				     llvm::StringRef suffix = word.substr(1);
			
 
				+    if (!CanLexInteger(emitter_, suffix)) {
			
 
				+      return buffer_.AddToken(
			
 
				+          {.kind = TokenKind::Error(),
			
 
				+           .token_line = current_line_,
			
 
				+           .column = column,
			
 
				+           .error_length = static_cast<int32_t>(word.size())});
			
 
				+    }
			
 
				     llvm::APInt suffix_value;
			
 
				     if (suffix.getAsInteger(10, suffix_value)) {
			
 
				       return LexResult::NoMatch();
			
@@ -690,8 +698,8 @@ auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
 
				       << "The token must be a real literal!";
			
 
				 
			
 
				   // Note that every real literal is at least three characters long, so we can
			
 
				-  // safely look at the second character to determine whether we have a decimal
			
 
				-  // or hexadecimal literal.
			
 
				+  // safely look at the second character to determine whether we have a
			
 
				+  // decimal or hexadecimal literal.
			
 
				   auto& line_info = GetLineInfo(token_info.token_line);
			
 
				   int64_t token_start = line_info.start + token_info.column;
			
 
				   char second_char = source_->Text()[token_start + 1];
			
@@ -766,8 +774,8 @@ auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
 
				 }
			
 
				 
			
 
				 // Compute the printed width of a number. When numbers are printed in decimal,
			
 
				-// the number of digits needed is is one more than the log-base-10 of the value.
			
 
				-// We handle a value of `zero` explicitly.
			
 
				+// the number of digits needed is is one more than the log-base-10 of the
			
 
				+// value. We handle a value of `zero` explicitly.
			
 
				 //
			
 
				 // This routine requires its argument to be *non-negative*.
			
 
				 static auto ComputeDecimalPrintedWidth(int number) -> int {
			
--- a/toolchain/lexer/tokenized_buffer_test.cpp
+++ b/toolchain/lexer/tokenized_buffer_test.cpp
@@ -932,6 +932,29 @@ TEST_F(LexerTest, TypeLiterals) {
 
				   EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f1), 1);
			
 
				 }
			
 
				 
			
 
				+TEST_F(LexerTest, TypeLiteralTooManyDigits) {
			
 
				+  std::string code = "i";
			
 
				+  code.append(10000, '9');
			
 
				+
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				+  EXPECT_CALL(
			
 
				+      consumer,
			
 
				+      HandleDiagnostic(AllOf(
			
 
				+          DiagnosticAt(1, 2),
			
 
				+          DiagnosticMessage(HasSubstr("Found a sequence of 10000 digits")))));
			
 
				+  auto buffer = Lex(code, consumer);
			
 
				+  EXPECT_TRUE(buffer.HasErrors());
			
 
				+  ASSERT_THAT(buffer,
			
 
				+              HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				+                  {.kind = TokenKind::Error(),
			
 
				+                   .line = 1,
			
 
				+                   .column = 1,
			
 
				+                   .indent_column = 1,
			
 
				+                   .text = {code}},
			
 
				+                  {.kind = TokenKind::EndOfFile(), .line = 1, .column = 10002},
			
 
				+              }));
			
 
				+}
			
 
				+
			
 
				 TEST_F(LexerTest, DiagnosticTrailingComment) {
			
 
				   llvm::StringLiteral testcase = R"(
			
 
				     // Hello!