Browse Source

Apply a digit limit for all getAsInteger calls (#1117)

In particular noticed the issue in type literal parsing, but given this has come up once before, trying to address it consistently.
Jon Meow 4 years ago
parent
commit
f5f02babdd

+ 13 - 0
toolchain/lexer/BUILD

@@ -34,6 +34,16 @@ cc_library(
     deps = ["@llvm-project//llvm:Support"],
 )
 
+cc_library(
+    name = "lex_helpers",
+    srcs = ["lex_helpers.cpp"],
+    hdrs = ["lex_helpers.h"],
+    deps = [
+        "//toolchain/diagnostics:diagnostic_emitter",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "test_helpers",
     testonly = 1,
@@ -53,6 +63,7 @@ cc_library(
     hdrs = ["numeric_literal.h"],
     deps = [
         ":character_set",
+        ":lex_helpers",
         "//common:check",
         "//toolchain/diagnostics:diagnostic_emitter",
         "@llvm-project//llvm:Support",
@@ -105,6 +116,7 @@ cc_library(
     hdrs = ["string_literal.h"],
     deps = [
         ":character_set",
+        ":lex_helpers",
         "//common:check",
         "//toolchain/diagnostics:diagnostic_emitter",
         "@llvm-project//llvm:Support",
@@ -156,6 +168,7 @@ cc_library(
     hdrs = ["tokenized_buffer.h"],
     deps = [
         ":character_set",
+        ":lex_helpers",
         ":numeric_literal",
         ":string_literal",
         ":token_kind",

+ 47 - 0
toolchain/lexer/lex_helpers.cpp

@@ -0,0 +1,47 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "toolchain/lexer/lex_helpers.h"
+
+#include "llvm/Support/FormatVariadic.h"
+
+namespace Carbon {
+
+namespace {
+struct TooManyDigits : DiagnosticBase<TooManyDigits> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+
+  auto Format() -> std::string {
+    return llvm::formatv(
+               "Found a sequence of {0} digits, which is greater than the "
+               "limit of {1}.",
+               count, limit)
+        .str();
+  }
+
+  size_t count;
+  size_t limit;
+};
+}  // namespace
+
+auto CanLexInteger(DiagnosticEmitter<const char*>& emitter,
+                   llvm::StringRef text) -> bool {
+  // llvm::getAsInteger is used for parsing, but it's quadratic and visibly slow
+  // on large integer values. This limit exists to avoid hitting those limits.
+  // Per https://github.com/carbon-language/carbon-lang/issues/980, it may be
+  // feasible to optimize integer parsing in order to address performance if
+  // this limit becomes an issue.
+  //
+  // 2^128 would be 39 decimal digits or 128 binary. In either case, this limit
+  // is far above the threshold for normal integers.
+  constexpr size_t DigitLimit = 1000;
+  if (text.size() > DigitLimit) {
+    emitter.EmitError<TooManyDigits>(
+        text.begin(), {.count = text.size(), .limit = DigitLimit});
+    return false;
+  }
+  return true;
+}
+
+}  // namespace Carbon

+ 21 - 0
toolchain/lexer/lex_helpers.h

@@ -0,0 +1,21 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef TOOLCHAIN_LEXER_LEX_HELPERS_H_
+#define TOOLCHAIN_LEXER_LEX_HELPERS_H_
+
+#include <optional>
+
+#include "toolchain/diagnostics/diagnostic_emitter.h"
+
+namespace Carbon {
+
+// Should guard calls to getAsInteger due to performance issues with large
+// integers. Emits an error if the text cannot be lexed.
+auto CanLexInteger(DiagnosticEmitter<const char*>& emitter,
+                   llvm::StringRef text) -> bool;
+
+}  // namespace Carbon
+
+#endif  // TOOLCHAIN_LEXER_LEX_HELPERS_H_

+ 2 - 26
toolchain/lexer/numeric_literal.cpp

@@ -10,6 +10,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "toolchain/lexer/character_set.h"
+#include "toolchain/lexer/lex_helpers.h"
 
 namespace Carbon {
 
@@ -81,20 +82,6 @@ struct WrongRealLiteralExponent : DiagnosticBase<WrongRealLiteralExponent> {
   char expected;
 };
 
-struct TooManyDigits : DiagnosticBase<TooManyDigits> {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-
-  auto Format() -> std::string {
-    return llvm::formatv(
-               "Found a sequence of {0} digits, which is greater than the "
-               "limit of {1}.",
-               count, limit)
-        .str();
-  }
-
-  size_t count;
-  size_t limit;
-};
 }  // namespace
 
 auto LexedNumericLiteral::Lex(llvm::StringRef source_text)
@@ -381,18 +368,7 @@ auto LexedNumericLiteral::Parser::CheckDigitSequence(
     CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
   }
 
-  // llvm::getAsInteger is used for parsing, but it's quadratic and visibly slow
-  // on large integer values. This limit exists to avoid hitting those limits.
-  // Per https://github.com/carbon-language/carbon-lang/issues/980, it may be
-  // feasible to optimize integer parsing in order to address performance if
-  // this limit becomes an issue.
-  //
-  // 2^128 would be 39 decimal digits or 128 binary. In either case, this limit
-  // is far above the threshold for normal integers.
-  constexpr size_t DigitLimit = 1000;
-  if (text.size() > DigitLimit) {
-    emitter_.EmitError<TooManyDigits>(
-        text.begin(), {.count = text.size(), .limit = DigitLimit});
+  if (!CanLexInteger(emitter_, text)) {
     return {.ok = false};
   }
 

+ 4 - 0
toolchain/lexer/string_literal.cpp

@@ -11,6 +11,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "toolchain/lexer/character_set.h"
+#include "toolchain/lexer/lex_helpers.h"
 
 namespace Carbon {
 
@@ -228,6 +229,9 @@ static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
                                         llvm::StringRef digits,
                                         std::string& result) -> bool {
   unsigned code_point;
+  if (!CanLexInteger(emitter, digits)) {
+    return false;
+  }
   if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
     emitter.EmitError<UnicodeEscapeTooLarge>(digits.begin());
     return false;

+ 9 - 0
toolchain/lexer/string_literal_test.cpp

@@ -298,5 +298,14 @@ TEST_F(StringLiteralTest, TabInBlockString) {
   EXPECT_EQ(value, "x\ty\n");
 }
 
+TEST_F(StringLiteralTest, UnicodeTooManyDigits) {
+  std::string text = "u{";
+  text.append(10000, '9');
+  text.append("}");
+  auto value = Parse("\"\\" + text + "\"");
+  EXPECT_TRUE(error_tracker.SeenError());
+  EXPECT_EQ(value, text);
+}
+
 }  // namespace
 }  // namespace Carbon::Testing

+ 12 - 4
toolchain/lexer/tokenized_buffer.cpp

@@ -21,6 +21,7 @@
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include "toolchain/lexer/character_set.h"
+#include "toolchain/lexer/lex_helpers.h"
 #include "toolchain/lexer/numeric_literal.h"
 #include "toolchain/lexer/string_literal.h"
 
@@ -402,6 +403,13 @@ class TokenizedBuffer::Lexer {
     };
 
     llvm::StringRef suffix = word.substr(1);
+    if (!CanLexInteger(emitter_, suffix)) {
+      return buffer_.AddToken(
+          {.kind = TokenKind::Error(),
+           .token_line = current_line_,
+           .column = column,
+           .error_length = static_cast<int32_t>(word.size())});
+    }
     llvm::APInt suffix_value;
     if (suffix.getAsInteger(10, suffix_value)) {
       return LexResult::NoMatch();
@@ -690,8 +698,8 @@ auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
       << "The token must be a real literal!";
 
   // Note that every real literal is at least three characters long, so we can
-  // safely look at the second character to determine whether we have a decimal
-  // or hexadecimal literal.
+  // safely look at the second character to determine whether we have a
+  // decimal or hexadecimal literal.
   auto& line_info = GetLineInfo(token_info.token_line);
   int64_t token_start = line_info.start + token_info.column;
   char second_char = source_->Text()[token_start + 1];
@@ -766,8 +774,8 @@ auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
 }
 
 // Compute the printed width of a number. When numbers are printed in decimal,
-// the number of digits needed is is one more than the log-base-10 of the value.
-// We handle a value of `zero` explicitly.
+// the number of digits needed is is one more than the log-base-10 of the
+// value. We handle a value of `zero` explicitly.
 //
 // This routine requires its argument to be *non-negative*.
 static auto ComputeDecimalPrintedWidth(int number) -> int {

+ 23 - 0
toolchain/lexer/tokenized_buffer_test.cpp

@@ -932,6 +932,29 @@ TEST_F(LexerTest, TypeLiterals) {
   EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f1), 1);
 }
 
+TEST_F(LexerTest, TypeLiteralTooManyDigits) {
+  std::string code = "i";
+  code.append(10000, '9');
+
+  Testing::MockDiagnosticConsumer consumer;
+  EXPECT_CALL(
+      consumer,
+      HandleDiagnostic(AllOf(
+          DiagnosticAt(1, 2),
+          DiagnosticMessage(HasSubstr("Found a sequence of 10000 digits")))));
+  auto buffer = Lex(code, consumer);
+  EXPECT_TRUE(buffer.HasErrors());
+  ASSERT_THAT(buffer,
+              HasTokens(llvm::ArrayRef<ExpectedToken>{
+                  {.kind = TokenKind::Error(),
+                   .line = 1,
+                   .column = 1,
+                   .indent_column = 1,
+                   .text = {code}},
+                  {.kind = TokenKind::EndOfFile(), .line = 1, .column = 10002},
+              }));
+}
+
 TEST_F(LexerTest, DiagnosticTrailingComment) {
   llvm::StringLiteral testcase = R"(
     // Hello!