numeric_literal.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "lexer/numeric_literal.h"
  5. #include <bitset>
  6. #include "lexer/character_set.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/FormatVariadic.h"
  9. namespace Carbon {
  10. namespace {
  11. struct EmptyDigitSequence : SimpleDiagnostic<EmptyDigitSequence> {
  12. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  13. static constexpr llvm::StringLiteral Message =
  14. "Empty digit sequence in numeric literal.";
  15. };
  16. struct InvalidDigit {
  17. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  18. char digit;
  19. int radix;
  20. auto Format() -> std::string {
  21. return llvm::formatv(
  22. "Invalid digit '{0}' in {1} numeric literal.", digit,
  23. (radix == 2 ? "binary"
  24. : (radix == 16 ? "hexadecimal" : "decimal")))
  25. .str();
  26. }
  27. };
  28. struct InvalidDigitSeparator : SimpleDiagnostic<InvalidDigitSeparator> {
  29. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  30. static constexpr llvm::StringLiteral Message =
  31. "Misplaced digit separator in numeric literal.";
  32. };
  33. struct IrregularDigitSeparators {
  34. static constexpr llvm::StringLiteral ShortName =
  35. "syntax-irregular-digit-separators";
  36. int radix;
  37. auto Format() -> std::string {
  38. assert((radix == 10 || radix == 16) && "unexpected radix");
  39. return llvm::formatv(
  40. "Digit separators in {0} number should appear every {1} "
  41. "characters from the right.",
  42. (radix == 10 ? "decimal" : "hexadecimal"),
  43. (radix == 10 ? "3" : "4"))
  44. .str();
  45. }
  46. };
  47. struct UnknownBaseSpecifier : SimpleDiagnostic<UnknownBaseSpecifier> {
  48. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  49. static constexpr llvm::StringLiteral Message =
  50. "Unknown base specifier in numeric literal.";
  51. };
  52. struct BinaryRealLiteral : SimpleDiagnostic<BinaryRealLiteral> {
  53. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  54. static constexpr llvm::StringLiteral Message =
  55. "Binary real number literals are not supported.";
  56. };
  57. struct WrongRealLiteralExponent {
  58. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  59. char expected;
  60. auto Format() -> std::string {
  61. return llvm::formatv("Expected '{0}' to introduce exponent.", expected)
  62. .str();
  63. }
  64. };
  65. } // namespace
  66. auto LexedNumericLiteral::Lex(llvm::StringRef source_text)
  67. -> llvm::Optional<LexedNumericLiteral> {
  68. LexedNumericLiteral result;
  69. if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
  70. return llvm::None;
  71. }
  72. bool seen_plus_minus = false;
  73. bool seen_radix_point = false;
  74. bool seen_potential_exponent = false;
  75. // Greedily consume all following characters that might be part of a numeric
  76. // literal. This allows us to produce better diagnostics on invalid literals.
  77. //
  78. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  79. // cannot be immediately followed by an alphanumeric character.
  80. int i = 1, n = source_text.size();
  81. for (; i != n; ++i) {
  82. char c = source_text[i];
  83. if (IsAlnum(c) || c == '_') {
  84. if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
  85. result.exponent = i;
  86. seen_potential_exponent = true;
  87. }
  88. continue;
  89. }
  90. // Exactly one `.` can be part of the literal, but only if it's followed by
  91. // an alphanumeric character.
  92. if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
  93. !seen_radix_point) {
  94. result.radix_point = i;
  95. seen_radix_point = true;
  96. continue;
  97. }
  98. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  99. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  100. // followed by an alphanumeric character. This '+' or '-' cannot be an
  101. // operator because a literal cannot end in a lowercase letter.
  102. if ((c == '+' || c == '-') && seen_potential_exponent &&
  103. result.exponent == i - 1 && i + 1 != n && IsAlnum(source_text[i + 1])) {
  104. // This is not possible because we don't update result.exponent after we
  105. // see a '+' or '-'.
  106. assert(!seen_plus_minus && "should only consume one + or -");
  107. seen_plus_minus = true;
  108. continue;
  109. }
  110. break;
  111. }
  112. result.text = source_text.substr(0, i);
  113. if (!seen_radix_point) {
  114. result.radix_point = i;
  115. }
  116. if (!seen_potential_exponent) {
  117. result.exponent = i;
  118. }
  119. return result;
  120. }
  121. // Parser for numeric literal tokens.
  122. //
  123. // Responsible for checking that a numeric literal is valid and meaningful and
  124. // either diagnosing or extracting its meaning.
  125. class LexedNumericLiteral::Parser {
  126. public:
  127. Parser(DiagnosticEmitter<const char*>& emitter, LexedNumericLiteral literal);
  128. auto IsInteger() -> bool {
  129. return literal.radix_point == static_cast<int>(literal.text.size());
  130. }
  131. // Check that the numeric literal token is syntactically valid and
  132. // meaningful, and diagnose if not. Returns `true` if the token was
  133. // sufficiently valid that we could determine its meaning. If `false` is
  134. // returned, a diagnostic has already been issued.
  135. auto Check() -> bool;
  136. // Get the radix of this token. One of 2, 10, or 16.
  137. auto GetRadix() -> int { return radix; }
  138. // Get the mantissa of this token's value.
  139. auto GetMantissa() -> llvm::APInt;
  140. // Get the exponent of this token's value. This is always zero for an integer
  141. // literal.
  142. auto GetExponent() -> llvm::APInt;
  143. private:
  144. struct CheckDigitSequenceResult {
  145. bool ok;
  146. bool has_digit_separators = false;
  147. };
  148. auto CheckDigitSequence(llvm::StringRef text, int radix,
  149. bool allow_digit_separators = true)
  150. -> CheckDigitSequenceResult;
  151. auto CheckDigitSeparatorPlacement(llvm::StringRef text, int radix,
  152. int num_digit_separators) -> void;
  153. auto CheckLeadingZero() -> bool;
  154. auto CheckIntPart() -> bool;
  155. auto CheckFractionalPart() -> bool;
  156. auto CheckExponentPart() -> bool;
  157. private:
  158. DiagnosticEmitter<const char*>& emitter;
  159. LexedNumericLiteral literal;
  160. // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
  161. // or '0x', respectively.
  162. int radix = 10;
  163. // The various components of a numeric literal:
  164. //
  165. // [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
  166. llvm::StringRef int_part;
  167. llvm::StringRef fract_part;
  168. llvm::StringRef exponent_part;
  169. // Do we need to remove any special characters (digit separator or radix
  170. // point) before interpreting the mantissa or exponent as an integer?
  171. bool mantissa_needs_cleaning = false;
  172. bool exponent_needs_cleaning = false;
  173. // True if we found a `-` before `exponent_part`.
  174. bool exponent_is_negative = false;
  175. };
  176. LexedNumericLiteral::Parser::Parser(DiagnosticEmitter<const char*>& emitter,
  177. LexedNumericLiteral literal)
  178. : emitter(emitter), literal(literal) {
  179. int_part = literal.text.substr(0, literal.radix_point);
  180. if (int_part.consume_front("0x")) {
  181. radix = 16;
  182. } else if (int_part.consume_front("0b")) {
  183. radix = 2;
  184. }
  185. fract_part = literal.text.substr(literal.radix_point + 1,
  186. literal.exponent - literal.radix_point - 1);
  187. exponent_part = literal.text.substr(literal.exponent + 1);
  188. if (!exponent_part.consume_front("+")) {
  189. exponent_is_negative = exponent_part.consume_front("-");
  190. }
  191. }
  192. // Check that the numeric literal token is syntactically valid and meaningful,
  193. // and diagnose if not.
  194. auto LexedNumericLiteral::Parser::Check() -> bool {
  195. return CheckLeadingZero() && CheckIntPart() && CheckFractionalPart() &&
  196. CheckExponentPart();
  197. }
  198. // Parse a string that is known to be a valid base-radix integer into an
  199. // APInt. If needs_cleaning is true, the string may additionally contain '_'
  200. // and '.' characters that should be ignored.
  201. //
  202. // Ignoring '.' is used when parsing a real literal. For example, when
  203. // parsing 123.456e7, we want to decompose it into an integer mantissa
  204. // (123456) and an exponent (7 - 3 = 2), and this routine is given the
  205. // "123.456" to parse as the mantissa.
  206. static auto ParseInteger(llvm::StringRef digits, int radix, bool needs_cleaning)
  207. -> llvm::APInt {
  208. llvm::SmallString<32> cleaned;
  209. if (needs_cleaning) {
  210. cleaned.reserve(digits.size());
  211. std::remove_copy_if(digits.begin(), digits.end(),
  212. std::back_inserter(cleaned),
  213. [](char c) { return c == '_' || c == '.'; });
  214. digits = cleaned;
  215. }
  216. llvm::APInt value;
  217. if (digits.getAsInteger(radix, value)) {
  218. llvm_unreachable("should never fail");
  219. }
  220. return value;
  221. }
  222. auto LexedNumericLiteral::Parser::GetMantissa() -> llvm::APInt {
  223. const char* end = IsInteger() ? int_part.end() : fract_part.end();
  224. llvm::StringRef digits(int_part.begin(), end - int_part.begin());
  225. return ParseInteger(digits, radix, mantissa_needs_cleaning);
  226. }
  227. auto LexedNumericLiteral::Parser::GetExponent() -> llvm::APInt {
  228. // Compute the effective exponent from the specified exponent, if any,
  229. // and the position of the radix point.
  230. llvm::APInt exponent(64, 0);
  231. if (!exponent_part.empty()) {
  232. exponent = ParseInteger(exponent_part, 10, exponent_needs_cleaning);
  233. // The exponent is a signed integer, and the number we just parsed is
  234. // non-negative, so ensure we have a wide enough representation to
  235. // include a sign bit. Also make sure the exponent isn't too narrow so
  236. // the calculation below can't lose information through overflow.
  237. if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
  238. exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
  239. }
  240. if (exponent_is_negative) {
  241. exponent.negate();
  242. }
  243. }
  244. // Each character after the decimal point reduces the effective exponent.
  245. int excess_exponent = fract_part.size();
  246. if (radix == 16) {
  247. excess_exponent *= 4;
  248. }
  249. exponent -= excess_exponent;
  250. if (exponent_is_negative && !exponent.isNegative()) {
  251. // We overflowed. Note that we can only overflow by a little, and only
  252. // from negative to positive, because exponent is at least 64 bits wide
  253. // and excess_exponent is bounded above by four times the size of the
  254. // input buffer, which we assume fits into 32 bits.
  255. exponent = exponent.zext(exponent.getBitWidth() + 1);
  256. exponent.setSignBit();
  257. }
  258. return exponent;
  259. }
  260. // Check that a digit sequence is valid: that it contains one or more digits,
  261. // contains only digits in the specified base, and that any digit separators
  262. // are present and correctly positioned.
  263. auto LexedNumericLiteral::Parser::CheckDigitSequence(
  264. llvm::StringRef text, int radix, bool allow_digit_separators)
  265. -> CheckDigitSequenceResult {
  266. assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
  267. std::bitset<256> valid_digits;
  268. if (radix == 2) {
  269. for (char c : "01") {
  270. valid_digits[static_cast<unsigned char>(c)] = true;
  271. }
  272. } else if (radix == 10) {
  273. for (char c : "0123456789") {
  274. valid_digits[static_cast<unsigned char>(c)] = true;
  275. }
  276. } else {
  277. for (char c : "0123456789ABCDEF") {
  278. valid_digits[static_cast<unsigned char>(c)] = true;
  279. }
  280. }
  281. int num_digit_separators = 0;
  282. for (int i = 0, n = text.size(); i != n; ++i) {
  283. char c = text[i];
  284. if (valid_digits[static_cast<unsigned char>(c)]) {
  285. continue;
  286. }
  287. if (c == '_') {
  288. // A digit separator cannot appear at the start of a digit sequence,
  289. // next to another digit separator, or at the end.
  290. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  291. i + 1 == n) {
  292. emitter.EmitError<InvalidDigitSeparator>(text.begin() + i);
  293. }
  294. ++num_digit_separators;
  295. continue;
  296. }
  297. emitter.EmitError<InvalidDigit>(text.begin() + i,
  298. {.digit = c, .radix = radix});
  299. return {.ok = false};
  300. }
  301. if (num_digit_separators == static_cast<int>(text.size())) {
  302. emitter.EmitError<EmptyDigitSequence>(text.begin());
  303. return {.ok = false};
  304. }
  305. // Check that digit separators occur in exactly the expected positions.
  306. if (num_digit_separators) {
  307. CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
  308. }
  309. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  310. }
  311. // Given a number with digit separators, check that the digit separators are
  312. // correctly positioned.
  313. auto LexedNumericLiteral::Parser::CheckDigitSeparatorPlacement(
  314. llvm::StringRef text, int radix, int num_digit_separators) -> void {
  315. assert(std::count(text.begin(), text.end(), '_') == num_digit_separators &&
  316. "given wrong number of digit separators");
  317. if (radix == 2) {
  318. // There are no restrictions on digit separator placement for binary
  319. // literals.
  320. return;
  321. }
  322. assert((radix == 10 || radix == 16) &&
  323. "unexpected radix for digit separator checks");
  324. auto diagnose_irregular_digit_separators = [&]() {
  325. emitter.EmitError<IrregularDigitSeparators>(text.begin(), {.radix = radix});
  326. };
  327. // For decimal and hexadecimal digit sequences, digit separators must form
  328. // groups of 3 or 4 digits (4 or 5 characters), respectively.
  329. int stride = (radix == 10 ? 4 : 5);
  330. int remaining_digit_separators = num_digit_separators;
  331. auto pos = text.end();
  332. while (pos - text.begin() >= stride) {
  333. pos -= stride;
  334. if (*pos != '_') {
  335. diagnose_irregular_digit_separators();
  336. return;
  337. }
  338. --remaining_digit_separators;
  339. }
  340. // Check there weren't any other digit separators.
  341. if (remaining_digit_separators) {
  342. diagnose_irregular_digit_separators();
  343. }
  344. };
  345. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  346. auto LexedNumericLiteral::Parser::CheckLeadingZero() -> bool {
  347. if (radix == 10 && int_part.startswith("0") && int_part != "0") {
  348. emitter.EmitError<UnknownBaseSpecifier>(int_part.begin());
  349. return false;
  350. }
  351. return true;
  352. }
  353. // Check the integer part (before the '.', if any) is valid.
  354. auto LexedNumericLiteral::Parser::CheckIntPart() -> bool {
  355. auto int_result = CheckDigitSequence(int_part, radix);
  356. mantissa_needs_cleaning |= int_result.has_digit_separators;
  357. return int_result.ok;
  358. }
  359. // Check the fractional part (after the '.' and before the exponent, if any)
  360. // is valid.
  361. auto LexedNumericLiteral::Parser::CheckFractionalPart() -> bool {
  362. if (IsInteger()) {
  363. return true;
  364. }
  365. if (radix == 2) {
  366. emitter.EmitError<BinaryRealLiteral>(literal.text.begin() +
  367. literal.radix_point);
  368. // Carry on and parse the binary real literal anyway.
  369. }
  370. // We need to remove a '.' from the mantissa.
  371. mantissa_needs_cleaning = true;
  372. return CheckDigitSequence(fract_part, radix,
  373. /*allow_digit_separators=*/false)
  374. .ok;
  375. }
  376. // Check the exponent part (if any) is valid.
  377. auto LexedNumericLiteral::Parser::CheckExponentPart() -> bool {
  378. if (literal.exponent == static_cast<int>(literal.text.size())) {
  379. return true;
  380. }
  381. char expected_exponent_kind = (radix == 10 ? 'e' : 'p');
  382. if (literal.text[literal.exponent] != expected_exponent_kind) {
  383. emitter.EmitError<WrongRealLiteralExponent>(
  384. literal.text.begin() + literal.exponent,
  385. {.expected = expected_exponent_kind});
  386. return false;
  387. }
  388. auto exponent_result = CheckDigitSequence(exponent_part, 10);
  389. exponent_needs_cleaning = exponent_result.has_digit_separators;
  390. return exponent_result.ok;
  391. }
  392. // Parse the token and compute its value.
  393. auto LexedNumericLiteral::ComputeValue(
  394. DiagnosticEmitter<const char*>& emitter) const -> Value {
  395. Parser parser(emitter, *this);
  396. if (!parser.Check()) {
  397. return UnrecoverableError();
  398. }
  399. if (parser.IsInteger()) {
  400. return IntegerValue{.value = parser.GetMantissa()};
  401. }
  402. return RealValue{.radix = (parser.GetRadix() == 10 ? 10 : 2),
  403. .mantissa = parser.GetMantissa(),
  404. .exponent = parser.GetExponent()};
  405. }
  406. } // namespace Carbon