ソースを参照

Added unicode support (#2027)

Adds support for parsing Unicode escape sequences of the form \u{HHHH...} in string literals.
Vineeth B V 3 年 前
コミット
8c85fa2744

+ 35 - 2
common/string_helpers.cpp

@@ -10,6 +10,7 @@
 #include "common/check.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ConvertUTF.h"
 
 namespace Carbon {
 
@@ -81,8 +82,40 @@ auto UnescapeStringLiteral(llvm::StringRef source, const int hashtag_num,
           ret.push_back(16 * *c1 + *c2);
           break;
         }
-        case 'u':
-          CARBON_FATAL() << "\\u is not yet supported in string literals";
+        case 'u': {
+          ++i;
+          if (i >= source.size() || source[i] != '{') {
+            return std::nullopt;
+          }
+          unsigned int unicode_int = 0;
+          ++i;
+          int original_i = i;
+          while (i < source.size() && source[i] != '}') {
+            std::optional<char> hex_val = FromHex(source[i]);
+            if (hex_val == std::nullopt) {
+              return std::nullopt;
+            }
+            unicode_int = unicode_int << 4;
+            unicode_int += hex_val.value();
+            ++i;
+            if (i - original_i > 8) {
+              return std::nullopt;
+            }
+          }
+          if (i >= source.size()) {
+            return std::nullopt;
+          }
+          if (i - original_i == 0) {
+            return std::nullopt;
+          }
+          char utf8_buf[4];
+          char* utf8_end = &utf8_buf[0];
+          if (!llvm::ConvertCodePointToUTF8(unicode_int, utf8_end)) {
+            return std::nullopt;
+          }
+          ret.append(utf8_buf, utf8_end - utf8_buf);
+          break;
+        }
         case '\n':
           if (!is_block_string) {
             return std::nullopt;

+ 27 - 0
common/string_helpers_test.cpp

@@ -30,6 +30,9 @@ TEST(UnescapeStringLiteral, Valid) {
   EXPECT_THAT(UnescapeStringLiteral("\\x12"), Optional(Eq("\x12")));
   EXPECT_THAT(UnescapeStringLiteral("test", 1), Optional(Eq("test")));
   EXPECT_THAT(UnescapeStringLiteral("test\\#n", 1), Optional(Eq("test\n")));
+  EXPECT_THAT(UnescapeStringLiteral(
+                  "r\\u{000000E9}al \\u{2764}\\u{FE0F}\\u{1F50A}!\\u{10FFFF}"),
+              Optional(Eq("réal ❤️🔊!􏿿")));
 }
 
 TEST(UnescapeStringLiteral, Invalid) {
@@ -48,6 +51,30 @@ TEST(UnescapeStringLiteral, Invalid) {
   EXPECT_THAT(UnescapeStringLiteral("\\#00", 1), Eq(std::nullopt));
 }
 
+TEST(UnescapeStringLiteral, InvalidUnicodes) {
+  // Incomplete Unicode escapes: missing '{', missing '}', or empty braces.
+  EXPECT_THAT(UnescapeStringLiteral("\\u"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\u1"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\uz"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\u{"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\u{z"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\u{E9"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\u{E9z"), Eq(std::nullopt));
+  EXPECT_THAT(UnescapeStringLiteral("\\u{}"), Eq(std::nullopt));
+
+  // A non-hexadecimal character inside well-formed braces is rejected.
+  EXPECT_THAT(UnescapeStringLiteral("\\u{z}"), Eq(std::nullopt));
+
+  // Lowercase hexadecimal digits are rejected.
+  EXPECT_THAT(UnescapeStringLiteral("\\u{e9}"), Eq(std::nullopt));
+
+  // A code point above U+10FFFF is rejected.
+  EXPECT_THAT(UnescapeStringLiteral("\\u{110000}"), Eq(std::nullopt));
+
+  // A code point written with more than 8 hex digits is rejected.
+  EXPECT_THAT(UnescapeStringLiteral("\\u{FF000000E9}"), Eq(std::nullopt));
+}
+
 TEST(UnescapeStringLiteral, Nul) {
   std::optional<std::string> str = UnescapeStringLiteral("a\\0b");
   ASSERT_NE(str, std::nullopt);

+ 23 - 0
explorer/testdata/string/unicode.carbon

@@ -0,0 +1,23 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// RUN: %{explorer} %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes=false %s
+// RUN: %{explorer} --parser_debug --trace_file=- %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes %s
+// AUTOUPDATE: %{explorer} %s
+// CHECK: result: 0
+
+package ExplorerTest api;
+
+// Returns 0 when `s` equals "HELLO WORLD!!" and 1 otherwise.
+fn CompareStr(s: String) -> i32 {
+  if (s == "HELLO WORLD!!") {
+    return 0;
+  }
+  return 1;
+}
+
+// The escapes below spell "HELLO WORLD!!", so the expected result is 0.
+fn Main() -> i32 {
+  return CompareStr("\u{0048}\u{0045}\u{004C}\u{004C}\u{004F} \u{0057}\u{004F}\u{0052}\u{004C}\u{0044}\u{0021}\u{21}");
+}