reader/wgsl: Support unicode identifiers

Bug: tint:1437
Change-Id: Ie00ccb3e93d207111e55117dfc989f79b76164bf
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/80844
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: David Neto <dneto@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
2025-12-08 13:14:56 +00:00 · 2022-02-18 20:39:43 +00:00
parent 1c6c6b19ab
commit d29d3d5740
29 changed files with 483 additions and 95 deletions
--- a/docs/origin-trial-changes.md
+++ b/docs/origin-trial-changes.md
@@ -12,6 +12,7 @@

 * Module-scope declarations can now be declared in any order. [tint:1266](crbug.com/tint/1266)
 * The `override` keyword and `@id()` attribute for pipeline-overridable constants are now supported, replacing the `@override` attribute. [tint:1403](crbug.com/tint/1403)
+* Tint now supports unicode identifiers. [tint:1437](crbug.com/tint/1437)

 ## Changes for M99

--- a/samples/main.cc
+++ b/samples/main.cc
@@ -1119,7 +1119,8 @@ int main(int argc, const char** argv) {
    case Format::kMsl: {
 #if TINT_BUILD_MSL_WRITER
      transform_inputs.Add<tint::transform::Renamer::Config>(
-          tint::transform::Renamer::Target::kMslKeywords);
+          tint::transform::Renamer::Target::kMslKeywords,
+          /* preserve_unicode */ false);
      transform_manager.Add<tint::transform::Renamer>();
 #endif  // TINT_BUILD_MSL_WRITER
      break;
@@ -1132,7 +1133,8 @@ int main(int argc, const char** argv) {
    case Format::kHlsl: {
 #if TINT_BUILD_HLSL_WRITER
      transform_inputs.Add<tint::transform::Renamer::Config>(
-          tint::transform::Renamer::Target::kHlslKeywords);
+          tint::transform::Renamer::Target::kHlslKeywords,
+          /* preserve_unicode */ false);
      transform_manager.Add<tint::transform::Renamer>();
 #endif  // TINT_BUILD_HLSL_WRITER
      break;
--- a/src/reader/wgsl/lexer.cc
+++ b/src/reader/wgsl/lexer.cc
@@ -21,6 +21,7 @@
 #include <utility>

 #include "src/debug.h"
+#include "src/text/unicode.h"

 namespace tint {
 namespace reader {
@@ -113,18 +114,10 @@ bool Lexer::is_null() const {
  return (pos_ < len_) && (file_->content.data[pos_] == 0);
 }

-bool Lexer::is_alpha(char ch) const {
-  return std::isalpha(ch);
-}
-
 bool Lexer::is_digit(char ch) const {
  return std::isdigit(ch);
 }

-bool Lexer::is_alphanum_underscore(char ch) const {
-  return is_alpha(ch) || is_digit(ch) || ch == '_';
-}
-
 bool Lexer::is_hex(char ch) const {
  return std::isxdigit(ch);
 }
@@ -733,31 +726,52 @@ Token Lexer::try_integer() {
 }

 Token Lexer::try_ident() {
-  // Must begin with an a-zA-Z_
-  if (!(is_alpha(file_->content.data[pos_]) ||
-        file_->content.data[pos_] == '_')) {
-    return {};
-  }
-
  auto source = begin_source();
+  auto start = pos_;

-  auto s = pos_;
-  while (!is_eof() && is_alphanum_underscore(file_->content.data[pos_])) {
-    pos_++;
-    location_.column++;
+  // The code below assumes that a single std::string element is 1 byte.
+  static_assert(sizeof(file_->content.data[0]) == sizeof(uint8_t),
+                "tint::reader::wgsl requires the size of a std::string element "
+                "to be a single byte");
+
+  // Must begin with an XID_Start unicode character, or underscore
+  {
+    auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
+    auto [code_point, n] =
+        text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+    if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) {
+      return {};
+    }
+    // Consume start codepoint
+    pos_ += n;
+    location_.column += n;
  }

-  if (file_->content.data[s] == '_') {
+  while (!is_eof()) {
+    // Must continue with an XID_Continue unicode character
+    auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
+    auto [code_point, n] =
+        text::utf8::Decode(utf8, file_->content.data.size() - pos_);
+    if (!code_point.IsXIDContinue()) {
+      break;
+    }
+
+    // Consume continuing codepoint
+    pos_ += n;
+    location_.column += n;
+  }
+
+  if (file_->content.data[start] == '_') {
    // Check for an underscore on its own (special token), or a
    // double-underscore (not allowed).
-    if ((pos_ == s + 1) || (file_->content.data[s + 1] == '_')) {
-      location_.column -= (pos_ - s);
-      pos_ = s;
+    if ((pos_ == start + 1) || (file_->content.data[start + 1] == '_')) {
+      location_.column -= (pos_ - start);
+      pos_ = start;
      return {};
    }
  }

-  auto str = file_->content.data_view.substr(s, pos_ - s);
+  auto str = file_->content.data_view.substr(start, pos_ - start);
  end_source(source);

  auto t = check_keyword(source, str);
--- a/src/reader/wgsl/lexer.h
+++ b/src/reader/wgsl/lexer.h
@@ -76,25 +76,18 @@ class Lexer {
  /// it is not null.
  bool is_null() const;
  /// @param ch a character
-  /// @returns true if 'ch' is an alphabetic character
-  bool is_alpha(char ch) const;
-  /// @param ch a character
  /// @returns true if 'ch' is a decimal digit
  bool is_digit(char ch) const;
  /// @param ch a character
  /// @returns true if 'ch' is a hexadecimal digit
  bool is_hex(char ch) const;
-  /// @param ch a character
-  /// @returns true if 'ch' is a digit, an alphabetic character,
-  /// or an underscore.
-  bool is_alphanum_underscore(char ch) const;
  bool matches(size_t pos, std::string_view substr);

  /// The source file content
  Source::File const* const file_;
  /// The length of the input
  uint32_t len_ = 0;
-  /// The current position within the input
+  /// The current position in utf-8 code units (bytes) within the input
  uint32_t pos_ = 0;
  /// The current location within the input
  Source::Location location_;
--- a/src/reader/wgsl/lexer_test.cc
+++ b/src/reader/wgsl/lexer_test.cc
@@ -315,8 +315,8 @@ INSTANTIATE_TEST_SUITE_P(
                    "2.5E+ 123",
                    "2.5E- 123"));

-using IdentifierTest = testing::TestWithParam<const char*>;
-TEST_P(IdentifierTest, Parse) {
+using AsciiIdentifierTest = testing::TestWithParam<const char*>;
+TEST_P(AsciiIdentifierTest, Parse) {
  Source::File file("", GetParam());
  Lexer l(&file);

@@ -329,7 +329,7 @@ TEST_P(IdentifierTest, Parse) {
  EXPECT_EQ(t.to_str(), GetParam());
 }
 INSTANTIATE_TEST_SUITE_P(LexerTest,
-                         IdentifierTest,
+                         AsciiIdentifierTest,
                         testing::Values("a",
                                         "test",
                                         "test01",
@@ -342,6 +342,57 @@ INSTANTIATE_TEST_SUITE_P(LexerTest,
                                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
                                         "alldigits_0123456789"));

+struct UnicodeCase {
+  const char* utf8;
+  size_t code_units;
+};
+
+using UnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
+TEST_P(UnicodeIdentifierTest, Parse) {
+  Source::File file("", GetParam().utf8);
+  Lexer l(&file);
+
+  auto t = l.next();
+  EXPECT_TRUE(t.IsIdentifier());
+  EXPECT_EQ(t.source().range.begin.line, 1u);
+  EXPECT_EQ(t.source().range.begin.column, 1u);
+  EXPECT_EQ(t.source().range.end.line, 1u);
+  EXPECT_EQ(t.source().range.end.column, 1u + GetParam().code_units);
+  EXPECT_EQ(t.to_str(), GetParam().utf8);
+}
+INSTANTIATE_TEST_SUITE_P(
+    LexerTest,
+    UnicodeIdentifierTest,
+    testing::ValuesIn({
+        UnicodeCase{// "𝐢𝐝𝐞𝐧𝐭𝐢𝐟𝐢𝐞𝐫"
+                    "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d"
+                    "\x90\xa7\xf0\x9d\x90\xad\xf0\x9d\x90\xa2\xf0\x9d\x90\x9f"
+                    "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9e\xf0\x9d\x90\xab",
+                    40},
+        UnicodeCase{// "𝑖𝑑𝑒𝑛𝑡𝑖𝑓𝑖𝑒𝑟"
+                    "\xf0\x9d\x91\x96\xf0\x9d\x91\x91\xf0\x9d\x91\x92\xf0\x9d"
+                    "\x91\x9b\xf0\x9d\x91\xa1\xf0\x9d\x91\x96\xf0\x9d\x91\x93"
+                    "\xf0\x9d\x91\x96\xf0\x9d\x91\x92\xf0\x9d\x91\x9f",
+                    40},
+        UnicodeCase{
+            // "ｉｄｅｎｔｉｆｉｅｒ"
+            "\xef\xbd\x89\xef\xbd\x84\xef\xbd\x85\xef\xbd\x8e\xef\xbd\x94\xef"
+            "\xbd\x89\xef\xbd\x86\xef\xbd\x89\xef\xbd\x85\xef\xbd\x92",
+            30},
+        UnicodeCase{// "𝕚𝕕𝕖𝕟𝕥𝕚𝕗𝕚𝕖𝕣𝟙𝟚𝟛"
+                    "\xf0\x9d\x95\x9a\xf0\x9d\x95\x95\xf0\x9d\x95\x96\xf0\x9d"
+                    "\x95\x9f\xf0\x9d\x95\xa5\xf0\x9d\x95\x9a\xf0\x9d\x95\x97"
+                    "\xf0\x9d\x95\x9a\xf0\x9d\x95\x96\xf0\x9d\x95\xa3\xf0\x9d"
+                    "\x9f\x99\xf0\x9d\x9f\x9a\xf0\x9d\x9f\x9b",
+                    52},
+        UnicodeCase{
+            // "𝖎𝖉𝖊𝖓𝖙𝖎𝖋𝖎𝖊𝖗123"
+            "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93"
+            "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e"
+            "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33",
+            43},
+    }));
+
 TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) {
  Source::File file("", "_");
  Lexer l(&file);
--- a/src/reader/wgsl/parser_impl_function_decl_test.cc
+++ b/src/reader/wgsl/parser_impl_function_decl_test.cc
@@ -14,6 +14,7 @@

 #include "src/ast/workgroup_attribute.h"
 #include "src/reader/wgsl/parser_impl_test_helper.h"
+#include "src/utils/string.h"

 namespace tint {
 namespace reader {
@@ -48,6 +49,51 @@ TEST_F(ParserImplTest, FunctionDecl) {
  EXPECT_TRUE(body->statements[0]->Is<ast::ReturnStatement>());
 }

+TEST_F(ParserImplTest, FunctionDecl_Unicode) {
+  const std::string function_ident =  // "𝗳𝘂𝗻𝗰𝘁𝗶𝗼𝗻"
+      "\xf0\x9d\x97\xb3\xf0\x9d\x98\x82\xf0\x9d\x97\xbb\xf0\x9d\x97\xb0\xf0\x9d"
+      "\x98\x81\xf0\x9d\x97\xb6\xf0\x9d\x97\xbc\xf0\x9d\x97\xbb";
+
+  const std::string param_a_ident =  // "𝓹𝓪𝓻𝓪𝓶_𝓪"
+      "\xf0\x9d\x93\xb9\xf0\x9d\x93\xaa\xf0\x9d\x93\xbb\xf0\x9d\x93\xaa\xf0\x9d"
+      "\x93\xb6\x5f\xf0\x9d\x93\xaa";
+
+  const std::string param_b_ident =  // "𝕡𝕒𝕣𝕒𝕞_𝕓"
+      "\xf0\x9d\x95\xa1\xf0\x9d\x95\x92\xf0\x9d\x95\xa3\xf0\x9d\x95\x92\xf0\x9d"
+      "\x95\x9e\x5f\xf0\x9d\x95\x93";
+
+  std::string src = "fn $function($param_a : i32, $param_b : f32) { return; }";
+  src = utils::ReplaceAll(src, "$function", function_ident);
+  src = utils::ReplaceAll(src, "$param_a", param_a_ident);
+  src = utils::ReplaceAll(src, "$param_b", param_b_ident);
+
+  auto p = parser(src);
+  auto attrs = p->attribute_list();
+  EXPECT_FALSE(p->has_error()) << p->error();
+  ASSERT_FALSE(attrs.errored);
+  EXPECT_FALSE(attrs.matched);
+  auto f = p->function_decl(attrs.value);
+  EXPECT_FALSE(p->has_error()) << p->error();
+  EXPECT_FALSE(f.errored);
+  EXPECT_TRUE(f.matched);
+  ASSERT_NE(f.value, nullptr);
+
+  EXPECT_EQ(f->symbol, p->builder().Symbols().Get(function_ident));
+  ASSERT_NE(f->return_type, nullptr);
+  EXPECT_TRUE(f->return_type->Is<ast::Void>());
+
+  ASSERT_EQ(f->params.size(), 2u);
+  EXPECT_EQ(f->params[0]->symbol, p->builder().Symbols().Get(param_a_ident));
+  EXPECT_EQ(f->params[1]->symbol, p->builder().Symbols().Get(param_b_ident));
+
+  ASSERT_NE(f->return_type, nullptr);
+  EXPECT_TRUE(f->return_type->Is<ast::Void>());
+
+  auto* body = f->body;
+  ASSERT_EQ(body->statements.size(), 1u);
+  EXPECT_TRUE(body->statements[0]->Is<ast::ReturnStatement>());
+}
+
 TEST_F(ParserImplTest, FunctionDecl_AttributeList) {
  auto p = parser("@workgroup_size(2, 3, 4) fn main() { return; }");
  auto attrs = p->attribute_list();
--- a/src/reader/wgsl/parser_impl_struct_decl_test.cc
+++ b/src/reader/wgsl/parser_impl_struct_decl_test.cc
@@ -14,6 +14,7 @@

 #include "src/ast/struct_block_attribute.h"
 #include "src/reader/wgsl/parser_impl_test_helper.h"
+#include "src/utils/string.h"

 namespace tint {
 namespace reader {
@@ -42,6 +43,46 @@ struct S {
  EXPECT_EQ(s->members[1]->symbol, p->builder().Symbols().Register("b"));
 }

+TEST_F(ParserImplTest, StructDecl_Unicode_Parses) {
+  const std::string struct_ident =  // "𝓼𝓽𝓻𝓾𝓬𝓽𝓾𝓻𝓮"
+      "\xf0\x9d\x93\xbc\xf0\x9d\x93\xbd\xf0\x9d\x93\xbb\xf0\x9d\x93\xbe\xf0\x9d"
+      "\x93\xac\xf0\x9d\x93\xbd\xf0\x9d\x93\xbe\xf0\x9d\x93\xbb\xf0\x9d\x93"
+      "\xae";
+  const std::string member_a_ident =  // "𝕞𝕖𝕞𝕓𝕖𝕣_𝕒"
+      "\xf0\x9d\x95\x9e\xf0\x9d\x95\x96\xf0\x9d\x95\x9e\xf0\x9d\x95\x93\xf0\x9d"
+      "\x95\x96\xf0\x9d\x95\xa3\x5f\xf0\x9d\x95\x92";
+  const std::string member_b_ident =  // "𝔪𝔢𝔪𝔟𝔢𝔯_𝔟"
+      "\xf0\x9d\x94\xaa\xf0\x9d\x94\xa2\xf0\x9d\x94\xaa\xf0\x9d\x94\x9f\xf0\x9d"
+      "\x94\xa2\xf0\x9d\x94\xaf\x5f\xf0\x9d\x94\x9f";
+
+  std::string src = R"(
+struct $struct {
+  $member_a : i32;
+  $member_b : f32;
+})";
+  src = utils::ReplaceAll(src, "$struct", struct_ident);
+  src = utils::ReplaceAll(src, "$member_a", member_a_ident);
+  src = utils::ReplaceAll(src, "$member_b", member_b_ident);
+
+  auto p = parser(src);
+  auto attrs = p->attribute_list();
+  EXPECT_FALSE(attrs.errored);
+  EXPECT_FALSE(attrs.matched);
+  ASSERT_EQ(attrs.value.size(), 0u);
+
+  auto s = p->struct_decl(attrs.value);
+  EXPECT_FALSE(p->has_error());
+  EXPECT_FALSE(s.errored);
+  EXPECT_TRUE(s.matched);
+  ASSERT_NE(s.value, nullptr);
+  ASSERT_EQ(s->name, p->builder().Symbols().Register(struct_ident));
+  ASSERT_EQ(s->members.size(), 2u);
+  EXPECT_EQ(s->members[0]->symbol,
+            p->builder().Symbols().Register(member_a_ident));
+  EXPECT_EQ(s->members[1]->symbol,
+            p->builder().Symbols().Register(member_b_ident));
+}
+
 TEST_F(ParserImplTest, StructDecl_ParsesWithAttribute) {
  auto p = parser(R"(
 [[block]] struct B {
--- a/src/reader/wgsl/parser_impl_type_alias_test.cc
+++ b/src/reader/wgsl/parser_impl_type_alias_test.cc
@@ -34,7 +34,7 @@ TEST_F(ParserImplTest, TypeDecl_ParsesType) {
  EXPECT_EQ(t.value->source.range, (Source::Range{{1u, 1u}, {1u, 13u}}));
 }

-TEST_F(ParserImplTest, TypeDecl_ParsesStruct_Ident) {
+TEST_F(ParserImplTest, TypeDecl_Parses_Ident) {
  auto p = parser("type a = B");

  auto t = p->type_alias();
@@ -49,6 +49,25 @@ TEST_F(ParserImplTest, TypeDecl_ParsesStruct_Ident) {
  EXPECT_EQ(alias->source.range, (Source::Range{{1u, 1u}, {1u, 11u}}));
 }

+TEST_F(ParserImplTest, TypeDecl_Unicode_Parses_Ident) {
+  const std::string ident =  // "𝓶𝔂_𝓽𝔂𝓹𝓮"
+      "\xf0\x9d\x93\xb6\xf0\x9d\x94\x82\x5f\xf0\x9d\x93\xbd\xf0\x9d\x94\x82\xf0"
+      "\x9d\x93\xb9\xf0\x9d\x93\xae";
+
+  auto p = parser("type " + ident + " = i32");
+
+  auto t = p->type_alias();
+  EXPECT_FALSE(p->has_error());
+  EXPECT_FALSE(t.errored);
+  EXPECT_TRUE(t.matched);
+  ASSERT_NE(t.value, nullptr);
+  ASSERT_TRUE(t.value->Is<ast::Alias>());
+  auto* alias = t.value->As<ast::Alias>();
+  EXPECT_EQ(p->builder().Symbols().NameFor(alias->name), ident);
+  EXPECT_TRUE(alias->type->Is<ast::I32>());
+  EXPECT_EQ(alias->source.range, (Source::Range{{1u, 1u}, {1u, 37u}}));
+}
+
 TEST_F(ParserImplTest, TypeDecl_MissingIdent) {
  auto p = parser("type = i32");
  auto t = p->type_alias();
--- a/src/reader/wgsl/parser_impl_variable_decl_test.cc
+++ b/src/reader/wgsl/parser_impl_variable_decl_test.cc
@@ -18,7 +18,6 @@ namespace tint {
 namespace reader {
 namespace wgsl {
 namespace {
-
 TEST_F(ParserImplTest, VariableDecl_Parses) {
  auto p = parser("var my_var : f32");
  auto v = p->variable_decl();
@@ -33,6 +32,25 @@ TEST_F(ParserImplTest, VariableDecl_Parses) {
  EXPECT_EQ(v->type->source.range, (Source::Range{{1u, 14u}, {1u, 17u}}));
 }

+TEST_F(ParserImplTest, VariableDecl_Unicode_Parses) {
+  const std::string ident =  // "𝖎𝖉𝖊𝖓𝖙𝖎𝖋𝖎𝖊𝖗123"
+      "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93"
+      "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e"
+      "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33";
+
+  auto p = parser("var " + ident + " : f32");
+  auto v = p->variable_decl();
+  EXPECT_FALSE(p->has_error());
+  EXPECT_TRUE(v.matched);
+  EXPECT_FALSE(v.errored);
+  EXPECT_EQ(v->name, ident);
+  EXPECT_NE(v->type, nullptr);
+  EXPECT_TRUE(v->type->Is<ast::F32>());
+
+  EXPECT_EQ(v->source.range, (Source::Range{{1u, 5u}, {1u, 48u}}));
+  EXPECT_EQ(v->type->source.range, (Source::Range{{1u, 51u}, {1u, 54u}}));
+}
+
 TEST_F(ParserImplTest, VariableDecl_Inferred_Parses) {
  auto p = parser("var my_var = 1.0");
  auto v = p->variable_decl(/*allow_inferred = */ true);
--- a/src/source.h
+++ b/src/source.h
@@ -27,7 +27,7 @@ namespace tint {
 /// Source describes a range of characters within a source file.
 class Source {
 public:
-  /// FileContent describes the content of a source file.
+  /// FileContent describes the content of a source file encoded using utf-8.
  class FileContent {
   public:
    /// Constructs the FileContent with the given file content.
@@ -78,7 +78,8 @@ class Source {
   public:
    /// the 1-based line number. 0 represents no line information.
    size_t line = 0;
-    /// the 1-based column number. 0 represents no column information.
+    /// the 1-based column number in utf-8 code units (bytes).
+    /// 0 represents no column information.
    size_t column = 0;

    /// Returns true of `this` location is lexicographically less than `rhs`
--- a/src/text/unicode.cc
+++ b/src/text/unicode.cc
@@ -427,7 +427,9 @@ std::ostream& operator<<(std::ostream& out, CodePoint code_point) {
  return out << "'U+" << std::hex << code_point.value << "'";
 }

-std::pair<CodePoint, size_t> utf8::Decode(const uint8_t* ptr, size_t len) {
+namespace utf8 {
+
+std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len) {
  if (len < 1) {
    return {};
  }
@@ -490,4 +492,15 @@ std::pair<CodePoint, size_t> utf8::Decode(const uint8_t* ptr, size_t len) {
  return {c, n};
 }

+bool IsASCII(std::string_view str) {
+  for (auto c : str) {
+    if (c & 0x80) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace utf8
+
 }  // namespace tint::text
--- a/src/text/unicode.h
+++ b/src/text/unicode.h
@@ -69,6 +69,10 @@ namespace utf8 {
 ///          If the next code point cannot be decoded then returns [0,0].
 std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);

+/// @returns true if all the utf-8 code points in the string are ASCII
+/// (code-points 0x00..0x7f).
+bool IsASCII(std::string_view);
+
 }  // namespace utf8

 }  // namespace tint::text
--- a/src/transform/glsl.cc
+++ b/src/transform/glsl.cc
@@ -56,7 +56,8 @@ Output Glsl::Run(const Program* in, const DataMap& inputs) const {
    data.Add<SingleEntryPoint::Config>(cfg->entry_point);
  }
  manager.Add<Renamer>();
-  data.Add<Renamer::Config>(Renamer::Target::kGlslKeywords);
+  data.Add<Renamer::Config>(Renamer::Target::kGlslKeywords,
+                            /* preserve_unicode */ false);
  manager.Add<Unshadow>();

  // Attempt to convert `loop`s into for-loops. This is to try and massage the
--- a/src/transform/renamer.cc
+++ b/src/transform/renamer.cc
@@ -21,13 +21,13 @@
 #include "src/program_builder.h"
 #include "src/sem/call.h"
 #include "src/sem/member_accessor_expression.h"
+#include "src/text/unicode.h"

 TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer);
 TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer::Data);
 TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer::Config);

-namespace tint {
-namespace transform {
+namespace tint::transform {

 namespace {

@@ -1245,7 +1245,7 @@ Renamer::Data::Data(Remappings&& r) : remappings(std::move(r)) {}
 Renamer::Data::Data(const Data&) = default;
 Renamer::Data::~Data() = default;

-Renamer::Config::Config(Target t) : target(t) {}
+Renamer::Config::Config(Target t, bool pu) : target(t), preserve_unicode(pu) {}
 Renamer::Config::Config(const Config&) = default;
 Renamer::Config::~Config() = default;

@@ -1293,48 +1293,52 @@ Output Renamer::Run(const Program* in, const DataMap& inputs) const {
  Data::Remappings remappings;

  Target target = Target::kAll;
+  bool preserve_unicode = false;

  if (auto* cfg = inputs.Get<Config>()) {
    target = cfg->target;
+    preserve_unicode = cfg->preserve_unicode;
  }

  ctx.ReplaceAll([&](Symbol sym_in) {
    auto name_in = ctx.src->Symbols().NameFor(sym_in);
-    switch (target) {
-      case Target::kAll:
-        // Always rename.
-        break;
-      case Target::kGlslKeywords:
-        if (!std::binary_search(
-                kReservedKeywordsGLSL,
-                kReservedKeywordsGLSL +
-                    sizeof(kReservedKeywordsGLSL) / sizeof(const char*),
-                name_in) &&
-            name_in.compare(0, 3, "gl_")) {
-          // No match, just reuse the original name.
-          return ctx.dst->Symbols().New(name_in);
-        }
-        break;
-      case Target::kHlslKeywords:
-        if (!std::binary_search(
-                kReservedKeywordsHLSL,
-                kReservedKeywordsHLSL +
-                    sizeof(kReservedKeywordsHLSL) / sizeof(const char*),
-                name_in)) {
-          // No match, just reuse the original name.
-          return ctx.dst->Symbols().New(name_in);
-        }
-        break;
-      case Target::kMslKeywords:
-        if (!std::binary_search(
-                kReservedKeywordsMSL,
-                kReservedKeywordsMSL +
-                    sizeof(kReservedKeywordsMSL) / sizeof(const char*),
-                name_in)) {
-          // No match, just reuse the original name.
-          return ctx.dst->Symbols().New(name_in);
-        }
-        break;
+    if (preserve_unicode || text::utf8::IsASCII(name_in)) {
+      switch (target) {
+        case Target::kAll:
+          // Always rename.
+          break;
+        case Target::kGlslKeywords:
+          if (!std::binary_search(
+                  kReservedKeywordsGLSL,
+                  kReservedKeywordsGLSL +
+                      sizeof(kReservedKeywordsGLSL) / sizeof(const char*),
+                  name_in) &&
+              name_in.compare(0, 3, "gl_")) {
+            // No match, just reuse the original name.
+            return ctx.dst->Symbols().New(name_in);
+          }
+          break;
+        case Target::kHlslKeywords:
+          if (!std::binary_search(
+                  kReservedKeywordsHLSL,
+                  kReservedKeywordsHLSL +
+                      sizeof(kReservedKeywordsHLSL) / sizeof(const char*),
+                  name_in)) {
+            // No match, just reuse the original name.
+            return ctx.dst->Symbols().New(name_in);
+          }
+          break;
+        case Target::kMslKeywords:
+          if (!std::binary_search(
+                  kReservedKeywordsMSL,
+                  kReservedKeywordsMSL +
+                      sizeof(kReservedKeywordsMSL) / sizeof(const char*),
+                  name_in)) {
+            // No match, just reuse the original name.
+            return ctx.dst->Symbols().New(name_in);
+          }
+          break;
+      }
    }

    auto sym_out = ctx.dst->Sym();
@@ -1359,5 +1363,4 @@ Output Renamer::Run(const Program* in, const DataMap& inputs) const {
                std::make_unique<Data>(std::move(remappings)));
 }

-}  // namespace transform
-}  // namespace tint
+}  // namespace tint::transform
--- a/src/transform/renamer.h
+++ b/src/transform/renamer.h
@@ -20,8 +20,7 @@

 #include "src/transform/transform.h"

-namespace tint {
-namespace transform {
+namespace tint::transform {

 /// Renamer is a Transform that renames all the symbols in a program.
 class Renamer : public Castable<Renamer, Transform> {
@@ -63,7 +62,9 @@ class Renamer : public Castable<Renamer, Transform> {
  struct Config : public Castable<Config, transform::Data> {
    /// Constructor
    /// @param tgt the targets to rename
-    explicit Config(Target tgt);
+    /// @param keep_unicode if false, symbols with non-ascii code-points are
+    /// renamed
+    explicit Config(Target tgt, bool keep_unicode = false);

    /// Copy constructor
    Config(const Config&);
@@ -73,6 +74,9 @@ class Renamer : public Castable<Renamer, Transform> {

    /// The targets to rename
    Target const target = Target::kAll;
+
+    /// If false, symbols with non-ascii code-points are renamed.
+    bool preserve_unicode = false;
  };

  /// Constructor using a the configuration provided in the input Data
@@ -88,7 +92,6 @@ class Renamer : public Castable<Renamer, Transform> {
  Output Run(const Program* program, const DataMap& data = {}) const override;
 };

-}  // namespace transform
-}  // namespace tint
+}  // namespace tint::transform

 #endif  // SRC_TRANSFORM_RENAMER_H_
--- a/src/transform/renamer_test.cc
+++ b/src/transform/renamer_test.cc
@@ -23,6 +23,11 @@ namespace tint {
 namespace transform {
 namespace {

+constexpr const char kUnicodeIdentifier[] =  // "𝖎𝖉𝖊𝖓𝖙𝖎𝖋𝖎𝖊𝖗123"
+    "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93"
+    "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e"
+    "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33";
+
 using ::testing::ContainerEq;

 using RenamerTest = TransformTest;
@@ -184,6 +189,25 @@ fn tint_symbol() {
  EXPECT_THAT(data->remappings, ContainerEq(expected_remappings));
 }

+TEST_F(RenamerTest, PreserveUnicode) {
+  auto src = R"(
+@stage(fragment)
+fn frag_main() {
+  var )" + std::string(kUnicodeIdentifier) +
+             R"( : i32;
+}
+)";
+
+  auto expect = src;
+
+  DataMap inputs;
+  inputs.Add<Renamer::Config>(Renamer::Target::kMslKeywords,
+                              /* preserve_unicode */ true);
+  auto got = Run<Renamer>(src, inputs);
+
+  EXPECT_EQ(expect, str(got));
+}
+
 TEST_F(RenamerTest, AttemptSymbolCollision) {
  auto* src = R"(
@stage(vertex)
@@ -244,7 +268,8 @@ fn frag_main() {
 )";

  DataMap inputs;
-  inputs.Add<Renamer::Config>(Renamer::Target::kGlslKeywords);
+  inputs.Add<Renamer::Config>(Renamer::Target::kGlslKeywords,
+                              /* preserve_unicode */ false);
  auto got = Run<Renamer>(src, inputs);

  EXPECT_EQ(expect, str(got));
@@ -269,7 +294,8 @@ fn frag_main() {
 )";

  DataMap inputs;
-  inputs.Add<Renamer::Config>(Renamer::Target::kHlslKeywords);
+  inputs.Add<Renamer::Config>(Renamer::Target::kHlslKeywords,
+                              /* preserve_unicode */ false);
  auto got = Run<Renamer>(src, inputs);

  EXPECT_EQ(expect, str(got));
@@ -294,7 +320,8 @@ fn frag_main() {
 )";

  DataMap inputs;
-  inputs.Add<Renamer::Config>(Renamer::Target::kMslKeywords);
+  inputs.Add<Renamer::Config>(Renamer::Target::kMslKeywords,
+                              /* preserve_unicode */ false);
  auto got = Run<Renamer>(src, inputs);

  EXPECT_EQ(expect, str(got));
@@ -528,7 +555,8 @@ INSTANTIATE_TEST_SUITE_P(RenamerTestGlsl,
                                         //    "void",       // WGSL keyword
                                         "volatile",
                                         //    "while",      // WGSL keyword
-                                         "writeonly"));
+                                         "writeonly",
+                                         kUnicodeIdentifier));

 INSTANTIATE_TEST_SUITE_P(RenamerTestHlsl,
                         RenamerTestHlsl,
@@ -1142,8 +1170,9 @@ INSTANTIATE_TEST_SUITE_P(RenamerTestHlsl,
                                         "vertexshader",
                                         "virtual",
                                         // "void",  // WGSL keyword
-                                         "volatile"));
-//                                          "while"  // WGSL reserved keyword
+                                         "volatile",
+                                         // "while"  // WGSL reserved keyword
+                                         kUnicodeIdentifier));

 INSTANTIATE_TEST_SUITE_P(
    RenamerTestMsl,
@@ -1425,7 +1454,9 @@ INSTANTIATE_TEST_SUITE_P(
        "M_2_PI_H",
        "M_2_SQRTPI_H",
        "M_SQRT2_H",
-        "M_SQRT1_2_H"));
+        "M_SQRT1_2_H",
+        // "while"  // WGSL reserved keyword
+        kUnicodeIdentifier));

 }  // namespace
 }  // namespace transform
--- a/src/writer/hlsl/test_helper.h
+++ b/src/writer/hlsl/test_helper.h
@@ -89,7 +89,8 @@ class TestHelperBase : public BODY, public ProgramBuilder {
    transform::Manager transform_manager;
    transform::DataMap transform_data;
    transform_data.Add<transform::Renamer::Config>(
-        transform::Renamer::Target::kHlslKeywords);
+        transform::Renamer::Target::kHlslKeywords,
+        /* preserve_unicode */ true);
    transform_manager.Add<tint::transform::Renamer>();
    auto result =
        transform_manager.Run(&sanitized_result.program, transform_data);
--- a/test/unicode/comments.wgsl
+++ b/test/unicode/comments.wgsl
@@ -0,0 +1,11 @@
+// Øⓑꚫ쁹Ǵ𐌒岾🥍ⴵ㍨又ᮗ
+
+/*
+/* 👋🌎 */
+नमस्ते दुनिया
+*/
+
+@stage(fragment)
+fn /* こんにちは世界 */ main( /* 你好世界 */ ) {
+    // 안녕하세요 세계
+}
--- a/test/unicode/comments.wgsl.expected.glsl
+++ b/test/unicode/comments.wgsl.expected.glsl
@@ -0,0 +1,10 @@
+#version 310 es
+precision mediump float;
+
+void tint_symbol() {
+}
+
+void main() {
+  tint_symbol();
+  return;
+}
--- a/test/unicode/comments.wgsl.expected.hlsl
+++ b/test/unicode/comments.wgsl.expected.hlsl
@@ -0,0 +1,3 @@
+void main() {
+  return;
+}
--- a/test/unicode/comments.wgsl.expected.msl
+++ b/test/unicode/comments.wgsl.expected.msl
@@ -0,0 +1,7 @@
+#include <metal_stdlib>
+
+using namespace metal;
+fragment void tint_symbol() {
+  return;
+}
+
--- a/test/unicode/comments.wgsl.expected.spvasm
+++ b/test/unicode/comments.wgsl.expected.spvasm
@@ -0,0 +1,16 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google Tint Compiler; 0
+; Bound: 5
+; Schema: 0
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Fragment %main "main"
+               OpExecutionMode %main OriginUpperLeft
+               OpName %main "main"
+       %void = OpTypeVoid
+          %1 = OpTypeFunction %void
+       %main = OpFunction %void None %1
+          %4 = OpLabel
+               OpReturn
+               OpFunctionEnd
--- a/test/unicode/comments.wgsl.expected.wgsl
+++ b/test/unicode/comments.wgsl.expected.wgsl
@@ -0,0 +1,3 @@
+@stage(fragment)
+fn main() {
+}
--- a/test/unicode/indentifiers.wgsl
+++ b/test/unicode/indentifiers.wgsl
@@ -0,0 +1,12 @@
+type 𝓉𝓎𝓅ℯ_𝒶 = i32;
+type 𝐭𝐲𝐩𝐞_𝐛 = f32;
+
+fn 𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷(ᵖᵃʳᵃᵐ : 𝓉𝓎𝓅ℯ_𝒶) -> 𝐭𝐲𝐩𝐞_𝐛 {
+    return 𝐭𝐲𝐩𝐞_𝐛(ᵖᵃʳᵃᵐ);
+}
+
+@stage(fragment)
+fn 𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥() {
+    var 𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚 : 𝓉𝓎𝓅ℯ_𝒶;
+    let 𝖗𝖊𝖘𝖚𝖑𝖙 = 𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷(𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚);
+}
--- a/test/unicode/indentifiers.wgsl.expected.glsl
+++ b/test/unicode/indentifiers.wgsl.expected.glsl
@@ -0,0 +1,16 @@
+#version 310 es
+precision mediump float;
+
+float tint_symbol_2(int tint_symbol_3) {
+  return float(tint_symbol_3);
+}
+
+void tint_symbol_4() {
+  int tint_symbol_5 = 0;
+  float tint_symbol_6 = tint_symbol_2(tint_symbol_5);
+}
+
+void main() {
+  tint_symbol_4();
+  return;
+}
--- a/test/unicode/indentifiers.wgsl.expected.hlsl
+++ b/test/unicode/indentifiers.wgsl.expected.hlsl
@@ -0,0 +1,9 @@
+float tint_symbol_2(int tint_symbol_3) {
+  return float(tint_symbol_3);
+}
+
+void tint_symbol_4() {
+  int tint_symbol_5 = 0;
+  const float tint_symbol_6 = tint_symbol_2(tint_symbol_5);
+  return;
+}
--- a/test/unicode/indentifiers.wgsl.expected.msl
+++ b/test/unicode/indentifiers.wgsl.expected.msl
@@ -0,0 +1,13 @@
+#include <metal_stdlib>
+
+using namespace metal;
+float tint_symbol_2(int tint_symbol_3) {
+  return float(tint_symbol_3);
+}
+
+fragment void tint_symbol_4() {
+  int tint_symbol_5 = 0;
+  float const tint_symbol_6 = tint_symbol_2(tint_symbol_5);
+  return;
+}
+
--- a/test/unicode/indentifiers.wgsl.expected.spvasm
+++ b/test/unicode/indentifiers.wgsl.expected.spvasm
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google Tint Compiler; 0
+; Bound: 17
+; Schema: 0
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint Fragment %________________________________________ "𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥"
+               OpExecutionMode %________________________________________ OriginUpperLeft
+               OpName %________________________________ "𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷"
+               OpName %______________ "ᵖᵃʳᵃᵐ"
+               OpName %________________________________________ "𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥"
+               OpName %_________________________________0 "𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚"
+      %float = OpTypeFloat 32
+        %int = OpTypeInt 32 1
+          %1 = OpTypeFunction %float %int
+       %void = OpTypeVoid
+          %8 = OpTypeFunction %void
+%_ptr_Function_int = OpTypePointer Function %int
+         %14 = OpConstantNull %int
+%________________________________ = OpFunction %float None %1
+%______________ = OpFunctionParameter %int
+          %6 = OpLabel
+          %7 = OpConvertSToF %float %______________
+               OpReturnValue %7
+               OpFunctionEnd
+%________________________________________ = OpFunction %void None %8
+         %11 = OpLabel
+%_________________________________0 = OpVariable %_ptr_Function_int Function %14
+         %16 = OpLoad %int %_________________________________0
+         %15 = OpFunctionCall %float %________________________________ %16
+               OpReturn
+               OpFunctionEnd
--- a/test/unicode/indentifiers.wgsl.expected.wgsl
+++ b/test/unicode/indentifiers.wgsl.expected.wgsl
@@ -0,0 +1,13 @@
+type 𝓉𝓎𝓅ℯ_𝒶 = i32;
+
+type 𝐭𝐲𝐩𝐞_𝐛 = f32;
+
+fn 𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷(ᵖᵃʳᵃᵐ : 𝓉𝓎𝓅ℯ_𝒶) -> 𝐭𝐲𝐩𝐞_𝐛 {
+  return 𝐭𝐲𝐩𝐞_𝐛(ᵖᵃʳᵃᵐ);
+}
+
+@stage(fragment)
+fn 𝕖𝕟𝕥𝕣𝕪𝕡𝕠𝕚𝕟𝕥() {
+  var 𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚 : 𝓉𝓎𝓅ℯ_𝒶;
+  let 𝖗𝖊𝖘𝖚𝖑𝖙 = 𝓯𝓾𝓷𝓬𝓽𝓲𝓸𝓷(𝙫𝙖𝙧𝙞𝙖𝙗𝙡𝙚);
+}