From d29d3d574080a71238a246f3580383c275990877 Mon Sep 17 00:00:00 2001 From: Ben Clayton Date: Fri, 18 Feb 2022 20:39:43 +0000 Subject: [PATCH] reader/wgsl: Support unicode identifiers Bug: tint:1437 Change-Id: Ie00ccb3e93d207111e55117dfc989f79b76164bf Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/80844 Kokoro: Kokoro Reviewed-by: David Neto Commit-Queue: Ben Clayton --- docs/origin-trial-changes.md | 1 + samples/main.cc | 6 +- src/reader/wgsl/lexer.cc | 60 +++++++++----- src/reader/wgsl/lexer.h | 9 +- src/reader/wgsl/lexer_test.cc | 57 ++++++++++++- .../wgsl/parser_impl_function_decl_test.cc | 46 ++++++++++ .../wgsl/parser_impl_struct_decl_test.cc | 41 +++++++++ .../wgsl/parser_impl_type_alias_test.cc | 21 ++++- .../wgsl/parser_impl_variable_decl_test.cc | 20 ++++- src/source.h | 5 +- src/text/unicode.cc | 15 +++- src/text/unicode.h | 4 + src/transform/glsl.cc | 3 +- src/transform/renamer.cc | 83 ++++++++++--------- src/transform/renamer.h | 13 +-- src/transform/renamer_test.cc | 45 ++++++++-- src/writer/hlsl/test_helper.h | 3 +- test/unicode/comments.wgsl | 11 +++ test/unicode/comments.wgsl.expected.glsl | 10 +++ test/unicode/comments.wgsl.expected.hlsl | 3 + test/unicode/comments.wgsl.expected.msl | 7 ++ test/unicode/comments.wgsl.expected.spvasm | 16 ++++ test/unicode/comments.wgsl.expected.wgsl | 3 + test/unicode/indentifiers.wgsl | 12 +++ test/unicode/indentifiers.wgsl.expected.glsl | 16 ++++ test/unicode/indentifiers.wgsl.expected.hlsl | 9 ++ test/unicode/indentifiers.wgsl.expected.msl | 13 +++ .../unicode/indentifiers.wgsl.expected.spvasm | 33 ++++++++ test/unicode/indentifiers.wgsl.expected.wgsl | 13 +++ 29 files changed, 483 insertions(+), 95 deletions(-) create mode 100644 test/unicode/comments.wgsl create mode 100644 test/unicode/comments.wgsl.expected.glsl create mode 100644 test/unicode/comments.wgsl.expected.hlsl create mode 100644 test/unicode/comments.wgsl.expected.msl create mode 100644 test/unicode/comments.wgsl.expected.spvasm 
create mode 100644 test/unicode/comments.wgsl.expected.wgsl create mode 100644 test/unicode/indentifiers.wgsl create mode 100644 test/unicode/indentifiers.wgsl.expected.glsl create mode 100644 test/unicode/indentifiers.wgsl.expected.hlsl create mode 100644 test/unicode/indentifiers.wgsl.expected.msl create mode 100644 test/unicode/indentifiers.wgsl.expected.spvasm create mode 100644 test/unicode/indentifiers.wgsl.expected.wgsl diff --git a/docs/origin-trial-changes.md b/docs/origin-trial-changes.md index 9e19b620ec..d93bce4142 100644 --- a/docs/origin-trial-changes.md +++ b/docs/origin-trial-changes.md @@ -12,6 +12,7 @@ * Module-scope declarations can now be declared in any order. [tint:1266](crbug.com/tint/1266) * The `override` keyword and `@id()` attribute for pipeline-overridable constants are now supported, replacing the `@override` attribute. [tint:1403](crbug.com/tint/1403) +* Tint now supports unicode identifiers. [tint:1437](crbug.com/tint/1437) ## Changes for M99 diff --git a/samples/main.cc b/samples/main.cc index d1d0ec285a..8a8484ac34 100644 --- a/samples/main.cc +++ b/samples/main.cc @@ -1119,7 +1119,8 @@ int main(int argc, const char** argv) { case Format::kMsl: { #if TINT_BUILD_MSL_WRITER transform_inputs.Add( - tint::transform::Renamer::Target::kMslKeywords); + tint::transform::Renamer::Target::kMslKeywords, + /* preserve_unicode */ false); transform_manager.Add(); #endif // TINT_BUILD_MSL_WRITER break; @@ -1132,7 +1133,8 @@ int main(int argc, const char** argv) { case Format::kHlsl: { #if TINT_BUILD_HLSL_WRITER transform_inputs.Add( - tint::transform::Renamer::Target::kHlslKeywords); + tint::transform::Renamer::Target::kHlslKeywords, + /* preserve_unicode */ false); transform_manager.Add(); #endif // TINT_BUILD_HLSL_WRITER break; diff --git a/src/reader/wgsl/lexer.cc b/src/reader/wgsl/lexer.cc index 56e56adc5e..8b0f7ac533 100644 --- a/src/reader/wgsl/lexer.cc +++ b/src/reader/wgsl/lexer.cc @@ -21,6 +21,7 @@ #include #include "src/debug.h" 
+#include "src/text/unicode.h" namespace tint { namespace reader { @@ -113,18 +114,10 @@ bool Lexer::is_null() const { return (pos_ < len_) && (file_->content.data[pos_] == 0); } -bool Lexer::is_alpha(char ch) const { - return std::isalpha(ch); -} - bool Lexer::is_digit(char ch) const { return std::isdigit(ch); } -bool Lexer::is_alphanum_underscore(char ch) const { - return is_alpha(ch) || is_digit(ch) || ch == '_'; -} - bool Lexer::is_hex(char ch) const { return std::isxdigit(ch); } @@ -733,31 +726,52 @@ Token Lexer::try_integer() { } Token Lexer::try_ident() { - // Must begin with an a-zA-Z_ - if (!(is_alpha(file_->content.data[pos_]) || - file_->content.data[pos_] == '_')) { - return {}; - } - auto source = begin_source(); + auto start = pos_; - auto s = pos_; - while (!is_eof() && is_alphanum_underscore(file_->content.data[pos_])) { - pos_++; - location_.column++; + // This below assumes that the size of a single std::string element is 1 byte. + static_assert(sizeof(file_->content.data[0]) == sizeof(uint8_t), + "tint::reader::wgsl requires the size of a std::string element " + "to be a single byte"); + + // Must begin with an XID_Start unicode character, or underscore + { + auto* utf8 = reinterpret_cast(&file_->content.data[pos_]); + auto [code_point, n] = + text::utf8::Decode(utf8, file_->content.data.size() - pos_); + if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) { + return {}; + } + // Consume start codepoint + pos_ += n; + location_.column += n; } - if (file_->content.data[s] == '_') { + while (!is_eof()) { + // Must continue with an XID_Continue unicode character + auto* utf8 = reinterpret_cast(&file_->content.data[pos_]); + auto [code_point, n] = + text::utf8::Decode(utf8, file_->content.data.size() - pos_); + if (!code_point.IsXIDContinue()) { + break; + } + + // Consume continuing codepoint + pos_ += n; + location_.column += n; + } + + if (file_->content.data[start] == '_') { // Check for an underscore on its own (special token), 
or a // double-underscore (not allowed). - if ((pos_ == s + 1) || (file_->content.data[s + 1] == '_')) { - location_.column -= (pos_ - s); - pos_ = s; + if ((pos_ == start + 1) || (file_->content.data[start + 1] == '_')) { + location_.column -= (pos_ - start); + pos_ = start; return {}; } } - auto str = file_->content.data_view.substr(s, pos_ - s); + auto str = file_->content.data_view.substr(start, pos_ - start); end_source(source); auto t = check_keyword(source, str); diff --git a/src/reader/wgsl/lexer.h b/src/reader/wgsl/lexer.h index 43d9ea52da..5bdb20f973 100644 --- a/src/reader/wgsl/lexer.h +++ b/src/reader/wgsl/lexer.h @@ -76,25 +76,18 @@ class Lexer { /// it is not null. bool is_null() const; /// @param ch a character - /// @returns true if 'ch' is an alphabetic character - bool is_alpha(char ch) const; - /// @param ch a character /// @returns true if 'ch' is a decimal digit bool is_digit(char ch) const; /// @param ch a character /// @returns true if 'ch' is a hexadecimal digit bool is_hex(char ch) const; - /// @param ch a character - /// @returns true if 'ch' is a digit, an alphabetic character, - /// or an underscore. 
- bool is_alphanum_underscore(char ch) const; bool matches(size_t pos, std::string_view substr); /// The source file content Source::File const* const file_; /// The length of the input uint32_t len_ = 0; - /// The current position within the input + /// The current position in utf-8 code units (bytes) within the input uint32_t pos_ = 0; /// The current location within the input Source::Location location_; diff --git a/src/reader/wgsl/lexer_test.cc b/src/reader/wgsl/lexer_test.cc index ff6c9caf3f..2e79906f65 100644 --- a/src/reader/wgsl/lexer_test.cc +++ b/src/reader/wgsl/lexer_test.cc @@ -315,8 +315,8 @@ INSTANTIATE_TEST_SUITE_P( "2.5E+ 123", "2.5E- 123")); -using IdentifierTest = testing::TestWithParam; -TEST_P(IdentifierTest, Parse) { +using AsciiIdentifierTest = testing::TestWithParam; +TEST_P(AsciiIdentifierTest, Parse) { Source::File file("", GetParam()); Lexer l(&file); @@ -329,7 +329,7 @@ TEST_P(IdentifierTest, Parse) { EXPECT_EQ(t.to_str(), GetParam()); } INSTANTIATE_TEST_SUITE_P(LexerTest, - IdentifierTest, + AsciiIdentifierTest, testing::Values("a", "test", "test01", @@ -342,6 +342,57 @@ INSTANTIATE_TEST_SUITE_P(LexerTest, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "alldigits_0123456789")); +struct UnicodeCase { + const char* utf8; + size_t code_units; +}; + +using UnicodeIdentifierTest = testing::TestWithParam; +TEST_P(UnicodeIdentifierTest, Parse) { + Source::File file("", GetParam().utf8); + Lexer l(&file); + + auto t = l.next(); + EXPECT_TRUE(t.IsIdentifier()); + EXPECT_EQ(t.source().range.begin.line, 1u); + EXPECT_EQ(t.source().range.begin.column, 1u); + EXPECT_EQ(t.source().range.end.line, 1u); + EXPECT_EQ(t.source().range.end.column, 1u + GetParam().code_units); + EXPECT_EQ(t.to_str(), GetParam().utf8); +} +INSTANTIATE_TEST_SUITE_P( + LexerTest, + UnicodeIdentifierTest, + testing::ValuesIn({ + UnicodeCase{// "๐ข๐๐ž๐ง๐ญ๐ข๐Ÿ๐ข๐ž๐ซ" + "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d" + 
"\x90\xa7\xf0\x9d\x90\xad\xf0\x9d\x90\xa2\xf0\x9d\x90\x9f" + "\xf0\x9d\x90\xa2\xf0\x9d\x90\x9e\xf0\x9d\x90\xab", + 40}, + UnicodeCase{// "๐‘–๐‘‘๐‘’๐‘›๐‘ก๐‘–๐‘“๐‘–๐‘’๐‘Ÿ" + "\xf0\x9d\x91\x96\xf0\x9d\x91\x91\xf0\x9d\x91\x92\xf0\x9d" + "\x91\x9b\xf0\x9d\x91\xa1\xf0\x9d\x91\x96\xf0\x9d\x91\x93" + "\xf0\x9d\x91\x96\xf0\x9d\x91\x92\xf0\x9d\x91\x9f", + 40}, + UnicodeCase{ + // "๏ฝ‰๏ฝ„๏ฝ…๏ฝŽ๏ฝ”๏ฝ‰๏ฝ†๏ฝ‰๏ฝ…๏ฝ’" + "\xef\xbd\x89\xef\xbd\x84\xef\xbd\x85\xef\xbd\x8e\xef\xbd\x94\xef" + "\xbd\x89\xef\xbd\x86\xef\xbd\x89\xef\xbd\x85\xef\xbd\x92", + 30}, + UnicodeCase{// "๐•š๐••๐•–๐•Ÿ๐•ฅ๐•š๐•—๐•š๐•–๐•ฃ๐Ÿ™๐Ÿš๐Ÿ›" + "\xf0\x9d\x95\x9a\xf0\x9d\x95\x95\xf0\x9d\x95\x96\xf0\x9d" + "\x95\x9f\xf0\x9d\x95\xa5\xf0\x9d\x95\x9a\xf0\x9d\x95\x97" + "\xf0\x9d\x95\x9a\xf0\x9d\x95\x96\xf0\x9d\x95\xa3\xf0\x9d" + "\x9f\x99\xf0\x9d\x9f\x9a\xf0\x9d\x9f\x9b", + 52}, + UnicodeCase{ + // "๐–Ž๐–‰๐–Š๐–“๐–™๐–Ž๐–‹๐–Ž๐–Š๐–—123" + "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93" + "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e" + "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33", + 43}, + })); + TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) { Source::File file("", "_"); Lexer l(&file); diff --git a/src/reader/wgsl/parser_impl_function_decl_test.cc b/src/reader/wgsl/parser_impl_function_decl_test.cc index f9bb2bd6d2..56f024efc7 100644 --- a/src/reader/wgsl/parser_impl_function_decl_test.cc +++ b/src/reader/wgsl/parser_impl_function_decl_test.cc @@ -14,6 +14,7 @@ #include "src/ast/workgroup_attribute.h" #include "src/reader/wgsl/parser_impl_test_helper.h" +#include "src/utils/string.h" namespace tint { namespace reader { @@ -48,6 +49,51 @@ TEST_F(ParserImplTest, FunctionDecl) { EXPECT_TRUE(body->statements[0]->Is()); } +TEST_F(ParserImplTest, FunctionDecl_Unicode) { + const std::string function_ident = // "๐—ณ๐˜‚๐—ป๐—ฐ๐˜๐—ถ๐—ผ๐—ป" + "\xf0\x9d\x97\xb3\xf0\x9d\x98\x82\xf0\x9d\x97\xbb\xf0\x9d\x97\xb0\xf0\x9d" + 
"\x98\x81\xf0\x9d\x97\xb6\xf0\x9d\x97\xbc\xf0\x9d\x97\xbb"; + + const std::string param_a_ident = // "๐“น๐“ช๐“ป๐“ช๐“ถ_๐“ช" + "\xf0\x9d\x93\xb9\xf0\x9d\x93\xaa\xf0\x9d\x93\xbb\xf0\x9d\x93\xaa\xf0\x9d" + "\x93\xb6\x5f\xf0\x9d\x93\xaa"; + + const std::string param_b_ident = // "๐•ก๐•’๐•ฃ๐•’๐•ž_๐•“" + "\xf0\x9d\x95\xa1\xf0\x9d\x95\x92\xf0\x9d\x95\xa3\xf0\x9d\x95\x92\xf0\x9d" + "\x95\x9e\x5f\xf0\x9d\x95\x93"; + + std::string src = "fn $function($param_a : i32, $param_b : f32) { return; }"; + src = utils::ReplaceAll(src, "$function", function_ident); + src = utils::ReplaceAll(src, "$param_a", param_a_ident); + src = utils::ReplaceAll(src, "$param_b", param_b_ident); + + auto p = parser(src); + auto attrs = p->attribute_list(); + EXPECT_FALSE(p->has_error()) << p->error(); + ASSERT_FALSE(attrs.errored); + EXPECT_FALSE(attrs.matched); + auto f = p->function_decl(attrs.value); + EXPECT_FALSE(p->has_error()) << p->error(); + EXPECT_FALSE(f.errored); + EXPECT_TRUE(f.matched); + ASSERT_NE(f.value, nullptr); + + EXPECT_EQ(f->symbol, p->builder().Symbols().Get(function_ident)); + ASSERT_NE(f->return_type, nullptr); + EXPECT_TRUE(f->return_type->Is()); + + ASSERT_EQ(f->params.size(), 2u); + EXPECT_EQ(f->params[0]->symbol, p->builder().Symbols().Get(param_a_ident)); + EXPECT_EQ(f->params[1]->symbol, p->builder().Symbols().Get(param_b_ident)); + + ASSERT_NE(f->return_type, nullptr); + EXPECT_TRUE(f->return_type->Is()); + + auto* body = f->body; + ASSERT_EQ(body->statements.size(), 1u); + EXPECT_TRUE(body->statements[0]->Is()); +} + TEST_F(ParserImplTest, FunctionDecl_AttributeList) { auto p = parser("@workgroup_size(2, 3, 4) fn main() { return; }"); auto attrs = p->attribute_list(); diff --git a/src/reader/wgsl/parser_impl_struct_decl_test.cc b/src/reader/wgsl/parser_impl_struct_decl_test.cc index 5df3bd040a..40d6cc761d 100644 --- a/src/reader/wgsl/parser_impl_struct_decl_test.cc +++ b/src/reader/wgsl/parser_impl_struct_decl_test.cc @@ -14,6 +14,7 @@ #include 
"src/ast/struct_block_attribute.h" #include "src/reader/wgsl/parser_impl_test_helper.h" +#include "src/utils/string.h" namespace tint { namespace reader { @@ -42,6 +43,46 @@ struct S { EXPECT_EQ(s->members[1]->symbol, p->builder().Symbols().Register("b")); } +TEST_F(ParserImplTest, StructDecl_Unicode_Parses) { + const std::string struct_ident = // "๐“ผ๐“ฝ๐“ป๐“พ๐“ฌ๐“ฝ๐“พ๐“ป๐“ฎ" + "\xf0\x9d\x93\xbc\xf0\x9d\x93\xbd\xf0\x9d\x93\xbb\xf0\x9d\x93\xbe\xf0\x9d" + "\x93\xac\xf0\x9d\x93\xbd\xf0\x9d\x93\xbe\xf0\x9d\x93\xbb\xf0\x9d\x93" + "\xae"; + const std::string member_a_ident = // "๐•ž๐•–๐•ž๐•“๐•–๐•ฃ_๐•’" + "\xf0\x9d\x95\x9e\xf0\x9d\x95\x96\xf0\x9d\x95\x9e\xf0\x9d\x95\x93\xf0\x9d" + "\x95\x96\xf0\x9d\x95\xa3\x5f\xf0\x9d\x95\x92"; + const std::string member_b_ident = // "๐”ช๐”ข๐”ช๐”Ÿ๐”ข๐”ฏ_๐”Ÿ" + "\xf0\x9d\x94\xaa\xf0\x9d\x94\xa2\xf0\x9d\x94\xaa\xf0\x9d\x94\x9f\xf0\x9d" + "\x94\xa2\xf0\x9d\x94\xaf\x5f\xf0\x9d\x94\x9f"; + + std::string src = R"( +struct $struct { + $member_a : i32; + $member_b : f32; +})"; + src = utils::ReplaceAll(src, "$struct", struct_ident); + src = utils::ReplaceAll(src, "$member_a", member_a_ident); + src = utils::ReplaceAll(src, "$member_b", member_b_ident); + + auto p = parser(src); + auto attrs = p->attribute_list(); + EXPECT_FALSE(attrs.errored); + EXPECT_FALSE(attrs.matched); + ASSERT_EQ(attrs.value.size(), 0u); + + auto s = p->struct_decl(attrs.value); + EXPECT_FALSE(p->has_error()); + EXPECT_FALSE(s.errored); + EXPECT_TRUE(s.matched); + ASSERT_NE(s.value, nullptr); + ASSERT_EQ(s->name, p->builder().Symbols().Register(struct_ident)); + ASSERT_EQ(s->members.size(), 2u); + EXPECT_EQ(s->members[0]->symbol, + p->builder().Symbols().Register(member_a_ident)); + EXPECT_EQ(s->members[1]->symbol, + p->builder().Symbols().Register(member_b_ident)); +} + TEST_F(ParserImplTest, StructDecl_ParsesWithAttribute) { auto p = parser(R"( [[block]] struct B { diff --git a/src/reader/wgsl/parser_impl_type_alias_test.cc 
b/src/reader/wgsl/parser_impl_type_alias_test.cc index 45e2e32307..301db6f031 100644 --- a/src/reader/wgsl/parser_impl_type_alias_test.cc +++ b/src/reader/wgsl/parser_impl_type_alias_test.cc @@ -34,7 +34,7 @@ TEST_F(ParserImplTest, TypeDecl_ParsesType) { EXPECT_EQ(t.value->source.range, (Source::Range{{1u, 1u}, {1u, 13u}})); } -TEST_F(ParserImplTest, TypeDecl_ParsesStruct_Ident) { +TEST_F(ParserImplTest, TypeDecl_Parses_Ident) { auto p = parser("type a = B"); auto t = p->type_alias(); @@ -49,6 +49,25 @@ TEST_F(ParserImplTest, TypeDecl_ParsesStruct_Ident) { EXPECT_EQ(alias->source.range, (Source::Range{{1u, 1u}, {1u, 11u}})); } +TEST_F(ParserImplTest, TypeDecl_Unicode_Parses_Ident) { + const std::string ident = // "๐“ถ๐”‚_๐“ฝ๐”‚๐“น๐“ฎ" + "\xf0\x9d\x93\xb6\xf0\x9d\x94\x82\x5f\xf0\x9d\x93\xbd\xf0\x9d\x94\x82\xf0" + "\x9d\x93\xb9\xf0\x9d\x93\xae"; + + auto p = parser("type " + ident + " = i32"); + + auto t = p->type_alias(); + EXPECT_FALSE(p->has_error()); + EXPECT_FALSE(t.errored); + EXPECT_TRUE(t.matched); + ASSERT_NE(t.value, nullptr); + ASSERT_TRUE(t.value->Is()); + auto* alias = t.value->As(); + EXPECT_EQ(p->builder().Symbols().NameFor(alias->name), ident); + EXPECT_TRUE(alias->type->Is()); + EXPECT_EQ(alias->source.range, (Source::Range{{1u, 1u}, {1u, 37u}})); +} + TEST_F(ParserImplTest, TypeDecl_MissingIdent) { auto p = parser("type = i32"); auto t = p->type_alias(); diff --git a/src/reader/wgsl/parser_impl_variable_decl_test.cc b/src/reader/wgsl/parser_impl_variable_decl_test.cc index 97bffb1cb4..88a5873347 100644 --- a/src/reader/wgsl/parser_impl_variable_decl_test.cc +++ b/src/reader/wgsl/parser_impl_variable_decl_test.cc @@ -18,7 +18,6 @@ namespace tint { namespace reader { namespace wgsl { namespace { - TEST_F(ParserImplTest, VariableDecl_Parses) { auto p = parser("var my_var : f32"); auto v = p->variable_decl(); @@ -33,6 +32,25 @@ TEST_F(ParserImplTest, VariableDecl_Parses) { EXPECT_EQ(v->type->source.range, (Source::Range{{1u, 14u}, {1u, 17u}})); } 
+TEST_F(ParserImplTest, VariableDecl_Unicode_Parses) { + const std::string ident = // "๐–Ž๐–‰๐–Š๐–“๐–™๐–Ž๐–‹๐–Ž๐–Š๐–—123" + "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93" + "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e" + "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33"; + + auto p = parser("var " + ident + " : f32"); + auto v = p->variable_decl(); + EXPECT_FALSE(p->has_error()); + EXPECT_TRUE(v.matched); + EXPECT_FALSE(v.errored); + EXPECT_EQ(v->name, ident); + EXPECT_NE(v->type, nullptr); + EXPECT_TRUE(v->type->Is()); + + EXPECT_EQ(v->source.range, (Source::Range{{1u, 5u}, {1u, 48u}})); + EXPECT_EQ(v->type->source.range, (Source::Range{{1u, 51u}, {1u, 54u}})); +} + TEST_F(ParserImplTest, VariableDecl_Inferred_Parses) { auto p = parser("var my_var = 1.0"); auto v = p->variable_decl(/*allow_inferred = */ true); diff --git a/src/source.h b/src/source.h index 4802d061ce..b40c9947ea 100644 --- a/src/source.h +++ b/src/source.h @@ -27,7 +27,7 @@ namespace tint { /// Source describes a range of characters within a source file. class Source { public: - /// FileContent describes the content of a source file. + /// FileContent describes the content of a source file encoded using utf-8. class FileContent { public: /// Constructs the FileContent with the given file content. @@ -78,7 +78,8 @@ class Source { public: /// the 1-based line number. 0 represents no line information. size_t line = 0; - /// the 1-based column number. 0 represents no column information. + /// the 1-based column number in utf8-code units (bytes). + /// 0 represents no column information. 
size_t column = 0; /// Returns true of `this` location is lexicographically less than `rhs` diff --git a/src/text/unicode.cc b/src/text/unicode.cc index 12767bf709..826eb5b890 100644 --- a/src/text/unicode.cc +++ b/src/text/unicode.cc @@ -427,7 +427,9 @@ std::ostream& operator<<(std::ostream& out, CodePoint code_point) { return out << "'U+" << std::hex << code_point.value << "'"; } -std::pair utf8::Decode(const uint8_t* ptr, size_t len) { +namespace utf8 { + +std::pair Decode(const uint8_t* ptr, size_t len) { if (len < 1) { return {}; } @@ -490,4 +492,15 @@ std::pair utf8::Decode(const uint8_t* ptr, size_t len) { return {c, n}; } +bool IsASCII(std::string_view str) { + for (auto c : str) { + if (c & 0x80) { + return false; + } + } + return true; +} + +} // namespace utf8 + } // namespace tint::text diff --git a/src/text/unicode.h b/src/text/unicode.h index c1ef9158da..3c74221ae5 100644 --- a/src/text/unicode.h +++ b/src/text/unicode.h @@ -69,6 +69,10 @@ namespace utf8 { /// If the next code point cannot be decoded then returns [0,0]. std::pair Decode(const uint8_t* ptr, size_t len); +/// @returns true if all the utf-8 code points in the string are ASCII +/// (code-points 0x00..0x7f). +bool IsASCII(std::string_view); + } // namespace utf8 } // namespace tint::text diff --git a/src/transform/glsl.cc b/src/transform/glsl.cc index 46cc8483a1..7744370305 100644 --- a/src/transform/glsl.cc +++ b/src/transform/glsl.cc @@ -56,7 +56,8 @@ Output Glsl::Run(const Program* in, const DataMap& inputs) const { data.Add(cfg->entry_point); } manager.Add(); - data.Add(Renamer::Target::kGlslKeywords); + data.Add(Renamer::Target::kGlslKeywords, + /* preserve_unicode */ false); manager.Add(); // Attempt to convert `loop`s into for-loops. 
This is to try and massage the diff --git a/src/transform/renamer.cc b/src/transform/renamer.cc index 0b49e6e7be..6863c3ddf4 100644 --- a/src/transform/renamer.cc +++ b/src/transform/renamer.cc @@ -21,13 +21,13 @@ #include "src/program_builder.h" #include "src/sem/call.h" #include "src/sem/member_accessor_expression.h" +#include "src/text/unicode.h" TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer); TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer::Data); TINT_INSTANTIATE_TYPEINFO(tint::transform::Renamer::Config); -namespace tint { -namespace transform { +namespace tint::transform { namespace { @@ -1245,7 +1245,7 @@ Renamer::Data::Data(Remappings&& r) : remappings(std::move(r)) {} Renamer::Data::Data(const Data&) = default; Renamer::Data::~Data() = default; -Renamer::Config::Config(Target t) : target(t) {} +Renamer::Config::Config(Target t, bool pu) : target(t), preserve_unicode(pu) {} Renamer::Config::Config(const Config&) = default; Renamer::Config::~Config() = default; @@ -1293,48 +1293,52 @@ Output Renamer::Run(const Program* in, const DataMap& inputs) const { Data::Remappings remappings; Target target = Target::kAll; + bool preserve_unicode = false; if (auto* cfg = inputs.Get()) { target = cfg->target; + preserve_unicode = cfg->preserve_unicode; } ctx.ReplaceAll([&](Symbol sym_in) { auto name_in = ctx.src->Symbols().NameFor(sym_in); - switch (target) { - case Target::kAll: - // Always rename. - break; - case Target::kGlslKeywords: - if (!std::binary_search( - kReservedKeywordsGLSL, - kReservedKeywordsGLSL + - sizeof(kReservedKeywordsGLSL) / sizeof(const char*), - name_in) && - name_in.compare(0, 3, "gl_")) { - // No match, just reuse the original name. - return ctx.dst->Symbols().New(name_in); - } - break; - case Target::kHlslKeywords: - if (!std::binary_search( - kReservedKeywordsHLSL, - kReservedKeywordsHLSL + - sizeof(kReservedKeywordsHLSL) / sizeof(const char*), - name_in)) { - // No match, just reuse the original name. 
- return ctx.dst->Symbols().New(name_in); - } - break; - case Target::kMslKeywords: - if (!std::binary_search( - kReservedKeywordsMSL, - kReservedKeywordsMSL + - sizeof(kReservedKeywordsMSL) / sizeof(const char*), - name_in)) { - // No match, just reuse the original name. - return ctx.dst->Symbols().New(name_in); - } - break; + if (preserve_unicode || text::utf8::IsASCII(name_in)) { + switch (target) { + case Target::kAll: + // Always rename. + break; + case Target::kGlslKeywords: + if (!std::binary_search( + kReservedKeywordsGLSL, + kReservedKeywordsGLSL + + sizeof(kReservedKeywordsGLSL) / sizeof(const char*), + name_in) && + name_in.compare(0, 3, "gl_")) { + // No match, just reuse the original name. + return ctx.dst->Symbols().New(name_in); + } + break; + case Target::kHlslKeywords: + if (!std::binary_search( + kReservedKeywordsHLSL, + kReservedKeywordsHLSL + + sizeof(kReservedKeywordsHLSL) / sizeof(const char*), + name_in)) { + // No match, just reuse the original name. + return ctx.dst->Symbols().New(name_in); + } + break; + case Target::kMslKeywords: + if (!std::binary_search( + kReservedKeywordsMSL, + kReservedKeywordsMSL + + sizeof(kReservedKeywordsMSL) / sizeof(const char*), + name_in)) { + // No match, just reuse the original name. + return ctx.dst->Symbols().New(name_in); + } + break; + } } auto sym_out = ctx.dst->Sym(); @@ -1359,5 +1363,4 @@ Output Renamer::Run(const Program* in, const DataMap& inputs) const { std::make_unique(std::move(remappings))); } -} // namespace transform -} // namespace tint +} // namespace tint::transform diff --git a/src/transform/renamer.h b/src/transform/renamer.h index 4bec367dbe..9b448e8145 100644 --- a/src/transform/renamer.h +++ b/src/transform/renamer.h @@ -20,8 +20,7 @@ #include "src/transform/transform.h" -namespace tint { -namespace transform { +namespace tint::transform { /// Renamer is a Transform that renames all the symbols in a program. 
class Renamer : public Castable { @@ -63,7 +62,9 @@ class Renamer : public Castable { struct Config : public Castable { /// Constructor /// @param tgt the targets to rename - explicit Config(Target tgt); + /// @param keep_unicode if false, symbols with non-ascii code-points are + /// renamed + explicit Config(Target tgt, bool keep_unicode = false); /// Copy constructor Config(const Config&); @@ -73,6 +74,9 @@ class Renamer : public Castable { /// The targets to rename Target const target = Target::kAll; + + /// If false, symbols with non-ascii code-points are renamed. + bool preserve_unicode = false; }; /// Constructor using a the configuration provided in the input Data @@ -88,7 +92,6 @@ class Renamer : public Castable { Output Run(const Program* program, const DataMap& data = {}) const override; }; -} // namespace transform -} // namespace tint +} // namespace tint::transform #endif // SRC_TRANSFORM_RENAMER_H_ diff --git a/src/transform/renamer_test.cc b/src/transform/renamer_test.cc index 0cdbdb43ec..56c887cb3e 100644 --- a/src/transform/renamer_test.cc +++ b/src/transform/renamer_test.cc @@ -23,6 +23,11 @@ namespace tint { namespace transform { namespace { +constexpr const char kUnicodeIdentifier[] = // "๐–Ž๐–‰๐–Š๐–“๐–™๐–Ž๐–‹๐–Ž๐–Š๐–—123" + "\xf0\x9d\x96\x8e\xf0\x9d\x96\x89\xf0\x9d\x96\x8a\xf0\x9d\x96\x93" + "\xf0\x9d\x96\x99\xf0\x9d\x96\x8e\xf0\x9d\x96\x8b\xf0\x9d\x96\x8e" + "\xf0\x9d\x96\x8a\xf0\x9d\x96\x97\x31\x32\x33"; + using ::testing::ContainerEq; using RenamerTest = TransformTest; @@ -184,6 +189,25 @@ fn tint_symbol() { EXPECT_THAT(data->remappings, ContainerEq(expected_remappings)); } +TEST_F(RenamerTest, PreserveUnicode) { + auto src = R"( +@stage(fragment) +fn frag_main() { + var )" + std::string(kUnicodeIdentifier) + + R"( : i32; +} +)"; + + auto expect = src; + + DataMap inputs; + inputs.Add(Renamer::Target::kMslKeywords, + /* preserve_unicode */ true); + auto got = Run(src, inputs); + + EXPECT_EQ(expect, str(got)); +} + 
TEST_F(RenamerTest, AttemptSymbolCollision) { auto* src = R"( @stage(vertex) @@ -244,7 +268,8 @@ fn frag_main() { )"; DataMap inputs; - inputs.Add(Renamer::Target::kGlslKeywords); + inputs.Add(Renamer::Target::kGlslKeywords, + /* preserve_unicode */ false); auto got = Run(src, inputs); EXPECT_EQ(expect, str(got)); @@ -269,7 +294,8 @@ fn frag_main() { )"; DataMap inputs; - inputs.Add(Renamer::Target::kHlslKeywords); + inputs.Add(Renamer::Target::kHlslKeywords, + /* preserve_unicode */ false); auto got = Run(src, inputs); EXPECT_EQ(expect, str(got)); @@ -294,7 +320,8 @@ fn frag_main() { )"; DataMap inputs; - inputs.Add(Renamer::Target::kMslKeywords); + inputs.Add(Renamer::Target::kMslKeywords, + /* preserve_unicode */ false); auto got = Run(src, inputs); EXPECT_EQ(expect, str(got)); @@ -528,7 +555,8 @@ INSTANTIATE_TEST_SUITE_P(RenamerTestGlsl, // "void", // WGSL keyword "volatile", // "while", // WGSL keyword - "writeonly")); + "writeonly", + kUnicodeIdentifier)); INSTANTIATE_TEST_SUITE_P(RenamerTestHlsl, RenamerTestHlsl, @@ -1142,8 +1170,9 @@ INSTANTIATE_TEST_SUITE_P(RenamerTestHlsl, "vertexshader", "virtual", // "void", // WGSL keyword - "volatile")); -// "while" // WGSL reserved keyword + "volatile", + // "while" // WGSL reserved keyword + kUnicodeIdentifier)); INSTANTIATE_TEST_SUITE_P( RenamerTestMsl, @@ -1425,7 +1454,9 @@ INSTANTIATE_TEST_SUITE_P( "M_2_PI_H", "M_2_SQRTPI_H", "M_SQRT2_H", - "M_SQRT1_2_H")); + "M_SQRT1_2_H", + // "while" // WGSL reserved keyword + kUnicodeIdentifier)); } // namespace } // namespace transform diff --git a/src/writer/hlsl/test_helper.h b/src/writer/hlsl/test_helper.h index e55b9c1e95..337a1ac20b 100644 --- a/src/writer/hlsl/test_helper.h +++ b/src/writer/hlsl/test_helper.h @@ -89,7 +89,8 @@ class TestHelperBase : public BODY, public ProgramBuilder { transform::Manager transform_manager; transform::DataMap transform_data; transform_data.Add( - transform::Renamer::Target::kHlslKeywords); + transform::Renamer::Target::kHlslKeywords, + 
/* preserve_unicode */ true); transform_manager.Add(); auto result = transform_manager.Run(&sanitized_result.program, transform_data); diff --git a/test/unicode/comments.wgsl b/test/unicode/comments.wgsl new file mode 100644 index 0000000000..0e42055222 --- /dev/null +++ b/test/unicode/comments.wgsl @@ -0,0 +1,11 @@ +// ร˜โ“‘๊šซ์นวด๐Œ’ๅฒพ๐Ÿฅโดตใจๅˆแฎ— + +/* +/* ๐Ÿ‘‹๐ŸŒŽ */ +เคจเคฎเคธเฅเคคเฅ‡ เคฆเฅเคจเคฟเคฏเคพ +*/ + +@stage(fragment) +fn /* ใ“ใ‚“ใซใกใฏไธ–็•Œ */ main( /* ไฝ ๅฅฝไธ–็•Œ */ ) { + // ์•ˆ๋…•ํ•˜์„ธ์š” ์„ธ๊ณ„ +} diff --git a/test/unicode/comments.wgsl.expected.glsl b/test/unicode/comments.wgsl.expected.glsl new file mode 100644 index 0000000000..bb42c59b0b --- /dev/null +++ b/test/unicode/comments.wgsl.expected.glsl @@ -0,0 +1,10 @@ +#version 310 es +precision mediump float; + +void tint_symbol() { +} + +void main() { + tint_symbol(); + return; +} diff --git a/test/unicode/comments.wgsl.expected.hlsl b/test/unicode/comments.wgsl.expected.hlsl new file mode 100644 index 0000000000..61d6d5716b --- /dev/null +++ b/test/unicode/comments.wgsl.expected.hlsl @@ -0,0 +1,3 @@ +void main() { + return; +} diff --git a/test/unicode/comments.wgsl.expected.msl b/test/unicode/comments.wgsl.expected.msl new file mode 100644 index 0000000000..ac4b77b52f --- /dev/null +++ b/test/unicode/comments.wgsl.expected.msl @@ -0,0 +1,7 @@ +#include + +using namespace metal; +fragment void tint_symbol() { + return; +} + diff --git a/test/unicode/comments.wgsl.expected.spvasm b/test/unicode/comments.wgsl.expected.spvasm new file mode 100644 index 0000000000..123699a260 --- /dev/null +++ b/test/unicode/comments.wgsl.expected.spvasm @@ -0,0 +1,16 @@ +; SPIR-V +; Version: 1.3 +; Generator: Google Tint Compiler; 0 +; Bound: 5 +; Schema: 0 + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint Fragment %main "main" + OpExecutionMode %main OriginUpperLeft + OpName %main "main" + %void = OpTypeVoid + %1 = OpTypeFunction %void + %main = OpFunction %void None %1 + %4 = 
OpLabel + OpReturn + OpFunctionEnd diff --git a/test/unicode/comments.wgsl.expected.wgsl b/test/unicode/comments.wgsl.expected.wgsl new file mode 100644 index 0000000000..3662a35207 --- /dev/null +++ b/test/unicode/comments.wgsl.expected.wgsl @@ -0,0 +1,3 @@ +@stage(fragment) +fn main() { +} diff --git a/test/unicode/indentifiers.wgsl b/test/unicode/indentifiers.wgsl new file mode 100644 index 0000000000..84c904a58b --- /dev/null +++ b/test/unicode/indentifiers.wgsl @@ -0,0 +1,12 @@ +type ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ = i32; +type ๐ญ๐ฒ๐ฉ๐ž_๐› = f32; + +fn ๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท(แต–แตƒสณแตƒแต : ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ) -> ๐ญ๐ฒ๐ฉ๐ž_๐› { + return ๐ญ๐ฒ๐ฉ๐ž_๐›(แต–แตƒสณแตƒแต); +} + +@stage(fragment) +fn ๐•–๐•Ÿ๐•ฅ๐•ฃ๐•ช๐•ก๐• ๐•š๐•Ÿ๐•ฅ() { + var ๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š : ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ; + let ๐–—๐–Š๐–˜๐–š๐–‘๐–™ = ๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท(๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š); +} diff --git a/test/unicode/indentifiers.wgsl.expected.glsl b/test/unicode/indentifiers.wgsl.expected.glsl new file mode 100644 index 0000000000..d5d5790133 --- /dev/null +++ b/test/unicode/indentifiers.wgsl.expected.glsl @@ -0,0 +1,16 @@ +#version 310 es +precision mediump float; + +float tint_symbol_2(int tint_symbol_3) { + return float(tint_symbol_3); +} + +void tint_symbol_4() { + int tint_symbol_5 = 0; + float tint_symbol_6 = tint_symbol_2(tint_symbol_5); +} + +void main() { + tint_symbol_4(); + return; +} diff --git a/test/unicode/indentifiers.wgsl.expected.hlsl b/test/unicode/indentifiers.wgsl.expected.hlsl new file mode 100644 index 0000000000..d82de3a9e7 --- /dev/null +++ b/test/unicode/indentifiers.wgsl.expected.hlsl @@ -0,0 +1,9 @@ +float tint_symbol_2(int tint_symbol_3) { + return float(tint_symbol_3); +} + +void tint_symbol_4() { + int tint_symbol_5 = 0; + const float tint_symbol_6 = tint_symbol_2(tint_symbol_5); + return; +} diff --git a/test/unicode/indentifiers.wgsl.expected.msl 
b/test/unicode/indentifiers.wgsl.expected.msl new file mode 100644 index 0000000000..ea0afa5971 --- /dev/null +++ b/test/unicode/indentifiers.wgsl.expected.msl @@ -0,0 +1,13 @@ +#include + +using namespace metal; +float tint_symbol_2(int tint_symbol_3) { + return float(tint_symbol_3); +} + +fragment void tint_symbol_4() { + int tint_symbol_5 = 0; + float const tint_symbol_6 = tint_symbol_2(tint_symbol_5); + return; +} + diff --git a/test/unicode/indentifiers.wgsl.expected.spvasm b/test/unicode/indentifiers.wgsl.expected.spvasm new file mode 100644 index 0000000000..93b3e825b4 --- /dev/null +++ b/test/unicode/indentifiers.wgsl.expected.spvasm @@ -0,0 +1,33 @@ +; SPIR-V +; Version: 1.3 +; Generator: Google Tint Compiler; 0 +; Bound: 17 +; Schema: 0 + OpCapability Shader + OpMemoryModel Logical GLSL450 + OpEntryPoint Fragment %________________________________________ "๐•–๐•Ÿ๐•ฅ๐•ฃ๐•ช๐•ก๐• ๐•š๐•Ÿ๐•ฅ" + OpExecutionMode %________________________________________ OriginUpperLeft + OpName %________________________________ "๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท" + OpName %______________ "แต–แตƒสณแตƒแต" + OpName %________________________________________ "๐•–๐•Ÿ๐•ฅ๐•ฃ๐•ช๐•ก๐• ๐•š๐•Ÿ๐•ฅ" + OpName %_________________________________0 "๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š" + %float = OpTypeFloat 32 + %int = OpTypeInt 32 1 + %1 = OpTypeFunction %float %int + %void = OpTypeVoid + %8 = OpTypeFunction %void +%_ptr_Function_int = OpTypePointer Function %int + %14 = OpConstantNull %int +%________________________________ = OpFunction %float None %1 +%______________ = OpFunctionParameter %int + %6 = OpLabel + %7 = OpConvertSToF %float %______________ + OpReturnValue %7 + OpFunctionEnd +%________________________________________ = OpFunction %void None %8 + %11 = OpLabel +%_________________________________0 = OpVariable %_ptr_Function_int Function %14 + %16 = OpLoad %int %_________________________________0 + %15 = OpFunctionCall %float %________________________________ %16 + 
OpReturn + OpFunctionEnd diff --git a/test/unicode/indentifiers.wgsl.expected.wgsl b/test/unicode/indentifiers.wgsl.expected.wgsl new file mode 100644 index 0000000000..d78ad348c1 --- /dev/null +++ b/test/unicode/indentifiers.wgsl.expected.wgsl @@ -0,0 +1,13 @@ +type ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ = i32; + +type ๐ญ๐ฒ๐ฉ๐ž_๐› = f32; + +fn ๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท(แต–แตƒสณแตƒแต : ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ) -> ๐ญ๐ฒ๐ฉ๐ž_๐› { + return ๐ญ๐ฒ๐ฉ๐ž_๐›(แต–แตƒสณแตƒแต); +} + +@stage(fragment) +fn ๐•–๐•Ÿ๐•ฅ๐•ฃ๐•ช๐•ก๐• ๐•š๐•Ÿ๐•ฅ() { + var ๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š : ๐“‰๐“Ž๐“…โ„ฏ_๐’ถ; + let ๐–—๐–Š๐–˜๐–š๐–‘๐–™ = ๐“ฏ๐“พ๐“ท๐“ฌ๐“ฝ๐“ฒ๐“ธ๐“ท(๐™ซ๐™–๐™ง๐™ž๐™–๐™—๐™ก๐™š); +}