Implement support for Unicode Pattern_White_Space

Bug: tint:1505 Bug: tint:1513 Change-Id: I40fa29c766dc35213e0846071322523e7fc81b79 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/86402 Reviewed-by: Ben Clayton <bclayton@google.com> Kokoro: Kokoro <noreply+kokoro@google.com> Commit-Queue: Antonio Maiorano <amaiorano@google.com>
2025-10-14 05:49:09 +00:00 · 2022-04-25 19:49:01 +00:00 · 2022-04-25 19:49:01 +00:00 · 25775308a9
commit 25775308a9
parent d97ff53261
4 changed files with 240 additions and 32 deletions
--- a/src/tint/reader/wgsl/lexer.cc
+++ b/src/tint/reader/wgsl/lexer.cc
@ -18,6 +18,8 @@
 #include <cmath>
 #include <cstring>
 #include <limits>
 #include <optional>  // NOLINT(build/include_order)
 #include <tuple>
 #include <utility>
 #include "src/tint/debug.h"
@ -26,9 +28,39 @@
 namespace tint::reader::wgsl {
 namespace {
-bool is_blankspace(char c) {
+// Unicode parsing code assumes that the size of a single std::string element is
-  // See https://www.w3.org/TR/WGSL/#blankspace.
+// 1 byte.
-  return c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r';
+static_assert(sizeof(decltype(tint::Source::FileContent::data[0])) ==
                  sizeof(uint8_t),
              "tint::reader::wgsl requires the size of a std::string element "
              "to be a single byte");
 // Decodes the UTF-8 code point starting at byte offset `i` of `str` and
 // reports whether it is a same-line blankspace code point: space, horizontal
 // tab, left-to-right mark or right-to-left mark.
 // See https://www.w3.org/TR/WGSL/#blankspace
 // @param str the text being scanned
 // @param i byte offset into `str` of the code point to examine
 // @param is_blankspace receives true if the code point is blankspace,
 //        otherwise false
 // @param blankspace_size receives the encoded size in bytes of the blankspace
 //        code point, or 0 if the code point is not blankspace
 // @returns false if no code point could be decoded at `i` (both outputs are
 //          then false / 0), otherwise true
 bool read_blankspace(std::string_view str,
                      size_t i,
                      bool* is_blankspace,
                      size_t* blankspace_size) {
  // Give the outputs a defined value on every path so callers never read
  // indeterminate data.
  *is_blankspace = false;
  *blankspace_size = 0;
  if (i >= str.size()) {
    // Nothing to decode. Also avoids indexing past the end of the view and
    // the unsigned underflow of `str.size() - i`.
    return false;
  }
  auto* utf8 = reinterpret_cast<const uint8_t*>(str.data() + i);
  auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);
  if (n == 0) {
    // Zero bytes consumed: treated as a decode failure.
    return false;
  }
  static const auto kSpace = text::CodePoint(0x0020);  // space
  static const auto kHTab = text::CodePoint(0x0009);   // horizontal tab
  static const auto kL2R = text::CodePoint(0x200E);    // left-to-right mark
  static const auto kR2L = text::CodePoint(0x200F);    // right-to-left mark
  if (cp == kSpace || cp == kHTab || cp == kL2R || cp == kR2L) {
    *is_blankspace = true;
    *blankspace_size = n;
  }
  return true;
 }
 uint32_t dec_value(char c) {
@ -181,11 +213,16 @@ Token Lexer::skip_blankspace_and_comments() {
        continue;
      }
-      if (!is_blankspace(at(pos()))) {
+      bool is_blankspace;
      size_t blankspace_size;
      if (!read_blankspace(line(), pos(), &is_blankspace, &blankspace_size)) {
        return {Token::Type::kError, begin_source(), "invalid UTF-8"};
      }
      if (!is_blankspace) {
        break;
      }
-      advance();
+      advance(blankspace_size);
    }
    auto t = skip_comment();
@ -207,10 +244,8 @@ Token Lexer::skip_blankspace_and_comments() {
 Token Lexer::skip_comment() {
  if (matches(pos(), "//")) {
-    // Line comment: ignore everything until the end of input or a blankspace
+    // Line comment: ignore everything until the end of line.
-    // character other than space or horizontal tab.
+    while (!is_eol()) {
    while (!is_eol() && !(is_blankspace(at(pos())) && !matches(pos(), " ") &&
                          !matches(pos(), "\t"))) {
      if (is_null()) {
        return {Token::Type::kError, begin_source(), "null character found"};
      }
@ -758,11 +793,6 @@ Token Lexer::try_ident() {
  auto source = begin_source();
  auto start = pos();
  // This below assumes that the size of a single std::string element is 1 byte.
  static_assert(sizeof(at(0)) == sizeof(uint8_t),
                "tint::reader::wgsl requires the size of a std::string element "
                "to be a single byte");
  // Must begin with an XID_Start unicode character, or underscore
  {
    auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos()));
--- a/src/tint/reader/wgsl/lexer_test.cc
+++ b/src/tint/reader/wgsl/lexer_test.cc
@ -23,6 +23,23 @@ namespace {
 using LexerTest = testing::Test;
 // Blankspace constants. These are macros on purpose to be able to easily build
 // up string literals with them.
 //
 // Each value is the UTF-8 encoding of the code point named in its comment.
 //
 // Same line code points
 #define kSpace " "           // U+0020 space
 #define kHTab "\t"           // U+0009 horizontal tab
 #define kL2R "\xE2\x80\x8E"  // U+200E left-to-right mark
 #define kR2L "\xE2\x80\x8F"  // U+200F right-to-left mark
 // Line break code points
 #define kCR "\r"             // U+000D carriage return
 #define kLF "\n"             // U+000A line feed
 #define kVTab "\x0B"         // U+000B vertical tab
 #define kFF "\x0C"           // U+000C form feed
 #define kNL "\xC2\x85"       // U+0085 next line
 #define kLS "\xE2\x80\xA8"   // U+2028 line separator
 #define kPS "\xE2\x80\xA9"   // U+2029 paragraph separator
 TEST_F(LexerTest, Empty) {
  Source::File file("", "");
  Lexer l(&file);
@ -30,7 +47,7 @@ TEST_F(LexerTest, Empty) {
  EXPECT_TRUE(t.IsEof());
 }
-TEST_F(LexerTest, Skips_Blankspace) {
+TEST_F(LexerTest, Skips_Blankspace_Basic) {
  Source::File file("", "\t\r\n\t    ident\t\n\t  \r ");
  Lexer l(&file);
@ -46,6 +63,25 @@ TEST_F(LexerTest, Skips_Blankspace) {
  EXPECT_TRUE(t.IsEof());
 }
 TEST_F(LexerTest, Skips_Blankspace_Exotic) {
  // A lone identifier wrapped, on both sides, in the non-ASCII-space
  // blankspace sequence: five line breaks followed by the two directional
  // marks. Only the identifier should be produced, on line 6.
  const std::string content =
      kVTab kFF kNL kLS kPS kL2R kR2L "ident" kVTab kFF kNL kLS kPS kL2R kR2L;
  Source::File file("", content);
  Lexer lexer(&file);

  auto token = lexer.next();
  EXPECT_TRUE(token.IsIdentifier());
  const auto range = token.source().range;
  EXPECT_EQ(range.begin.line, 6u);
  EXPECT_EQ(range.begin.column, 7u);
  EXPECT_EQ(range.end.line, 6u);
  EXPECT_EQ(range.end.column, 12u);
  EXPECT_EQ(token.to_str(), "ident");

  // Trailing blankspace must be consumed without yielding further tokens.
  token = lexer.next();
  EXPECT_TRUE(token.IsEof());
 }
 TEST_F(LexerTest, Skips_Comments_Line) {
  Source::File file("", R"(//starts with comment
 ident1 //ends with comment
@ -73,11 +109,38 @@ ident1 //ends with comment
  EXPECT_TRUE(t.IsEof());
 }
-using LineCommentTerminatorTest = testing::TestWithParam<char>;
+TEST_F(LexerTest, Skips_Comments_Unicode) {
  // Line comments containing multi-byte UTF-8 text (emoji) are expected to be
  // skipped, leaving only the two identifiers as tokens.
  Source::File file("", R"(// starts with 🙂🙂🙂
 ident1 //ends with 🙂🙂🙂
 // blank line
 ident2)");
  Lexer l(&file);
  // First token: `ident1`, expected on line 2 after the leading comment line.
  auto t = l.next();
  EXPECT_TRUE(t.IsIdentifier());
  EXPECT_EQ(t.source().range.begin.line, 2u);
  EXPECT_EQ(t.source().range.begin.column, 1u);
  EXPECT_EQ(t.source().range.end.line, 2u);
  EXPECT_EQ(t.source().range.end.column, 7u);
  EXPECT_EQ(t.to_str(), "ident1");
  // Second token: `ident2`, expected on line 4 after the comment-only line.
  t = l.next();
  EXPECT_TRUE(t.IsIdentifier());
  EXPECT_EQ(t.source().range.begin.line, 4u);
  EXPECT_EQ(t.source().range.begin.column, 2u);
  EXPECT_EQ(t.source().range.end.line, 4u);
  EXPECT_EQ(t.source().range.end.column, 8u);
  EXPECT_EQ(t.to_str(), "ident2");
  // Nothing else should remain in the input.
  t = l.next();
  EXPECT_TRUE(t.IsEof());
 }
 using LineCommentTerminatorTest = testing::TestWithParam<const char*>;
 TEST_P(LineCommentTerminatorTest, Terminators) {
-  // Test that line comments are ended by blankspace characters other than space
+  // Test that line comments are ended by blankspace characters other than
-  // and horizontal tab.
+  // space, horizontal tab, left-to-right mark, and right-to-left mark.
-  char c = GetParam();
+  auto c = GetParam();
  std::string src = "let// This is a comment";
  src += c;
  src += "ident";
@ -91,9 +154,13 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
  EXPECT_EQ(t.source().range.end.line, 1u);
  EXPECT_EQ(t.source().range.end.column, 4u);
-  if (c != ' ' && c != '\t') {
+  auto is_same_line = [](std::string_view v) {
-    size_t line = c == '\n' ? 2u : 1u;
+    return v == kSpace || v == kHTab || v == kL2R || v == kR2L;
-    size_t col = c == '\n' ? 1u : 25u;
+  };
  if (!is_same_line(c)) {
    size_t line = is_same_line(c) ? 1u : 2u;
    size_t col = is_same_line(c) ? 25u : 1u;
    t = l.next();
    EXPECT_TRUE(t.IsIdentifier());
    EXPECT_EQ(t.source().range.begin.line, line);
@ -108,7 +175,20 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
 }
 INSTANTIATE_TEST_SUITE_P(LexerTest,
                         LineCommentTerminatorTest,
-                         testing::Values(' ', '\t', '\n', '\v', '\f', '\r'));
+                         testing::Values(
                             // same line
                             kSpace,
                             kHTab,
                             kCR,  // NOTE: CR is a line break (is_same_line() excludes it)
                             kL2R,
                             kR2L,
                             // line break
                             kLF,
                             kVTab,
                             kFF,
                             kNL,
                             kLS,
                             kPS));
 TEST_F(LexerTest, Skips_Comments_Block) {
  Source::File file("", R"(/* comment
--- a/src/tint/source.cc
+++ b/src/tint/source.cc
@ -19,21 +19,82 @@
 #include <string_view>
 #include <utility>
 #include "src/tint/text/unicode.h"
 namespace tint {
 namespace {
 // Decodes the UTF-8 code point starting at byte offset `i` of `str` and
 // reports whether a line break begins there. The recognized line breaks are
 // LF, VT, FF, NEL, LS, PS, a lone CR, and the pair CR LF (one break).
 // See https://www.w3.org/TR/WGSL/#blankspace
 // @param str the text being scanned
 // @param i byte offset into `str` of the code point to examine
 // @param is_line_break receives true if a line break starts at `i`,
 //        otherwise false
 // @param line_break_size receives the size in bytes of the line break
 //        sequence, or 0 if there is no line break at `i`
 // @returns false if no code point could be decoded at `i` (both outputs are
 //          then false / 0), otherwise true
 bool ParseLineBreak(std::string_view str,
                     size_t i,
                     bool* is_line_break,
                     size_t* line_break_size) {
  // Give the outputs a defined value on every path, including decode failure.
  *is_line_break = false;
  *line_break_size = 0;
  if (i >= str.size()) {
    // Nothing to decode. Also avoids indexing past the end of the view and
    // the unsigned underflow of `str.size() - i`.
    return false;
  }
  auto* utf8 = reinterpret_cast<const uint8_t*>(str.data() + i);
  auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);
  if (n == 0) {
    // Zero bytes consumed: treated as a decode failure.
    return false;
  }
  static const auto kLF = text::CodePoint(0x000A);    // line feed
  static const auto kVTab = text::CodePoint(0x000B);  // vertical tab
  static const auto kFF = text::CodePoint(0x000C);    // form feed
  static const auto kNL = text::CodePoint(0x0085);    // next line
  static const auto kCR = text::CodePoint(0x000D);    // carriage return
  static const auto kLS = text::CodePoint(0x2028);    // line separator
  static const auto kPS = text::CodePoint(0x2029);    // paragraph separator
  if (cp == kLF || cp == kVTab || cp == kFF || cp == kNL || cp == kPS ||
      cp == kLS) {
    *is_line_break = true;
    *line_break_size = n;
    return true;
  }
  // Handle CRLF as one line break, and CR alone as one line break
  if (cp == kCR) {
    *is_line_break = true;
    *line_break_size = n;
    // LF is the single byte 0x0A in UTF-8, so peeking at the next byte is
    // enough. This also keeps a valid CR from being reported as a decode
    // error merely because the bytes after it are malformed.
    if (auto next_i = i + n; next_i < str.size() && str[next_i] == '\n') {
      // CRLF as one break
      *line_break_size = n + 1;
    }
    return true;
  }
  return true;
 }
 std::vector<std::string_view> SplitLines(std::string_view str) {
  std::vector<std::string_view> lines;
  size_t lineStart = 0;
-  for (size_t i = 0; i < str.size(); ++i) {
+  for (size_t i = 0; i < str.size();) {
-    if (str[i] == '\n') {
+    bool is_line_break{};
-      // Handle CRLF on Windows
+    size_t line_break_size{};
-      size_t curr = i;
+    // We don't handle decode errors from ParseLineBreak. Instead, we rely on
-      if (i > 0 && str[i - 1] == '\r') {
+    // the Lexer to do so.
-        --curr;
+    ParseLineBreak(str, i, &is_line_break, &line_break_size);
-      }
+    if (is_line_break) {
-      lines.push_back(str.substr(lineStart, curr - lineStart));
+      lines.push_back(str.substr(lineStart, i - lineStart));
-      lineStart = i + 1;
+      i += line_break_size;
      lineStart = i;
    } else {
      ++i;
    }
  }
  if (lineStart < str.size()) {
--- a/src/tint/source_test.cc
+++ b/src/tint/source_test.cc
@ -62,5 +62,42 @@ TEST_F(SourceFileContentTest, MoveCtor) {
  EXPECT_EQ(fc.lines[2], "line three");
 }
 // Line break code points, each spelled as its UTF-8 encoding. These are
 // macros so that adjacent ones concatenate into one literal (e.g. kCR kLF).
 #define kCR "\r"            // U+000D carriage return
 #define kLF "\n"            // U+000A line feed
 #define kVTab "\x0B"        // U+000B vertical tab
 #define kFF "\x0C"          // U+000C form feed
 #define kNL "\xC2\x85"      // U+0085 next line
 #define kLS "\xE2\x80\xA8"  // U+2028 line separator
 #define kPS "\xE2\x80\xA9"  // U+2029 paragraph separator
 using LineBreakTest = testing::TestWithParam<const char*>;
 TEST_P(LineBreakTest, Single) {
  // One line break between two pieces of text must yield exactly two lines,
  // neither of which contains the line break itself.
  std::string src = "line one";
  src += GetParam();
  src += "line two";
  Source::FileContent fc(src);
  // ASSERT (not EXPECT): stop here on a size mismatch so the element accesses
  // below can never index out of bounds.
  ASSERT_EQ(fc.lines.size(), 2u);
  EXPECT_EQ(fc.lines[0], "line one");
  EXPECT_EQ(fc.lines[1], "line two");
 }
 TEST_P(LineBreakTest, Double) {
  // Two consecutive line breaks must yield an empty line between the two
  // pieces of text.
  std::string src = "line one";
  src += GetParam();
  src += GetParam();
  src += "line two";
  Source::FileContent fc(src);
  // ASSERT (not EXPECT): stop here on a size mismatch so the element accesses
  // below can never index out of bounds.
  ASSERT_EQ(fc.lines.size(), 3u);
  EXPECT_EQ(fc.lines[0], "line one");
  EXPECT_EQ(fc.lines[1], "");
  EXPECT_EQ(fc.lines[2], "line two");
 }
 // Runs the LineBreakTest cases once per line break sequence: each single
 // line-break code point, plus the two-code-point CR LF pair.
 INSTANTIATE_TEST_SUITE_P(
    SourceFileContentTest,
    LineBreakTest,
    testing::Values(kVTab, kFF, kNL, kLS, kPS, kLF, kCR, kCR kLF));
 }  // namespace
 }  // namespace tint