Implement support for Unicode Pattern_White_Space

Bug: tint:1505
Bug: tint:1513
Change-Id: I40fa29c766dc35213e0846071322523e7fc81b79
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/86402
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
2022-04-25 19:49:01 +00:00 · 2022-04-25 19:49:01 +00:00 · 25775308a9
parent d97ff53261
commit 25775308a9
4 changed files with 240 additions and 32 deletions
--- a/src/tint/reader/wgsl/lexer.cc
+++ b/src/tint/reader/wgsl/lexer.cc
@ -18,6 +18,8 @@
 #include <cmath>
 #include <cstring>
 #include <limits>
+#include <optional>  // NOLINT(build/include_order)
+#include <tuple>
 #include <utility>

 #include "src/tint/debug.h"
@ -26,9 +28,39 @@
 namespace tint::reader::wgsl {
 namespace {

-bool is_blankspace(char c) {
-  // See https://www.w3.org/TR/WGSL/#blankspace.
-  return c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r';
+// Unicode parsing code assumes that the size of a single std::string element is
+// 1 byte.
+static_assert(sizeof(decltype(tint::Source::FileContent::data[0])) ==
+                  sizeof(uint8_t),
+              "tint::reader::wgsl requires the size of a std::string element "
+              "to be a single byte");
+
+bool read_blankspace(std::string_view str,
+                     size_t i,
+                     bool* is_blankspace,
+                     size_t* blankspace_size) {
+  // Returns false on invalid UTF-8. See https://www.w3.org/TR/WGSL/#blankspace

+  auto* utf8 = reinterpret_cast<const uint8_t*>(&str[i]);
+  auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);

+  if (n == 0) {
+    return false;
+  }

+  static const auto kSpace = text::CodePoint(0x0020);  // space
+  static const auto kHTab = text::CodePoint(0x0009);   // horizontal tab
+  static const auto kL2R = text::CodePoint(0x200E);    // left-to-right mark
+  static const auto kR2L = text::CodePoint(0x200F);    // right-to-left mark

+  if (cp == kSpace || cp == kHTab || cp == kL2R || cp == kR2L) {
+    *is_blankspace = true;
+    *blankspace_size = n;
+    return true;
+  }

+  *is_blankspace = false;
+  return true;
 }

 uint32_t dec_value(char c) {
@ -181,11 +213,16 @@ Token Lexer::skip_blankspace_and_comments() {
        continue;
      }

-      if (!is_blankspace(at(pos()))) {
+      bool is_blankspace;
+      size_t blankspace_size;
+      if (!read_blankspace(line(), pos(), &is_blankspace, &blankspace_size)) {
+        return {Token::Type::kError, begin_source(), "invalid UTF-8"};
+      }
+      if (!is_blankspace) {
        break;
      }

-      advance();
+      advance(blankspace_size);
    }

    auto t = skip_comment();
@ -207,10 +244,8 @@ Token Lexer::skip_blankspace_and_comments() {

 Token Lexer::skip_comment() {
  if (matches(pos(), "//")) {
-    // Line comment: ignore everything until the end of input or a blankspace
-    // character other than space or horizontal tab.
-    while (!is_eol() && !(is_blankspace(at(pos())) && !matches(pos(), " ") &&
-                          !matches(pos(), "\t"))) {
+    // Line comment: ignore everything until the end of line.
+    while (!is_eol()) {
      if (is_null()) {
        return {Token::Type::kError, begin_source(), "null character found"};
      }
@ -758,11 +793,6 @@ Token Lexer::try_ident() {
  auto source = begin_source();
  auto start = pos();

-  // This below assumes that the size of a single std::string element is 1 byte.
-  static_assert(sizeof(at(0)) == sizeof(uint8_t),
-                "tint::reader::wgsl requires the size of a std::string element "
-                "to be a single byte");
-
  // Must begin with an XID_Source unicode character, or underscore
  {
    auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos()));
--- a/src/tint/reader/wgsl/lexer_test.cc
+++ b/src/tint/reader/wgsl/lexer_test.cc
@ -23,6 +23,23 @@ namespace {

 using LexerTest = testing::Test;

+// Blankspace constants. These are macros on purpose to be able to easily build
+// up string literals with them.
+//
+// Same line code points
+#define kSpace " "
+#define kHTab "\t"
+#define kL2R "\xE2\x80\x8E"
+#define kR2L "\xE2\x80\x8F"
+// Line break code points
+#define kCR "\r"
+#define kLF "\n"
+#define kVTab "\x0B"
+#define kFF "\x0C"
+#define kNL "\xC2\x85"
+#define kLS "\xE2\x80\xA8"
+#define kPS "\xE2\x80\xA9"
+
 TEST_F(LexerTest, Empty) {
  Source::File file("", "");
  Lexer l(&file);
@ -30,7 +47,7 @@ TEST_F(LexerTest, Empty) {
  EXPECT_TRUE(t.IsEof());
 }

-TEST_F(LexerTest, Skips_Blankspace) {
+TEST_F(LexerTest, Skips_Blankspace_Basic) {
  Source::File file("", "\t\r\n\t    ident\t\n\t  \r ");
  Lexer l(&file);

@ -46,6 +63,25 @@ TEST_F(LexerTest, Skips_Blankspace) {
  EXPECT_TRUE(t.IsEof());
 }

+TEST_F(LexerTest, Skips_Blankspace_Exotic) {
+  Source::File file("",                              //
+                    kVTab kFF kNL kLS kPS kL2R kR2L  //
+                    "ident"                          //
+                    kVTab kFF kNL kLS kPS kL2R kR2L);
+  Lexer l(&file);
+
+  auto t = l.next();
+  EXPECT_TRUE(t.IsIdentifier());
+  EXPECT_EQ(t.source().range.begin.line, 6u);
+  EXPECT_EQ(t.source().range.begin.column, 7u);
+  EXPECT_EQ(t.source().range.end.line, 6u);
+  EXPECT_EQ(t.source().range.end.column, 12u);
+  EXPECT_EQ(t.to_str(), "ident");
+
+  t = l.next();
+  EXPECT_TRUE(t.IsEof());
+}
+
 TEST_F(LexerTest, Skips_Comments_Line) {
  Source::File file("", R"(//starts with comment
 ident1 //ends with comment
@ -73,11 +109,38 @@ ident1 //ends with comment
  EXPECT_TRUE(t.IsEof());
 }

-using LineCommentTerminatorTest = testing::TestWithParam<char>;
+TEST_F(LexerTest, Skips_Comments_Unicode) {
+  Source::File file("", R"(// starts with 🙂🙂🙂
+ident1 //ends with 🙂🙂🙂
+// blank line
+ ident2)");
+  Lexer l(&file);
+
+  auto t = l.next();
+  EXPECT_TRUE(t.IsIdentifier());
+  EXPECT_EQ(t.source().range.begin.line, 2u);
+  EXPECT_EQ(t.source().range.begin.column, 1u);
+  EXPECT_EQ(t.source().range.end.line, 2u);
+  EXPECT_EQ(t.source().range.end.column, 7u);
+  EXPECT_EQ(t.to_str(), "ident1");
+
+  t = l.next();
+  EXPECT_TRUE(t.IsIdentifier());
+  EXPECT_EQ(t.source().range.begin.line, 4u);
+  EXPECT_EQ(t.source().range.begin.column, 2u);
+  EXPECT_EQ(t.source().range.end.line, 4u);
+  EXPECT_EQ(t.source().range.end.column, 8u);
+  EXPECT_EQ(t.to_str(), "ident2");
+
+  t = l.next();
+  EXPECT_TRUE(t.IsEof());
+}
+
+using LineCommentTerminatorTest = testing::TestWithParam<const char*>;
 TEST_P(LineCommentTerminatorTest, Terminators) {
-  // Test that line comments are ended by blankspace characters other than space
-  // and horizontal tab.
-  char c = GetParam();
+  // Test that line comments are ended by blankspace characters other than
+  // space, horizontal tab, left-to-right mark, and right-to-left mark.
+  auto c = GetParam();
  std::string src = "let// This is a comment";
  src += c;
  src += "ident";
@ -91,9 +154,13 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
  EXPECT_EQ(t.source().range.end.line, 1u);
  EXPECT_EQ(t.source().range.end.column, 4u);

-  if (c != ' ' && c != '\t') {
-    size_t line = c == '\n' ? 2u : 1u;
-    size_t col = c == '\n' ? 1u : 25u;
+  auto is_same_line = [](std::string_view v) {
+    return v == kSpace || v == kHTab || v == kL2R || v == kR2L;
+  };
+
+  if (!is_same_line(c)) {
+    size_t line = is_same_line(c) ? 1u : 2u;
+    size_t col = is_same_line(c) ? 25u : 1u;
    t = l.next();
    EXPECT_TRUE(t.IsIdentifier());
    EXPECT_EQ(t.source().range.begin.line, line);
@ -108,7 +175,20 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
 }
 INSTANTIATE_TEST_SUITE_P(LexerTest,
                         LineCommentTerminatorTest,
-                         testing::Values(' ', '\t', '\n', '\v', '\f', '\r'));
+                         testing::Values(
+                             // same line
+                             kSpace,
+                             kHTab,
+                             kCR,
+                             kL2R,
+                             kR2L,
+                             // line break
+                             kLF,
+                             kVTab,
+                             kFF,
+                             kNL,
+                             kLS,
+                             kPS));

 TEST_F(LexerTest, Skips_Comments_Block) {
  Source::File file("", R"(/* comment
--- a/src/tint/source.cc
+++ b/src/tint/source.cc
@ -19,21 +19,82 @@
 #include <string_view>
 #include <utility>

+#include "src/tint/text/unicode.h"
+
 namespace tint {
 namespace {
+
+bool ParseLineBreak(std::string_view str,
+                    size_t i,
+                    bool* is_line_break,
+                    size_t* line_break_size) {
+  // Returns false on decode error. See https://www.w3.org/TR/WGSL/#blankspace

+  auto* utf8 = reinterpret_cast<const uint8_t*>(&str[i]);
+  auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);

+  if (n == 0) {
+    return false;
+  }

+  static const auto kLF = text::CodePoint(0x000A);    // line feed
+  static const auto kVTab = text::CodePoint(0x000B);  // vertical tab
+  static const auto kFF = text::CodePoint(0x000C);    // form feed
+  static const auto kNL = text::CodePoint(0x0085);    // next line
+  static const auto kCR = text::CodePoint(0x000D);    // carriage return
+  static const auto kLS = text::CodePoint(0x2028);    // line separator
+  static const auto kPS = text::CodePoint(0x2029);    // paragraph separator

+  if (cp == kLF || cp == kVTab || cp == kFF || cp == kNL || cp == kPS ||
+      cp == kLS) {
+    *is_line_break = true;
+    *line_break_size = n;
+    return true;
+  }

+  // Handle CRLF as one line break, and CR alone as one line break
+  if (cp == kCR) {
+    *is_line_break = true;
+    *line_break_size = n;

+    if (auto next_i = i + n; next_i < str.size()) {
+      auto* next_utf8 = reinterpret_cast<const uint8_t*>(&str[next_i]);
+      auto [next_cp, next_n] =
+          text::utf8::Decode(next_utf8, str.size() - next_i);

+      if (next_n == 0) {
+        return false;
+      }

+      if (next_cp == kLF) {
+        // CRLF as one break
+        *line_break_size = n + next_n;
+      }
+    }

+    return true;
+  }

+  *is_line_break = false;
+  return true;
+}
+
 std::vector<std::string_view> SplitLines(std::string_view str) {
  std::vector<std::string_view> lines;

  size_t lineStart = 0;
-  for (size_t i = 0; i < str.size(); ++i) {
-    if (str[i] == '\n') {
-      // Handle CRLF on Windows
-      size_t curr = i;
-      if (i > 0 && str[i - 1] == '\r') {
-        --curr;
-      }
-      lines.push_back(str.substr(lineStart, curr - lineStart));
-      lineStart = i + 1;
+  for (size_t i = 0; i < str.size();) {
+    bool is_line_break{};
+    size_t line_break_size{};
+    // We don't handle decode errors from ParseLineBreak. Instead, we rely on
+    // the Lexer to do so.
+    ParseLineBreak(str, i, &is_line_break, &line_break_size);
+    if (is_line_break) {
+      lines.push_back(str.substr(lineStart, i - lineStart));
+      i += line_break_size;
+      lineStart = i;
+    } else {
+      ++i;
    }
  }
  if (lineStart < str.size()) {
--- a/src/tint/source_test.cc
+++ b/src/tint/source_test.cc
@ -62,5 +62,42 @@ TEST_F(SourceFileContentTest, MoveCtor) {
  EXPECT_EQ(fc.lines[2], "line three");
 }

+// Line break code points
+#define kCR "\r"
+#define kLF "\n"
+#define kVTab "\x0B"
+#define kFF "\x0C"
+#define kNL "\xC2\x85"
+#define kLS "\xE2\x80\xA8"
+#define kPS "\xE2\x80\xA9"
+
+using LineBreakTest = testing::TestWithParam<const char*>;
+TEST_P(LineBreakTest, Single) {
+  std::string src = "line one";
+  src += GetParam();
+  src += "line two";
+
+  Source::FileContent fc(src);
+  EXPECT_EQ(fc.lines.size(), 2u);
+  EXPECT_EQ(fc.lines[0], "line one");
+  EXPECT_EQ(fc.lines[1], "line two");
+}
+TEST_P(LineBreakTest, Double) {
+  std::string src = "line one";
+  src += GetParam();
+  src += GetParam();
+  src += "line two";
+
+  Source::FileContent fc(src);
+  EXPECT_EQ(fc.lines.size(), 3u);
+  EXPECT_EQ(fc.lines[0], "line one");
+  EXPECT_EQ(fc.lines[1], "");
+  EXPECT_EQ(fc.lines[2], "line two");
+}
+INSTANTIATE_TEST_SUITE_P(
+    SourceFileContentTest,
+    LineBreakTest,
+    testing::Values(kVTab, kFF, kNL, kLS, kPS, kLF, kCR, kCR kLF));
+
 }  // namespace
 }  // namespace tint