Implement support for Unicode Pattern_White_Space

Bug: tint:1505
Bug: tint:1513
Change-Id: I40fa29c766dc35213e0846071322523e7fc81b79
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/86402
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
This commit is contained in:
Antonio Maiorano 2022-04-25 19:49:01 +00:00 committed by Dawn LUCI CQ
parent d97ff53261
commit 25775308a9
4 changed files with 240 additions and 32 deletions

View File

@ -18,6 +18,8 @@
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <limits> #include <limits>
#include <optional> // NOLINT(build/include_order)
#include <tuple>
#include <utility> #include <utility>
#include "src/tint/debug.h" #include "src/tint/debug.h"
@ -26,9 +28,39 @@
namespace tint::reader::wgsl { namespace tint::reader::wgsl {
namespace { namespace {
bool is_blankspace(char c) { // Unicode parsing code assumes that the size of a single std::string element is
// See https://www.w3.org/TR/WGSL/#blankspace. // 1 byte.
return c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r'; static_assert(sizeof(decltype(tint::Source::FileContent::data[0])) ==
sizeof(uint8_t),
"tint::reader::wgsl requires the size of a std::string element "
"to be a single byte");
// Decodes the UTF-8 code point that starts at byte offset `i` of `str` and
// reports whether it is one of the same-line blankspace code points: space,
// horizontal tab, left-to-right mark or right-to-left mark.
//
// On a match, `*is_blankspace` is set to true and `*blankspace_size` receives
// the encoded length of the code point in bytes. When the code point decodes
// but is not blankspace, `*is_blankspace` is set to false and
// `*blankspace_size` is not written.
//
// Returns false, writing neither output, if text::utf8::Decode reports a
// length of zero (the caller reports this as invalid UTF-8). Returns true
// otherwise.
bool read_blankspace(std::string_view str,
                     size_t i,
                     bool* is_blankspace,
                     size_t* blankspace_size) {
  // See https://www.w3.org/TR/WGSL/#blankspace
  const size_t remaining = str.size() - i;
  auto [code_point, width] = text::utf8::Decode(
      reinterpret_cast<const uint8_t*>(&str[i]), remaining);
  if (width == 0) {
    return false;
  }

  static const auto kSpace = text::CodePoint(0x0020);  // space
  static const auto kHTab = text::CodePoint(0x0009);   // horizontal tab
  static const auto kL2R = text::CodePoint(0x200E);    // left-to-right mark
  static const auto kR2L = text::CodePoint(0x200F);    // right-to-left mark

  const bool matched = code_point == kSpace || code_point == kHTab ||
                       code_point == kL2R || code_point == kR2L;
  *is_blankspace = matched;
  if (matched) {
    // The size is only meaningful (and only written) for blankspace.
    *blankspace_size = width;
  }
  return true;
}
uint32_t dec_value(char c) { uint32_t dec_value(char c) {
@ -181,11 +213,16 @@ Token Lexer::skip_blankspace_and_comments() {
continue; continue;
} }
if (!is_blankspace(at(pos()))) { bool is_blankspace;
size_t blankspace_size;
if (!read_blankspace(line(), pos(), &is_blankspace, &blankspace_size)) {
return {Token::Type::kError, begin_source(), "invalid UTF-8"};
}
if (!is_blankspace) {
break; break;
} }
advance(); advance(blankspace_size);
} }
auto t = skip_comment(); auto t = skip_comment();
@ -207,10 +244,8 @@ Token Lexer::skip_blankspace_and_comments() {
Token Lexer::skip_comment() { Token Lexer::skip_comment() {
if (matches(pos(), "//")) { if (matches(pos(), "//")) {
// Line comment: ignore everything until the end of input or a blankspace // Line comment: ignore everything until the end of line.
// character other than space or horizontal tab. while (!is_eol()) {
while (!is_eol() && !(is_blankspace(at(pos())) && !matches(pos(), " ") &&
!matches(pos(), "\t"))) {
if (is_null()) { if (is_null()) {
return {Token::Type::kError, begin_source(), "null character found"}; return {Token::Type::kError, begin_source(), "null character found"};
} }
@ -758,11 +793,6 @@ Token Lexer::try_ident() {
auto source = begin_source(); auto source = begin_source();
auto start = pos(); auto start = pos();
// This below assumes that the size of a single std::string element is 1 byte.
static_assert(sizeof(at(0)) == sizeof(uint8_t),
"tint::reader::wgsl requires the size of a std::string element "
"to be a single byte");
// Must begin with an XID_Source unicode character, or underscore // Must begin with an XID_Source unicode character, or underscore
{ {
auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos())); auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos()));

View File

@ -23,6 +23,23 @@ namespace {
using LexerTest = testing::Test; using LexerTest = testing::Test;
// Blankspace constants. These are macros on purpose to be able to easily build
// up string literals with them.
//
// Same line code points
#define kSpace " "
#define kHTab "\t"
#define kL2R "\xE2\x80\x8E"
#define kR2L "\xE2\x80\x8F"
// Line break code points
#define kCR "\r"
#define kLF "\n"
#define kVTab "\x0B"
#define kFF "\x0C"
#define kNL "\xC2\x85"
#define kLS "\xE2\x80\xA8"
#define kPS "\xE2\x80\xA9"
TEST_F(LexerTest, Empty) { TEST_F(LexerTest, Empty) {
Source::File file("", ""); Source::File file("", "");
Lexer l(&file); Lexer l(&file);
@ -30,7 +47,7 @@ TEST_F(LexerTest, Empty) {
EXPECT_TRUE(t.IsEof()); EXPECT_TRUE(t.IsEof());
} }
TEST_F(LexerTest, Skips_Blankspace) { TEST_F(LexerTest, Skips_Blankspace_Basic) {
Source::File file("", "\t\r\n\t ident\t\n\t \r "); Source::File file("", "\t\r\n\t ident\t\n\t \r ");
Lexer l(&file); Lexer l(&file);
@ -46,6 +63,25 @@ TEST_F(LexerTest, Skips_Blankspace) {
EXPECT_TRUE(t.IsEof()); EXPECT_TRUE(t.IsEof());
} }
// Surrounds an identifier with the less common blankspace code points
// (vertical tab, form feed, next line, line separator, paragraph separator,
// left-to-right mark, right-to-left mark) and checks that the lexer skips all
// of them and reports the identifier at the expected position.
TEST_F(LexerTest, Skips_Blankspace_Exotic) {
  Source::File file("",                               //
                    kVTab kFF kNL kLS kPS kL2R kR2L  //
                    "ident"                           //
                    kVTab kFF kNL kLS kPS kL2R kR2L);
  Lexer lexer(&file);

  auto ident = lexer.next();
  EXPECT_TRUE(ident.IsIdentifier());
  // Five line-break code points precede the identifier, so it sits on line 6.
  // The two 3-byte marks before it give a start column of 7, which is
  // consistent with columns being counted in bytes.
  EXPECT_EQ(ident.source().range.begin.line, 6u);
  EXPECT_EQ(ident.source().range.begin.column, 7u);
  EXPECT_EQ(ident.source().range.end.line, 6u);
  EXPECT_EQ(ident.source().range.end.column, 12u);
  EXPECT_EQ(ident.to_str(), "ident");

  // Only blankspace follows the identifier, so the next token is end-of-file.
  auto eof = lexer.next();
  EXPECT_TRUE(eof.IsEof());
}
TEST_F(LexerTest, Skips_Comments_Line) { TEST_F(LexerTest, Skips_Comments_Line) {
Source::File file("", R"(//starts with comment Source::File file("", R"(//starts with comment
ident1 //ends with comment ident1 //ends with comment
@ -73,11 +109,38 @@ ident1 //ends with comment
EXPECT_TRUE(t.IsEof()); EXPECT_TRUE(t.IsEof());
} }
// Checks that line comments containing multi-byte UTF-8 text (emoji) are
// skipped, and that the identifiers around them are reported at the expected
// line and column.
using LineCommentTerminatorTest = testing::TestWithParam<char>; TEST_F(LexerTest, Skips_Comments_Unicode) {
Source::File file("", R"(// starts with 🙂🙂🙂
ident1 //ends with 🙂🙂🙂
// blank line
ident2)");
Lexer l(&file);
// First token: `ident1` on the second source line, after the leading comment.
auto t = l.next();
EXPECT_TRUE(t.IsIdentifier());
EXPECT_EQ(t.source().range.begin.line, 2u);
EXPECT_EQ(t.source().range.begin.column, 1u);
EXPECT_EQ(t.source().range.end.line, 2u);
EXPECT_EQ(t.source().range.end.column, 7u);
EXPECT_EQ(t.to_str(), "ident1");
// Second token: `ident2` on the fourth source line.
// NOTE(review): the expected start column of 2 implies one character precedes
// `ident2` on that line (presumably a leading space lost in this view) --
// confirm against the real file.
t = l.next();
EXPECT_TRUE(t.IsIdentifier());
EXPECT_EQ(t.source().range.begin.line, 4u);
EXPECT_EQ(t.source().range.begin.column, 2u);
EXPECT_EQ(t.source().range.end.line, 4u);
EXPECT_EQ(t.source().range.end.column, 8u);
EXPECT_EQ(t.to_str(), "ident2");
// Nothing but end-of-file remains.
t = l.next();
EXPECT_TRUE(t.IsEof());
}
using LineCommentTerminatorTest = testing::TestWithParam<const char*>;
TEST_P(LineCommentTerminatorTest, Terminators) { TEST_P(LineCommentTerminatorTest, Terminators) {
// Test that line comments are ended by blankspace characters other than space // Test that line comments are ended by blankspace characters other than
// and horizontal tab. // space, horizontal tab, left-to-right mark, and right-to-left mark.
char c = GetParam(); auto c = GetParam();
std::string src = "let// This is a comment"; std::string src = "let// This is a comment";
src += c; src += c;
src += "ident"; src += "ident";
@ -91,9 +154,13 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
EXPECT_EQ(t.source().range.end.line, 1u); EXPECT_EQ(t.source().range.end.line, 1u);
EXPECT_EQ(t.source().range.end.column, 4u); EXPECT_EQ(t.source().range.end.column, 4u);
if (c != ' ' && c != '\t') { auto is_same_line = [](std::string_view v) {
size_t line = c == '\n' ? 2u : 1u; return v == kSpace || v == kHTab || v == kL2R || v == kR2L;
size_t col = c == '\n' ? 1u : 25u; };
if (!is_same_line(c)) {
size_t line = is_same_line(c) ? 1u : 2u;
size_t col = is_same_line(c) ? 25u : 1u;
t = l.next(); t = l.next();
EXPECT_TRUE(t.IsIdentifier()); EXPECT_TRUE(t.IsIdentifier());
EXPECT_EQ(t.source().range.begin.line, line); EXPECT_EQ(t.source().range.begin.line, line);
@ -108,7 +175,20 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
} }
INSTANTIATE_TEST_SUITE_P(LexerTest, INSTANTIATE_TEST_SUITE_P(LexerTest,
LineCommentTerminatorTest, LineCommentTerminatorTest,
testing::Values(' ', '\t', '\n', '\v', '\f', '\r')); testing::Values(
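// same line (kCR is listed in this group, but the test body does not treat
// it as same-line: it expects the identifier after kCR on the next line)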
kSpace,
kHTab,
kCR,
kL2R,
kR2L,
// line break
kLF,
kVTab,
kFF,
kNL,
kLS,
kPS));
TEST_F(LexerTest, Skips_Comments_Block) { TEST_F(LexerTest, Skips_Comments_Block) {
Source::File file("", R"(/* comment Source::File file("", R"(/* comment

View File

@ -19,21 +19,82 @@
#include <string_view> #include <string_view>
#include <utility> #include <utility>
#include "src/tint/text/unicode.h"
namespace tint { namespace tint {
namespace { namespace {
// Checks whether a WGSL line break starts at byte offset `i` of `str`.
//
// On a match, `*is_line_break` is set to true and `*line_break_size` receives
// the length of the break in bytes. A carriage return immediately followed by
// a line feed is reported as one break covering both code points. When the
// code point at `i` decodes but is not a line break, `*is_line_break` is set
// to false and `*line_break_size` is not written.
//
// Returns false if text::utf8::Decode reports a length of zero, either for the
// code point at `i` (neither output is written) or for the code point that
// follows a carriage return (the outputs then describe the lone carriage
// return). Returns true otherwise.
bool ParseLineBreak(std::string_view str,
                    size_t i,
                    bool* is_line_break,
                    size_t* line_break_size) {
  // See https://www.w3.org/TR/WGSL/#blankspace
  auto [code_point, width] = text::utf8::Decode(
      reinterpret_cast<const uint8_t*>(&str[i]), str.size() - i);
  if (width == 0) {
    return false;
  }

  static const auto kLineFeed = text::CodePoint(0x000A);
  static const auto kVerticalTab = text::CodePoint(0x000B);
  static const auto kFormFeed = text::CodePoint(0x000C);
  static const auto kNextLine = text::CodePoint(0x0085);
  static const auto kCarriageReturn = text::CodePoint(0x000D);
  static const auto kLineSeparator = text::CodePoint(0x2028);
  static const auto kParagraphSeparator = text::CodePoint(0x2029);

  if (code_point == kCarriageReturn) {
    // A carriage return is a line break by itself; CR LF is folded into a
    // single, longer break.
    *is_line_break = true;
    *line_break_size = width;

    const size_t after_cr = i + width;
    if (after_cr >= str.size()) {
      return true;  // CR is the last code point of the input.
    }

    auto [next_code_point, next_width] = text::utf8::Decode(
        reinterpret_cast<const uint8_t*>(&str[after_cr]),
        str.size() - after_cr);
    if (next_width == 0) {
      return false;
    }
    if (next_code_point == kLineFeed) {
      *line_break_size = width + next_width;
    }
    return true;
  }

  // Every other line break is exactly one code point long.
  const bool single_break =
      code_point == kLineFeed || code_point == kVerticalTab ||
      code_point == kFormFeed || code_point == kNextLine ||
      code_point == kParagraphSeparator || code_point == kLineSeparator;
  *is_line_break = single_break;
  if (single_break) {
    *line_break_size = width;
  }
  return true;
}
std::vector<std::string_view> SplitLines(std::string_view str) { std::vector<std::string_view> SplitLines(std::string_view str) {
std::vector<std::string_view> lines; std::vector<std::string_view> lines;
size_t lineStart = 0; size_t lineStart = 0;
for (size_t i = 0; i < str.size(); ++i) { for (size_t i = 0; i < str.size();) {
if (str[i] == '\n') { bool is_line_break{};
// Handle CRLF on Windows size_t line_break_size{};
size_t curr = i; // We don't handle decode errors from ParseLineBreak. Instead, we rely on
if (i > 0 && str[i - 1] == '\r') { // the Lexer to do so.
--curr; ParseLineBreak(str, i, &is_line_break, &line_break_size);
} if (is_line_break) {
lines.push_back(str.substr(lineStart, curr - lineStart)); lines.push_back(str.substr(lineStart, i - lineStart));
lineStart = i + 1; i += line_break_size;
lineStart = i;
} else {
++i;
} }
} }
if (lineStart < str.size()) { if (lineStart < str.size()) {

View File

@ -62,5 +62,42 @@ TEST_F(SourceFileContentTest, MoveCtor) {
EXPECT_EQ(fc.lines[2], "line three"); EXPECT_EQ(fc.lines[2], "line three");
} }
// Line break code points
#define kCR "\r"
#define kLF "\n"
#define kVTab "\x0B"
#define kFF "\x0C"
#define kNL "\xC2\x85"
#define kLS "\xE2\x80\xA8"
#define kPS "\xE2\x80\xA9"
using LineBreakTest = testing::TestWithParam<const char*>;
// One line-break sequence between two runs of text must split the content
// into exactly two lines, neither of which contains the break itself.
TEST_P(LineBreakTest, Single) {
  std::string text = "line one";
  text.append(GetParam()).append("line two");

  Source::FileContent content(text);
  EXPECT_EQ(content.lines.size(), 2u);
  EXPECT_EQ(content.lines[0], "line one");
  EXPECT_EQ(content.lines[1], "line two");
}
// Two consecutive line-break sequences must produce an empty line between the
// two runs of text, for three lines in total.
TEST_P(LineBreakTest, Double) {
  std::string text = "line one";
  text.append(GetParam()).append(GetParam()).append("line two");

  Source::FileContent content(text);
  EXPECT_EQ(content.lines.size(), 3u);
  EXPECT_EQ(content.lines[0], "line one");
  EXPECT_EQ(content.lines[1], "");
  EXPECT_EQ(content.lines[2], "line two");
}
// Runs every LineBreakTest case once per line-break sequence: the seven
// single code points, plus the two-code-point CR LF pair (`kCR kLF` is
// adjacent string-literal concatenation, i.e. "\r\n").
INSTANTIATE_TEST_SUITE_P(
    SourceFileContentTest,
    LineBreakTest,
    testing::Values(kVTab, kFF, kNL, kLS, kPS, kLF, kCR, kCR kLF));
} // namespace } // namespace
} // namespace tint } // namespace tint