Implement support for Unicode Pattern_White_Space
Bug: tint:1505
Bug: tint:1513
Change-Id: I40fa29c766dc35213e0846071322523e7fc81b79
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/86402
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
This commit is contained in:
parent
d97ff53261
commit
25775308a9
|
@ -18,6 +18,8 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
#include <optional> // NOLINT(build/include_order)
|
||||||
|
#include <tuple>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "src/tint/debug.h"
|
#include "src/tint/debug.h"
|
||||||
|
@ -26,9 +28,39 @@
|
||||||
namespace tint::reader::wgsl {
|
namespace tint::reader::wgsl {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
bool is_blankspace(char c) {
|
// Unicode parsing code assumes that the size of a single std::string element is
|
||||||
// See https://www.w3.org/TR/WGSL/#blankspace.
|
// 1 byte.
|
||||||
return c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r';
|
static_assert(sizeof(decltype(tint::Source::FileContent::data[0])) ==
|
||||||
|
sizeof(uint8_t),
|
||||||
|
"tint::reader::wgsl requires the size of a std::string element "
|
||||||
|
"to be a single byte");
|
||||||
|
|
||||||
|
bool read_blankspace(std::string_view str,
|
||||||
|
size_t i,
|
||||||
|
bool* is_blankspace,
|
||||||
|
size_t* blankspace_size) {
|
||||||
|
// See https://www.w3.org/TR/WGSL/#blankspace
|
||||||
|
|
||||||
|
auto* utf8 = reinterpret_cast<const uint8_t*>(&str[i]);
|
||||||
|
auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);
|
||||||
|
|
||||||
|
if (n == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const auto kSpace = text::CodePoint(0x0020); // space
|
||||||
|
static const auto kHTab = text::CodePoint(0x0009); // horizontal tab
|
||||||
|
static const auto kL2R = text::CodePoint(0x200E); // left-to-right mark
|
||||||
|
static const auto kR2L = text::CodePoint(0x200F); // right-to-left mark
|
||||||
|
|
||||||
|
if (cp == kSpace || cp == kHTab || cp == kL2R || cp == kR2L) {
|
||||||
|
*is_blankspace = true;
|
||||||
|
*blankspace_size = n;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
*is_blankspace = false;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t dec_value(char c) {
|
uint32_t dec_value(char c) {
|
||||||
|
@ -181,11 +213,16 @@ Token Lexer::skip_blankspace_and_comments() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_blankspace(at(pos()))) {
|
bool is_blankspace;
|
||||||
|
size_t blankspace_size;
|
||||||
|
if (!read_blankspace(line(), pos(), &is_blankspace, &blankspace_size)) {
|
||||||
|
return {Token::Type::kError, begin_source(), "invalid UTF-8"};
|
||||||
|
}
|
||||||
|
if (!is_blankspace) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
advance();
|
advance(blankspace_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto t = skip_comment();
|
auto t = skip_comment();
|
||||||
|
@ -207,10 +244,8 @@ Token Lexer::skip_blankspace_and_comments() {
|
||||||
|
|
||||||
Token Lexer::skip_comment() {
|
Token Lexer::skip_comment() {
|
||||||
if (matches(pos(), "//")) {
|
if (matches(pos(), "//")) {
|
||||||
// Line comment: ignore everything until the end of input or a blankspace
|
// Line comment: ignore everything until the end of line.
|
||||||
// character other than space or horizontal tab.
|
while (!is_eol()) {
|
||||||
while (!is_eol() && !(is_blankspace(at(pos())) && !matches(pos(), " ") &&
|
|
||||||
!matches(pos(), "\t"))) {
|
|
||||||
if (is_null()) {
|
if (is_null()) {
|
||||||
return {Token::Type::kError, begin_source(), "null character found"};
|
return {Token::Type::kError, begin_source(), "null character found"};
|
||||||
}
|
}
|
||||||
|
@ -758,11 +793,6 @@ Token Lexer::try_ident() {
|
||||||
auto source = begin_source();
|
auto source = begin_source();
|
||||||
auto start = pos();
|
auto start = pos();
|
||||||
|
|
||||||
// This below assumes that the size of a single std::string element is 1 byte.
|
|
||||||
static_assert(sizeof(at(0)) == sizeof(uint8_t),
|
|
||||||
"tint::reader::wgsl requires the size of a std::string element "
|
|
||||||
"to be a single byte");
|
|
||||||
|
|
||||||
// Must begin with an XID_Start unicode character, or underscore
|
// Must begin with an XID_Start unicode character, or underscore
|
||||||
{
|
{
|
||||||
auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos()));
|
auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos()));
|
||||||
|
|
|
@ -23,6 +23,23 @@ namespace {
|
||||||
|
|
||||||
using LexerTest = testing::Test;
|
using LexerTest = testing::Test;
|
||||||
|
|
||||||
|
// Blankspace constants. These are macros on purpose to be able to easily build
|
||||||
|
// up string literals with them.
|
||||||
|
//
|
||||||
|
// Same line code points
|
||||||
|
#define kSpace " "
|
||||||
|
#define kHTab "\t"
|
||||||
|
#define kL2R "\xE2\x80\x8E"
|
||||||
|
#define kR2L "\xE2\x80\x8F"
|
||||||
|
// Line break code points
|
||||||
|
#define kCR "\r"
|
||||||
|
#define kLF "\n"
|
||||||
|
#define kVTab "\x0B"
|
||||||
|
#define kFF "\x0C"
|
||||||
|
#define kNL "\xC2\x85"
|
||||||
|
#define kLS "\xE2\x80\xA8"
|
||||||
|
#define kPS "\xE2\x80\xA9"
|
||||||
|
|
||||||
TEST_F(LexerTest, Empty) {
|
TEST_F(LexerTest, Empty) {
|
||||||
Source::File file("", "");
|
Source::File file("", "");
|
||||||
Lexer l(&file);
|
Lexer l(&file);
|
||||||
|
@ -30,7 +47,7 @@ TEST_F(LexerTest, Empty) {
|
||||||
EXPECT_TRUE(t.IsEof());
|
EXPECT_TRUE(t.IsEof());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LexerTest, Skips_Blankspace) {
|
TEST_F(LexerTest, Skips_Blankspace_Basic) {
|
||||||
Source::File file("", "\t\r\n\t ident\t\n\t \r ");
|
Source::File file("", "\t\r\n\t ident\t\n\t \r ");
|
||||||
Lexer l(&file);
|
Lexer l(&file);
|
||||||
|
|
||||||
|
@ -46,6 +63,25 @@ TEST_F(LexerTest, Skips_Blankspace) {
|
||||||
EXPECT_TRUE(t.IsEof());
|
EXPECT_TRUE(t.IsEof());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(LexerTest, Skips_Blankspace_Exotic) {
|
||||||
|
Source::File file("", //
|
||||||
|
kVTab kFF kNL kLS kPS kL2R kR2L //
|
||||||
|
"ident" //
|
||||||
|
kVTab kFF kNL kLS kPS kL2R kR2L);
|
||||||
|
Lexer l(&file);
|
||||||
|
|
||||||
|
auto t = l.next();
|
||||||
|
EXPECT_TRUE(t.IsIdentifier());
|
||||||
|
EXPECT_EQ(t.source().range.begin.line, 6u);
|
||||||
|
EXPECT_EQ(t.source().range.begin.column, 7u);
|
||||||
|
EXPECT_EQ(t.source().range.end.line, 6u);
|
||||||
|
EXPECT_EQ(t.source().range.end.column, 12u);
|
||||||
|
EXPECT_EQ(t.to_str(), "ident");
|
||||||
|
|
||||||
|
t = l.next();
|
||||||
|
EXPECT_TRUE(t.IsEof());
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(LexerTest, Skips_Comments_Line) {
|
TEST_F(LexerTest, Skips_Comments_Line) {
|
||||||
Source::File file("", R"(//starts with comment
|
Source::File file("", R"(//starts with comment
|
||||||
ident1 //ends with comment
|
ident1 //ends with comment
|
||||||
|
@ -73,11 +109,38 @@ ident1 //ends with comment
|
||||||
EXPECT_TRUE(t.IsEof());
|
EXPECT_TRUE(t.IsEof());
|
||||||
}
|
}
|
||||||
|
|
||||||
using LineCommentTerminatorTest = testing::TestWithParam<char>;
|
TEST_F(LexerTest, Skips_Comments_Unicode) {
|
||||||
|
Source::File file("", R"(// starts with 🙂🙂🙂
|
||||||
|
ident1 //ends with 🙂🙂🙂
|
||||||
|
// blank line
|
||||||
|
ident2)");
|
||||||
|
Lexer l(&file);
|
||||||
|
|
||||||
|
auto t = l.next();
|
||||||
|
EXPECT_TRUE(t.IsIdentifier());
|
||||||
|
EXPECT_EQ(t.source().range.begin.line, 2u);
|
||||||
|
EXPECT_EQ(t.source().range.begin.column, 1u);
|
||||||
|
EXPECT_EQ(t.source().range.end.line, 2u);
|
||||||
|
EXPECT_EQ(t.source().range.end.column, 7u);
|
||||||
|
EXPECT_EQ(t.to_str(), "ident1");
|
||||||
|
|
||||||
|
t = l.next();
|
||||||
|
EXPECT_TRUE(t.IsIdentifier());
|
||||||
|
EXPECT_EQ(t.source().range.begin.line, 4u);
|
||||||
|
EXPECT_EQ(t.source().range.begin.column, 2u);
|
||||||
|
EXPECT_EQ(t.source().range.end.line, 4u);
|
||||||
|
EXPECT_EQ(t.source().range.end.column, 8u);
|
||||||
|
EXPECT_EQ(t.to_str(), "ident2");
|
||||||
|
|
||||||
|
t = l.next();
|
||||||
|
EXPECT_TRUE(t.IsEof());
|
||||||
|
}
|
||||||
|
|
||||||
|
using LineCommentTerminatorTest = testing::TestWithParam<const char*>;
|
||||||
TEST_P(LineCommentTerminatorTest, Terminators) {
|
TEST_P(LineCommentTerminatorTest, Terminators) {
|
||||||
// Test that line comments are ended by blankspace characters other than space
|
// Test that line comments are ended by blankspace characters other than
|
||||||
// and horizontal tab.
|
// space, horizontal tab, left-to-right mark, and right-to-left mark.
|
||||||
char c = GetParam();
|
auto c = GetParam();
|
||||||
std::string src = "let// This is a comment";
|
std::string src = "let// This is a comment";
|
||||||
src += c;
|
src += c;
|
||||||
src += "ident";
|
src += "ident";
|
||||||
|
@ -91,9 +154,13 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
|
||||||
EXPECT_EQ(t.source().range.end.line, 1u);
|
EXPECT_EQ(t.source().range.end.line, 1u);
|
||||||
EXPECT_EQ(t.source().range.end.column, 4u);
|
EXPECT_EQ(t.source().range.end.column, 4u);
|
||||||
|
|
||||||
if (c != ' ' && c != '\t') {
|
auto is_same_line = [](std::string_view v) {
|
||||||
size_t line = c == '\n' ? 2u : 1u;
|
return v == kSpace || v == kHTab || v == kL2R || v == kR2L;
|
||||||
size_t col = c == '\n' ? 1u : 25u;
|
};
|
||||||
|
|
||||||
|
if (!is_same_line(c)) {
|
||||||
|
size_t line = is_same_line(c) ? 1u : 2u;
|
||||||
|
size_t col = is_same_line(c) ? 25u : 1u;
|
||||||
t = l.next();
|
t = l.next();
|
||||||
EXPECT_TRUE(t.IsIdentifier());
|
EXPECT_TRUE(t.IsIdentifier());
|
||||||
EXPECT_EQ(t.source().range.begin.line, line);
|
EXPECT_EQ(t.source().range.begin.line, line);
|
||||||
|
@ -108,7 +175,20 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
|
||||||
}
|
}
|
||||||
INSTANTIATE_TEST_SUITE_P(LexerTest,
|
INSTANTIATE_TEST_SUITE_P(LexerTest,
|
||||||
LineCommentTerminatorTest,
|
LineCommentTerminatorTest,
|
||||||
testing::Values(' ', '\t', '\n', '\v', '\f', '\r'));
|
testing::Values(
|
||||||
|
// same line
|
||||||
|
kSpace,
|
||||||
|
kHTab,
|
||||||
|
kCR,
|
||||||
|
kL2R,
|
||||||
|
kR2L,
|
||||||
|
// line break
|
||||||
|
kLF,
|
||||||
|
kVTab,
|
||||||
|
kFF,
|
||||||
|
kNL,
|
||||||
|
kLS,
|
||||||
|
kPS));
|
||||||
|
|
||||||
TEST_F(LexerTest, Skips_Comments_Block) {
|
TEST_F(LexerTest, Skips_Comments_Block) {
|
||||||
Source::File file("", R"(/* comment
|
Source::File file("", R"(/* comment
|
||||||
|
|
|
@ -19,21 +19,82 @@
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
|
#include "src/tint/text/unicode.h"
|
||||||
|
|
||||||
namespace tint {
|
namespace tint {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
bool ParseLineBreak(std::string_view str,
|
||||||
|
size_t i,
|
||||||
|
bool* is_line_break,
|
||||||
|
size_t* line_break_size) {
|
||||||
|
// See https://www.w3.org/TR/WGSL/#blankspace
|
||||||
|
|
||||||
|
auto* utf8 = reinterpret_cast<const uint8_t*>(&str[i]);
|
||||||
|
auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);
|
||||||
|
|
||||||
|
if (n == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const auto kLF = text::CodePoint(0x000A); // line feed
|
||||||
|
static const auto kVTab = text::CodePoint(0x000B); // vertical tab
|
||||||
|
static const auto kFF = text::CodePoint(0x000C); // form feed
|
||||||
|
static const auto kNL = text::CodePoint(0x0085); // next line
|
||||||
|
static const auto kCR = text::CodePoint(0x000D); // carriage return
|
||||||
|
static const auto kLS = text::CodePoint(0x2028); // line separator
|
||||||
|
static const auto kPS = text::CodePoint(0x2029); // paragraph separator
|
||||||
|
|
||||||
|
if (cp == kLF || cp == kVTab || cp == kFF || cp == kNL || cp == kPS ||
|
||||||
|
cp == kLS) {
|
||||||
|
*is_line_break = true;
|
||||||
|
*line_break_size = n;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle CRLF as one line break, and CR alone as one line break
|
||||||
|
if (cp == kCR) {
|
||||||
|
*is_line_break = true;
|
||||||
|
*line_break_size = n;
|
||||||
|
|
||||||
|
if (auto next_i = i + n; next_i < str.size()) {
|
||||||
|
auto* next_utf8 = reinterpret_cast<const uint8_t*>(&str[next_i]);
|
||||||
|
auto [next_cp, next_n] =
|
||||||
|
text::utf8::Decode(next_utf8, str.size() - next_i);
|
||||||
|
|
||||||
|
if (next_n == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (next_cp == kLF) {
|
||||||
|
// CRLF as one break
|
||||||
|
*line_break_size = n + next_n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
*is_line_break = false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::string_view> SplitLines(std::string_view str) {
|
std::vector<std::string_view> SplitLines(std::string_view str) {
|
||||||
std::vector<std::string_view> lines;
|
std::vector<std::string_view> lines;
|
||||||
|
|
||||||
size_t lineStart = 0;
|
size_t lineStart = 0;
|
||||||
for (size_t i = 0; i < str.size(); ++i) {
|
for (size_t i = 0; i < str.size();) {
|
||||||
if (str[i] == '\n') {
|
bool is_line_break{};
|
||||||
// Handle CRLF on Windows
|
size_t line_break_size{};
|
||||||
size_t curr = i;
|
// We don't handle decode errors from ParseLineBreak. Instead, we rely on
|
||||||
if (i > 0 && str[i - 1] == '\r') {
|
// the Lexer to do so.
|
||||||
--curr;
|
ParseLineBreak(str, i, &is_line_break, &line_break_size);
|
||||||
}
|
if (is_line_break) {
|
||||||
lines.push_back(str.substr(lineStart, curr - lineStart));
|
lines.push_back(str.substr(lineStart, i - lineStart));
|
||||||
lineStart = i + 1;
|
i += line_break_size;
|
||||||
|
lineStart = i;
|
||||||
|
} else {
|
||||||
|
++i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (lineStart < str.size()) {
|
if (lineStart < str.size()) {
|
||||||
|
|
|
@ -62,5 +62,42 @@ TEST_F(SourceFileContentTest, MoveCtor) {
|
||||||
EXPECT_EQ(fc.lines[2], "line three");
|
EXPECT_EQ(fc.lines[2], "line three");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Line break code points
|
||||||
|
#define kCR "\r"
|
||||||
|
#define kLF "\n"
|
||||||
|
#define kVTab "\x0B"
|
||||||
|
#define kFF "\x0C"
|
||||||
|
#define kNL "\xC2\x85"
|
||||||
|
#define kLS "\xE2\x80\xA8"
|
||||||
|
#define kPS "\xE2\x80\xA9"
|
||||||
|
|
||||||
|
using LineBreakTest = testing::TestWithParam<const char*>;
|
||||||
|
TEST_P(LineBreakTest, Single) {
|
||||||
|
std::string src = "line one";
|
||||||
|
src += GetParam();
|
||||||
|
src += "line two";
|
||||||
|
|
||||||
|
Source::FileContent fc(src);
|
||||||
|
EXPECT_EQ(fc.lines.size(), 2u);
|
||||||
|
EXPECT_EQ(fc.lines[0], "line one");
|
||||||
|
EXPECT_EQ(fc.lines[1], "line two");
|
||||||
|
}
|
||||||
|
TEST_P(LineBreakTest, Double) {
|
||||||
|
std::string src = "line one";
|
||||||
|
src += GetParam();
|
||||||
|
src += GetParam();
|
||||||
|
src += "line two";
|
||||||
|
|
||||||
|
Source::FileContent fc(src);
|
||||||
|
EXPECT_EQ(fc.lines.size(), 3u);
|
||||||
|
EXPECT_EQ(fc.lines[0], "line one");
|
||||||
|
EXPECT_EQ(fc.lines[1], "");
|
||||||
|
EXPECT_EQ(fc.lines[2], "line two");
|
||||||
|
}
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
|
SourceFileContentTest,
|
||||||
|
LineBreakTest,
|
||||||
|
testing::Values(kVTab, kFF, kNL, kLS, kPS, kLF, kCR, kCR kLF));
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace tint
|
} // namespace tint
|
||||||
|
|
Loading…
Reference in New Issue