Implement support for Unicode Pattern_White_Space
Bug: tint:1505 Bug: tint:1513 Change-Id: I40fa29c766dc35213e0846071322523e7fc81b79 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/86402 Reviewed-by: Ben Clayton <bclayton@google.com> Kokoro: Kokoro <noreply+kokoro@google.com> Commit-Queue: Antonio Maiorano <amaiorano@google.com>
This commit is contained in:
parent
d97ff53261
commit
25775308a9
|
@ -18,6 +18,8 @@
|
|||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <optional> // NOLINT(build/include_order)
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "src/tint/debug.h"
|
||||
|
@ -26,9 +28,39 @@
|
|||
namespace tint::reader::wgsl {
|
||||
namespace {
|
||||
|
||||
bool is_blankspace(char c) {
|
||||
// See https://www.w3.org/TR/WGSL/#blankspace.
|
||||
return c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r';
|
||||
// Unicode parsing code assumes that the size of a single std::string element is
|
||||
// 1 byte.
|
||||
static_assert(sizeof(decltype(tint::Source::FileContent::data[0])) ==
|
||||
sizeof(uint8_t),
|
||||
"tint::reader::wgsl requires the size of a std::string element "
|
||||
"to be a single byte");
|
||||
|
||||
bool read_blankspace(std::string_view str,
|
||||
size_t i,
|
||||
bool* is_blankspace,
|
||||
size_t* blankspace_size) {
|
||||
// See https://www.w3.org/TR/WGSL/#blankspace
|
||||
|
||||
auto* utf8 = reinterpret_cast<const uint8_t*>(&str[i]);
|
||||
auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);
|
||||
|
||||
if (n == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static const auto kSpace = text::CodePoint(0x0020); // space
|
||||
static const auto kHTab = text::CodePoint(0x0009); // horizontal tab
|
||||
static const auto kL2R = text::CodePoint(0x200E); // left-to-right mark
|
||||
static const auto kR2L = text::CodePoint(0x200F); // right-to-left mark
|
||||
|
||||
if (cp == kSpace || cp == kHTab || cp == kL2R || cp == kR2L) {
|
||||
*is_blankspace = true;
|
||||
*blankspace_size = n;
|
||||
return true;
|
||||
}
|
||||
|
||||
*is_blankspace = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t dec_value(char c) {
|
||||
|
@ -181,11 +213,16 @@ Token Lexer::skip_blankspace_and_comments() {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (!is_blankspace(at(pos()))) {
|
||||
bool is_blankspace;
|
||||
size_t blankspace_size;
|
||||
if (!read_blankspace(line(), pos(), &is_blankspace, &blankspace_size)) {
|
||||
return {Token::Type::kError, begin_source(), "invalid UTF-8"};
|
||||
}
|
||||
if (!is_blankspace) {
|
||||
break;
|
||||
}
|
||||
|
||||
advance();
|
||||
advance(blankspace_size);
|
||||
}
|
||||
|
||||
auto t = skip_comment();
|
||||
|
@ -207,10 +244,8 @@ Token Lexer::skip_blankspace_and_comments() {
|
|||
|
||||
Token Lexer::skip_comment() {
|
||||
if (matches(pos(), "//")) {
|
||||
// Line comment: ignore everything until the end of input or a blankspace
|
||||
// character other than space or horizontal tab.
|
||||
while (!is_eol() && !(is_blankspace(at(pos())) && !matches(pos(), " ") &&
|
||||
!matches(pos(), "\t"))) {
|
||||
// Line comment: ignore everything until the end of line.
|
||||
while (!is_eol()) {
|
||||
if (is_null()) {
|
||||
return {Token::Type::kError, begin_source(), "null character found"};
|
||||
}
|
||||
|
@ -758,11 +793,6 @@ Token Lexer::try_ident() {
|
|||
auto source = begin_source();
|
||||
auto start = pos();
|
||||
|
||||
// This below assumes that the size of a single std::string element is 1 byte.
|
||||
static_assert(sizeof(at(0)) == sizeof(uint8_t),
|
||||
"tint::reader::wgsl requires the size of a std::string element "
|
||||
"to be a single byte");
|
||||
|
||||
// Must begin with an XID_Start unicode character, or underscore
|
||||
{
|
||||
auto* utf8 = reinterpret_cast<const uint8_t*>(&at(pos()));
|
||||
|
|
|
@ -23,6 +23,23 @@ namespace {
|
|||
|
||||
using LexerTest = testing::Test;
|
||||
|
||||
// Blankspace constants. These are macros on purpose to be able to easily build
|
||||
// up string literals with them.
|
||||
//
|
||||
// Each macro expands to a string literal holding the UTF-8 encoding of one
// code point, so adjacent macros concatenate into a single literal.
//
// Same line code points
|
||||
#define kSpace " "  // U+0020 space
|
||||
#define kHTab "\t"  // U+0009 horizontal tab
|
||||
#define kL2R "\xE2\x80\x8E"  // U+200E left-to-right mark (3 bytes)
|
||||
#define kR2L "\xE2\x80\x8F"  // U+200F right-to-left mark (3 bytes)
|
||||
// Line break code points
|
||||
#define kCR "\r"  // U+000D carriage return
|
||||
#define kLF "\n"  // U+000A line feed
|
||||
#define kVTab "\x0B"  // U+000B vertical tab
|
||||
#define kFF "\x0C"  // U+000C form feed
|
||||
#define kNL "\xC2\x85"  // U+0085 next line (2 bytes)
|
||||
#define kLS "\xE2\x80\xA8"  // U+2028 line separator (3 bytes)
|
||||
#define kPS "\xE2\x80\xA9"  // U+2029 paragraph separator (3 bytes)
|
||||
|
||||
TEST_F(LexerTest, Empty) {
|
||||
Source::File file("", "");
|
||||
Lexer l(&file);
|
||||
|
@ -30,7 +47,7 @@ TEST_F(LexerTest, Empty) {
|
|||
EXPECT_TRUE(t.IsEof());
|
||||
}
|
||||
|
||||
TEST_F(LexerTest, Skips_Blankspace) {
|
||||
TEST_F(LexerTest, Skips_Blankspace_Basic) {
|
||||
Source::File file("", "\t\r\n\t ident\t\n\t \r ");
|
||||
Lexer l(&file);
|
||||
|
||||
|
@ -46,6 +63,25 @@ TEST_F(LexerTest, Skips_Blankspace) {
|
|||
EXPECT_TRUE(t.IsEof());
|
||||
}
|
||||
|
||||
// Surrounds a single identifier with every non-ASCII-space blankspace code
// point. The five line breaks put the identifier on line 6, and the two
// three-byte direction marks that follow them put it at column 7.
TEST_F(LexerTest, Skips_Blankspace_Exotic) {
  Source::File file(
      "",
      kVTab kFF kNL kLS kPS kL2R kR2L "ident" kVTab kFF kNL kLS kPS kL2R kR2L);
  Lexer lexer(&file);

  // The leading blankspace is skipped and the identifier is reported with
  // its position.
  auto token = lexer.next();
  EXPECT_TRUE(token.IsIdentifier());
  EXPECT_EQ(token.source().range.begin.line, 6u);
  EXPECT_EQ(token.source().range.begin.column, 7u);
  EXPECT_EQ(token.source().range.end.line, 6u);
  EXPECT_EQ(token.source().range.end.column, 12u);
  EXPECT_EQ(token.to_str(), "ident");

  // The trailing blankspace is skipped too, leaving only end of file.
  token = lexer.next();
  EXPECT_TRUE(token.IsEof());
}
|
||||
|
||||
TEST_F(LexerTest, Skips_Comments_Line) {
|
||||
Source::File file("", R"(//starts with comment
|
||||
ident1 //ends with comment
|
||||
|
@ -73,11 +109,38 @@ ident1 //ends with comment
|
|||
EXPECT_TRUE(t.IsEof());
|
||||
}
|
||||
|
||||
using LineCommentTerminatorTest = testing::TestWithParam<char>;
|
||||
// Checks that line comments containing multi-byte UTF-8 characters (emoji)
// are skipped, and that the identifiers around them are still reported with
// the expected line and column positions.
TEST_F(LexerTest, Skips_Comments_Unicode) {
|
||||
Source::File file("", R"(// starts with 🙂🙂🙂
|
||||
ident1 //ends with 🙂🙂🙂
|
||||
// blank line
|
||||
ident2)");
|
||||
Lexer l(&file);
|
||||
|
||||
// First token: `ident1` on line 2, after the leading comment line.
auto t = l.next();
|
||||
EXPECT_TRUE(t.IsIdentifier());
|
||||
EXPECT_EQ(t.source().range.begin.line, 2u);
|
||||
EXPECT_EQ(t.source().range.begin.column, 1u);
|
||||
EXPECT_EQ(t.source().range.end.line, 2u);
|
||||
EXPECT_EQ(t.source().range.end.column, 7u);
|
||||
EXPECT_EQ(t.to_str(), "ident1");
|
||||
|
||||
// Second token: `ident2` on line 4, expected to start at column 2.
t = l.next();
|
||||
EXPECT_TRUE(t.IsIdentifier());
|
||||
EXPECT_EQ(t.source().range.begin.line, 4u);
|
||||
EXPECT_EQ(t.source().range.begin.column, 2u);
|
||||
EXPECT_EQ(t.source().range.end.line, 4u);
|
||||
EXPECT_EQ(t.source().range.end.column, 8u);
|
||||
EXPECT_EQ(t.to_str(), "ident2");
|
||||
|
||||
// Nothing but end of file remains.
t = l.next();
|
||||
EXPECT_TRUE(t.IsEof());
|
||||
}
|
||||
|
||||
using LineCommentTerminatorTest = testing::TestWithParam<const char*>;
|
||||
TEST_P(LineCommentTerminatorTest, Terminators) {
|
||||
// Test that line comments are ended by blankspace characters other than space
|
||||
// and horizontal tab.
|
||||
char c = GetParam();
|
||||
// Test that line comments are ended by blankspace characters other than
|
||||
// space, horizontal tab, left-to-right mark, and right-to-left mark.
|
||||
auto c = GetParam();
|
||||
std::string src = "let// This is a comment";
|
||||
src += c;
|
||||
src += "ident";
|
||||
|
@ -91,9 +154,13 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
|
|||
EXPECT_EQ(t.source().range.end.line, 1u);
|
||||
EXPECT_EQ(t.source().range.end.column, 4u);
|
||||
|
||||
if (c != ' ' && c != '\t') {
|
||||
size_t line = c == '\n' ? 2u : 1u;
|
||||
size_t col = c == '\n' ? 1u : 25u;
|
||||
auto is_same_line = [](std::string_view v) {
|
||||
return v == kSpace || v == kHTab || v == kL2R || v == kR2L;
|
||||
};
|
||||
|
||||
if (!is_same_line(c)) {
|
||||
size_t line = is_same_line(c) ? 1u : 2u;
|
||||
size_t col = is_same_line(c) ? 25u : 1u;
|
||||
t = l.next();
|
||||
EXPECT_TRUE(t.IsIdentifier());
|
||||
EXPECT_EQ(t.source().range.begin.line, line);
|
||||
|
@ -108,7 +175,20 @@ TEST_P(LineCommentTerminatorTest, Terminators) {
|
|||
}
|
||||
// Runs LineCommentTerminatorTest once for every blankspace sequence, passed
// as a UTF-8 encoded C string. The values are grouped by whether a line
// comment continues past them or is terminated by them. Carriage return
// belongs to the second group: the test expects the token that follows it on
// line 2, column 1.
INSTANTIATE_TEST_SUITE_P(LexerTest,
                         LineCommentTerminatorTest,
                         testing::Values(
                             // same line (the comment continues)
                             kSpace,
                             kHTab,
                             kL2R,
                             kR2L,
                             // line break (the comment ends)
                             kCR,
                             kLF,
                             kVTab,
                             kFF,
                             kNL,
                             kLS,
                             kPS));
|
||||
|
||||
TEST_F(LexerTest, Skips_Comments_Block) {
|
||||
Source::File file("", R"(/* comment
|
||||
|
|
|
@ -19,21 +19,82 @@
|
|||
#include <string_view>
|
||||
#include <utility>
|
||||
|
||||
#include "src/tint/text/unicode.h"
|
||||
|
||||
namespace tint {
|
||||
namespace {
|
||||
|
||||
bool ParseLineBreak(std::string_view str,
|
||||
size_t i,
|
||||
bool* is_line_break,
|
||||
size_t* line_break_size) {
|
||||
// See https://www.w3.org/TR/WGSL/#blankspace
|
||||
|
||||
auto* utf8 = reinterpret_cast<const uint8_t*>(&str[i]);
|
||||
auto [cp, n] = text::utf8::Decode(utf8, str.size() - i);
|
||||
|
||||
if (n == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static const auto kLF = text::CodePoint(0x000A); // line feed
|
||||
static const auto kVTab = text::CodePoint(0x000B); // vertical tab
|
||||
static const auto kFF = text::CodePoint(0x000C); // form feed
|
||||
static const auto kNL = text::CodePoint(0x0085); // next line
|
||||
static const auto kCR = text::CodePoint(0x000D); // carriage return
|
||||
static const auto kLS = text::CodePoint(0x2028); // line separator
|
||||
static const auto kPS = text::CodePoint(0x2029); // parargraph separator
|
||||
|
||||
if (cp == kLF || cp == kVTab || cp == kFF || cp == kNL || cp == kPS ||
|
||||
cp == kLS) {
|
||||
*is_line_break = true;
|
||||
*line_break_size = n;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Handle CRLF as one line break, and CR alone as one line break
|
||||
if (cp == kCR) {
|
||||
*is_line_break = true;
|
||||
*line_break_size = n;
|
||||
|
||||
if (auto next_i = i + n; next_i < str.size()) {
|
||||
auto* next_utf8 = reinterpret_cast<const uint8_t*>(&str[next_i]);
|
||||
auto [next_cp, next_n] =
|
||||
text::utf8::Decode(next_utf8, str.size() - next_i);
|
||||
|
||||
if (next_n == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (next_cp == kLF) {
|
||||
// CRLF as one break
|
||||
*line_break_size = n + next_n;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
*is_line_break = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::string_view> SplitLines(std::string_view str) {
|
||||
std::vector<std::string_view> lines;
|
||||
|
||||
size_t lineStart = 0;
|
||||
for (size_t i = 0; i < str.size(); ++i) {
|
||||
if (str[i] == '\n') {
|
||||
// Handle CRLF on Windows
|
||||
size_t curr = i;
|
||||
if (i > 0 && str[i - 1] == '\r') {
|
||||
--curr;
|
||||
}
|
||||
lines.push_back(str.substr(lineStart, curr - lineStart));
|
||||
lineStart = i + 1;
|
||||
for (size_t i = 0; i < str.size();) {
|
||||
bool is_line_break{};
|
||||
size_t line_break_size{};
|
||||
// We don't handle decode errors from ParseLineBreak. Instead, we rely on
|
||||
// the Lexer to do so.
|
||||
ParseLineBreak(str, i, &is_line_break, &line_break_size);
|
||||
if (is_line_break) {
|
||||
lines.push_back(str.substr(lineStart, i - lineStart));
|
||||
i += line_break_size;
|
||||
lineStart = i;
|
||||
} else {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
if (lineStart < str.size()) {
|
||||
|
|
|
@ -62,5 +62,42 @@ TEST_F(SourceFileContentTest, MoveCtor) {
|
|||
EXPECT_EQ(fc.lines[2], "line three");
|
||||
}
|
||||
|
||||
// Line break code points
|
||||
// Each macro expands to a string literal holding the UTF-8 encoding of one
// code point, so adjacent macros (e.g. kCR kLF) concatenate into one literal.
#define kCR "\r"  // U+000D carriage return
|
||||
#define kLF "\n"  // U+000A line feed
|
||||
#define kVTab "\x0B"  // U+000B vertical tab
|
||||
#define kFF "\x0C"  // U+000C form feed
|
||||
#define kNL "\xC2\x85"  // U+0085 next line (2 bytes)
|
||||
#define kLS "\xE2\x80\xA8"  // U+2028 line separator (3 bytes)
|
||||
#define kPS "\xE2\x80\xA9"  // U+2029 paragraph separator (3 bytes)
|
||||
|
||||
using LineBreakTest = testing::TestWithParam<const char*>;
|
||||
// One line break sequence between two pieces of text must produce exactly
// two lines, neither of which contains the break itself.
TEST_P(LineBreakTest, Single) {
  std::string text("line one");
  text.append(GetParam());
  text.append("line two");

  Source::FileContent content(text);

  EXPECT_EQ(content.lines.size(), 2u);
  EXPECT_EQ(content.lines[0], "line one");
  EXPECT_EQ(content.lines[1], "line two");
}
|
||||
// Two consecutive line break sequences must produce an empty line between
// the two pieces of text, for three lines in total.
TEST_P(LineBreakTest, Double) {
  std::string text("line one");
  text.append(GetParam());
  text.append(GetParam());
  text.append("line two");

  Source::FileContent content(text);

  EXPECT_EQ(content.lines.size(), 3u);
  EXPECT_EQ(content.lines[0], "line one");
  EXPECT_EQ(content.lines[1], "");
  EXPECT_EQ(content.lines[2], "line two");
}
|
||||
// Runs every LineBreakTest case once per line break sequence: the seven
// single code points, plus carriage return followed by line feed.
INSTANTIATE_TEST_SUITE_P(SourceFileContentTest,
                         LineBreakTest,
                         testing::Values(kVTab,
                                         kFF,
                                         kNL,
                                         kLS,
                                         kPS,
                                         kLF,
                                         kCR,
                                         kCR kLF));
|
||||
|
||||
} // namespace
|
||||
} // namespace tint
|
||||
|
|
Loading…
Reference in New Issue