diff --git a/src/tint/reader/wgsl/lexer.cc b/src/tint/reader/wgsl/lexer.cc index 9881b40601..ea01d8953b 100644 --- a/src/tint/reader/wgsl/lexer.cc +++ b/src/tint/reader/wgsl/lexer.cc @@ -18,6 +18,8 @@ #include #include #include +#include // NOLINT(build/include_order) +#include #include #include "src/tint/debug.h" @@ -26,9 +28,39 @@ namespace tint::reader::wgsl { namespace { -bool is_blankspace(char c) { - // See https://www.w3.org/TR/WGSL/#blankspace. - return c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r'; +// Unicode parsing code assumes that the size of a single std::string element is +// 1 byte. +static_assert(sizeof(decltype(tint::Source::FileContent::data[0])) == + sizeof(uint8_t), + "tint::reader::wgsl requires the size of a std::string element " + "to be a single byte"); + +bool read_blankspace(std::string_view str, + size_t i, + bool* is_blankspace, + size_t* blankspace_size) { + // See https://www.w3.org/TR/WGSL/#blankspace + + auto* utf8 = reinterpret_cast(&str[i]); + auto [cp, n] = text::utf8::Decode(utf8, str.size() - i); + + if (n == 0) { + return false; + } + + static const auto kSpace = text::CodePoint(0x0020); // space + static const auto kHTab = text::CodePoint(0x0009); // horizontal tab + static const auto kL2R = text::CodePoint(0x200E); // left-to-right mark + static const auto kR2L = text::CodePoint(0x200F); // right-to-left mark + + if (cp == kSpace || cp == kHTab || cp == kL2R || cp == kR2L) { + *is_blankspace = true; + *blankspace_size = n; + return true; + } + + *is_blankspace = false; + return true; } uint32_t dec_value(char c) { @@ -181,11 +213,16 @@ Token Lexer::skip_blankspace_and_comments() { continue; } - if (!is_blankspace(at(pos()))) { + bool is_blankspace; + size_t blankspace_size; + if (!read_blankspace(line(), pos(), &is_blankspace, &blankspace_size)) { + return {Token::Type::kError, begin_source(), "invalid UTF-8"}; + } + if (!is_blankspace) { break; } - advance(); + advance(blankspace_size); } auto t = 
skip_comment(); @@ -207,10 +244,8 @@ Token Lexer::skip_blankspace_and_comments() { Token Lexer::skip_comment() { if (matches(pos(), "//")) { - // Line comment: ignore everything until the end of input or a blankspace - // character other than space or horizontal tab. - while (!is_eol() && !(is_blankspace(at(pos())) && !matches(pos(), " ") && - !matches(pos(), "\t"))) { + // Line comment: ignore everything until the end of line. + while (!is_eol()) { if (is_null()) { return {Token::Type::kError, begin_source(), "null character found"}; } @@ -758,11 +793,6 @@ Token Lexer::try_ident() { auto source = begin_source(); auto start = pos(); - // This below assumes that the size of a single std::string element is 1 byte. - static_assert(sizeof(at(0)) == sizeof(uint8_t), - "tint::reader::wgsl requires the size of a std::string element " - "to be a single byte"); - // Must begin with an XID_Source unicode character, or underscore { auto* utf8 = reinterpret_cast(&at(pos())); diff --git a/src/tint/reader/wgsl/lexer_test.cc b/src/tint/reader/wgsl/lexer_test.cc index 6ea313eaed..de65fcbc85 100644 --- a/src/tint/reader/wgsl/lexer_test.cc +++ b/src/tint/reader/wgsl/lexer_test.cc @@ -23,6 +23,23 @@ namespace { using LexerTest = testing::Test; +// Blankspace constants. These are macros on purpose to be able to easily build +// up string literals with them. 
+// +// Same line code points +#define kSpace " " +#define kHTab "\t" +#define kL2R "\xE2\x80\x8E" +#define kR2L "\xE2\x80\x8F" +// Line break code points +#define kCR "\r" +#define kLF "\n" +#define kVTab "\x0B" +#define kFF "\x0C" +#define kNL "\xC2\x85" +#define kLS "\xE2\x80\xA8" +#define kPS "\xE2\x80\xA9" + TEST_F(LexerTest, Empty) { Source::File file("", ""); Lexer l(&file); @@ -30,7 +47,7 @@ TEST_F(LexerTest, Empty) { EXPECT_TRUE(t.IsEof()); } -TEST_F(LexerTest, Skips_Blankspace) { +TEST_F(LexerTest, Skips_Blankspace_Basic) { Source::File file("", "\t\r\n\t ident\t\n\t \r "); Lexer l(&file); @@ -46,6 +63,25 @@ TEST_F(LexerTest, Skips_Blankspace) { EXPECT_TRUE(t.IsEof()); } +TEST_F(LexerTest, Skips_Blankspace_Exotic) { + Source::File file("", // + kVTab kFF kNL kLS kPS kL2R kR2L // + "ident" // + kVTab kFF kNL kLS kPS kL2R kR2L); + Lexer l(&file); + + auto t = l.next(); + EXPECT_TRUE(t.IsIdentifier()); + EXPECT_EQ(t.source().range.begin.line, 6u); + EXPECT_EQ(t.source().range.begin.column, 7u); + EXPECT_EQ(t.source().range.end.line, 6u); + EXPECT_EQ(t.source().range.end.column, 12u); + EXPECT_EQ(t.to_str(), "ident"); + + t = l.next(); + EXPECT_TRUE(t.IsEof()); +} + TEST_F(LexerTest, Skips_Comments_Line) { Source::File file("", R"(//starts with comment ident1 //ends with comment @@ -73,11 +109,38 @@ ident1 //ends with comment EXPECT_TRUE(t.IsEof()); } -using LineCommentTerminatorTest = testing::TestWithParam; +TEST_F(LexerTest, Skips_Comments_Unicode) { + Source::File file("", R"(// starts with 🙂🙂🙂 +ident1 //ends with 🙂🙂🙂 +// blank line + ident2)"); + Lexer l(&file); + + auto t = l.next(); + EXPECT_TRUE(t.IsIdentifier()); + EXPECT_EQ(t.source().range.begin.line, 2u); + EXPECT_EQ(t.source().range.begin.column, 1u); + EXPECT_EQ(t.source().range.end.line, 2u); + EXPECT_EQ(t.source().range.end.column, 7u); + EXPECT_EQ(t.to_str(), "ident1"); + + t = l.next(); + EXPECT_TRUE(t.IsIdentifier()); + EXPECT_EQ(t.source().range.begin.line, 4u); + 
EXPECT_EQ(t.source().range.begin.column, 2u); + EXPECT_EQ(t.source().range.end.line, 4u); + EXPECT_EQ(t.source().range.end.column, 8u); + EXPECT_EQ(t.to_str(), "ident2"); + + t = l.next(); + EXPECT_TRUE(t.IsEof()); +} + +using LineCommentTerminatorTest = testing::TestWithParam; TEST_P(LineCommentTerminatorTest, Terminators) { - // Test that line comments are ended by blankspace characters other than space - // and horizontal tab. - char c = GetParam(); + // Test that line comments are ended by blankspace characters other than + // space, horizontal tab, left-to-right mark, and right-to-left mark. + auto c = GetParam(); std::string src = "let// This is a comment"; src += c; src += "ident"; @@ -91,9 +154,13 @@ TEST_P(LineCommentTerminatorTest, Terminators) { EXPECT_EQ(t.source().range.end.line, 1u); EXPECT_EQ(t.source().range.end.column, 4u); - if (c != ' ' && c != '\t') { - size_t line = c == '\n' ? 2u : 1u; - size_t col = c == '\n' ? 1u : 25u; + auto is_same_line = [](std::string_view v) { + return v == kSpace || v == kHTab || v == kL2R || v == kR2L; + }; + + if (!is_same_line(c)) { + size_t line = is_same_line(c) ? 1u : 2u; + size_t col = is_same_line(c) ? 
25u : 1u; t = l.next(); EXPECT_TRUE(t.IsIdentifier()); EXPECT_EQ(t.source().range.begin.line, line); @@ -108,7 +175,20 @@ TEST_P(LineCommentTerminatorTest, Terminators) { } INSTANTIATE_TEST_SUITE_P(LexerTest, LineCommentTerminatorTest, - testing::Values(' ', '\t', '\n', '\v', '\f', '\r')); + testing::Values( + // same line + kSpace, + kHTab, + kCR, + kL2R, + kR2L, + // line break + kLF, + kVTab, + kFF, + kNL, + kLS, + kPS)); TEST_F(LexerTest, Skips_Comments_Block) { Source::File file("", R"(/* comment diff --git a/src/tint/source.cc b/src/tint/source.cc index 9a172e3a79..a6749311ca 100644 --- a/src/tint/source.cc +++ b/src/tint/source.cc @@ -19,21 +19,82 @@ #include #include +#include "src/tint/text/unicode.h" + namespace tint { namespace { + +bool ParseLineBreak(std::string_view str, + size_t i, + bool* is_line_break, + size_t* line_break_size) { + // See https://www.w3.org/TR/WGSL/#blankspace + + auto* utf8 = reinterpret_cast(&str[i]); + auto [cp, n] = text::utf8::Decode(utf8, str.size() - i); + + if (n == 0) { + return false; + } + + static const auto kLF = text::CodePoint(0x000A); // line feed + static const auto kVTab = text::CodePoint(0x000B); // vertical tab + static const auto kFF = text::CodePoint(0x000C); // form feed + static const auto kNL = text::CodePoint(0x0085); // next line + static const auto kCR = text::CodePoint(0x000D); // carriage return + static const auto kLS = text::CodePoint(0x2028); // line separator + static const auto kPS = text::CodePoint(0x2029); // paragraph separator + + if (cp == kLF || cp == kVTab || cp == kFF || cp == kNL || cp == kPS || + cp == kLS) { + *is_line_break = true; + *line_break_size = n; + return true; + } + + // Handle CRLF as one line break, and CR alone as one line break + if (cp == kCR) { + *is_line_break = true; + *line_break_size = n; + + if (auto next_i = i + n; next_i < str.size()) { + auto* next_utf8 = reinterpret_cast(&str[next_i]); + auto [next_cp, next_n] = + text::utf8::Decode(next_utf8, str.size() - 
next_i); + + if (next_n == 0) { + return false; + } + + if (next_cp == kLF) { + // CRLF as one break + *line_break_size = n + next_n; + } + } + + return true; + } + + *is_line_break = false; + return true; +} + std::vector SplitLines(std::string_view str) { std::vector lines; size_t lineStart = 0; - for (size_t i = 0; i < str.size(); ++i) { - if (str[i] == '\n') { - // Handle CRLF on Windows - size_t curr = i; - if (i > 0 && str[i - 1] == '\r') { - --curr; - } - lines.push_back(str.substr(lineStart, curr - lineStart)); - lineStart = i + 1; + for (size_t i = 0; i < str.size();) { + bool is_line_break{}; + size_t line_break_size{}; + // We don't handle decode errors from ParseLineBreak. Instead, we rely on + // the Lexer to do so. + ParseLineBreak(str, i, &is_line_break, &line_break_size); + if (is_line_break) { + lines.push_back(str.substr(lineStart, i - lineStart)); + i += line_break_size; + lineStart = i; + } else { + ++i; } } if (lineStart < str.size()) { diff --git a/src/tint/source_test.cc b/src/tint/source_test.cc index a3b9825095..c52231e04e 100644 --- a/src/tint/source_test.cc +++ b/src/tint/source_test.cc @@ -62,5 +62,42 @@ TEST_F(SourceFileContentTest, MoveCtor) { EXPECT_EQ(fc.lines[2], "line three"); } +// Line break code points +#define kCR "\r" +#define kLF "\n" +#define kVTab "\x0B" +#define kFF "\x0C" +#define kNL "\xC2\x85" +#define kLS "\xE2\x80\xA8" +#define kPS "\xE2\x80\xA9" + +using LineBreakTest = testing::TestWithParam; +TEST_P(LineBreakTest, Single) { + std::string src = "line one"; + src += GetParam(); + src += "line two"; + + Source::FileContent fc(src); + EXPECT_EQ(fc.lines.size(), 2u); + EXPECT_EQ(fc.lines[0], "line one"); + EXPECT_EQ(fc.lines[1], "line two"); +} +TEST_P(LineBreakTest, Double) { + std::string src = "line one"; + src += GetParam(); + src += GetParam(); + src += "line two"; + + Source::FileContent fc(src); + EXPECT_EQ(fc.lines.size(), 3u); + EXPECT_EQ(fc.lines[0], "line one"); + EXPECT_EQ(fc.lines[1], ""); + 
EXPECT_EQ(fc.lines[2], "line two"); +} +INSTANTIATE_TEST_SUITE_P( + SourceFileContentTest, + LineBreakTest, + testing::Values(kVTab, kFF, kNL, kLS, kPS, kLF, kCR, kCR kLF)); + } // namespace } // namespace tint