Error on invalid UTF-8 sequences
Invalid UTF-8 was being best-effort consumed, which given the right sequence of brokenness, could end up with diagnostic locations referring to bytes beyond the end of a line. Improve the UTF-8 decoding so that it can detect when multi-byte codepoints are missing the high-bit being set. Actually detect this in a lexer, and parser and produce errors. Bug: tint:1437 Bug: chromium:1305648 Change-Id: I459f0df840b4ce8c4f5f82363f93602bf8326984 Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/83540 Kokoro: Kokoro <noreply+kokoro@google.com> Reviewed-by: Antonio Maiorano <amaiorano@google.com> Commit-Queue: Ben Clayton <bclayton@google.com>
This commit is contained in:
parent
93baaae60b
commit
64b775419d
|
@ -739,6 +739,10 @@ Token Lexer::try_ident() {
|
||||||
auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
|
auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
|
||||||
auto [code_point, n] =
|
auto [code_point, n] =
|
||||||
text::utf8::Decode(utf8, file_->content.data.size() - pos_);
|
text::utf8::Decode(utf8, file_->content.data.size() - pos_);
|
||||||
|
if (n == 0) {
|
||||||
|
pos_++; // Skip the bad byte.
|
||||||
|
return {Token::Type::kError, source, "invalid UTF-8"};
|
||||||
|
}
|
||||||
if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) {
|
if (code_point != text::CodePoint('_') && !code_point.IsXIDStart()) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
@ -752,6 +756,10 @@ Token Lexer::try_ident() {
|
||||||
auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
|
auto* utf8 = reinterpret_cast<const uint8_t*>(&file_->content.data[pos_]);
|
||||||
auto [code_point, n] =
|
auto [code_point, n] =
|
||||||
text::utf8::Decode(utf8, file_->content.data.size() - pos_);
|
text::utf8::Decode(utf8, file_->content.data.size() - pos_);
|
||||||
|
if (n == 0) {
|
||||||
|
pos_++; // Skip the bad byte.
|
||||||
|
return {Token::Type::kError, source, "invalid UTF-8"};
|
||||||
|
}
|
||||||
if (!code_point.IsXIDContinue()) {
|
if (!code_point.IsXIDContinue()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -344,11 +344,11 @@ INSTANTIATE_TEST_SUITE_P(LexerTest,
|
||||||
|
|
||||||
struct UnicodeCase {
|
struct UnicodeCase {
|
||||||
const char* utf8;
|
const char* utf8;
|
||||||
size_t code_units;
|
size_t count;
|
||||||
};
|
};
|
||||||
|
|
||||||
using UnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
|
using ValidUnicodeIdentifierTest = testing::TestWithParam<UnicodeCase>;
|
||||||
TEST_P(UnicodeIdentifierTest, Parse) {
|
TEST_P(ValidUnicodeIdentifierTest, Parse) {
|
||||||
Source::File file("", GetParam().utf8);
|
Source::File file("", GetParam().utf8);
|
||||||
Lexer l(&file);
|
Lexer l(&file);
|
||||||
|
|
||||||
|
@ -357,12 +357,12 @@ TEST_P(UnicodeIdentifierTest, Parse) {
|
||||||
EXPECT_EQ(t.source().range.begin.line, 1u);
|
EXPECT_EQ(t.source().range.begin.line, 1u);
|
||||||
EXPECT_EQ(t.source().range.begin.column, 1u);
|
EXPECT_EQ(t.source().range.begin.column, 1u);
|
||||||
EXPECT_EQ(t.source().range.end.line, 1u);
|
EXPECT_EQ(t.source().range.end.line, 1u);
|
||||||
EXPECT_EQ(t.source().range.end.column, 1u + GetParam().code_units);
|
EXPECT_EQ(t.source().range.end.column, 1u + GetParam().count);
|
||||||
EXPECT_EQ(t.to_str(), GetParam().utf8);
|
EXPECT_EQ(t.to_str(), GetParam().utf8);
|
||||||
}
|
}
|
||||||
INSTANTIATE_TEST_SUITE_P(
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
LexerTest,
|
LexerTest,
|
||||||
UnicodeIdentifierTest,
|
ValidUnicodeIdentifierTest,
|
||||||
testing::ValuesIn({
|
testing::ValuesIn({
|
||||||
UnicodeCase{// "𝐢𝐝𝐞𝐧𝐭𝐢𝐟𝐢𝐞𝐫"
|
UnicodeCase{// "𝐢𝐝𝐞𝐧𝐭𝐢𝐟𝐢𝐞𝐫"
|
||||||
"\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d"
|
"\xf0\x9d\x90\xa2\xf0\x9d\x90\x9d\xf0\x9d\x90\x9e\xf0\x9d"
|
||||||
|
@ -393,6 +393,56 @@ INSTANTIATE_TEST_SUITE_P(
|
||||||
43},
|
43},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
using InvalidUnicodeIdentifierTest = testing::TestWithParam<const char*>;
|
||||||
|
TEST_P(InvalidUnicodeIdentifierTest, Parse) {
|
||||||
|
Source::File file("", GetParam());
|
||||||
|
Lexer l(&file);
|
||||||
|
|
||||||
|
auto t = l.next();
|
||||||
|
EXPECT_TRUE(t.IsError());
|
||||||
|
EXPECT_EQ(t.source().range.begin.line, 1u);
|
||||||
|
EXPECT_EQ(t.source().range.begin.column, 1u);
|
||||||
|
EXPECT_EQ(t.source().range.end.line, 1u);
|
||||||
|
EXPECT_EQ(t.source().range.end.column, 1u);
|
||||||
|
EXPECT_EQ(t.to_str(), "invalid UTF-8");
|
||||||
|
}
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
|
LexerTest,
|
||||||
|
InvalidUnicodeIdentifierTest,
|
||||||
|
testing::ValuesIn({
|
||||||
|
"\x80\x80\x80\x80", // 10000000
|
||||||
|
"\x81\x80\x80\x80", // 10000001
|
||||||
|
"\x8f\x80\x80\x80", // 10001111
|
||||||
|
"\x90\x80\x80\x80", // 10010000
|
||||||
|
"\x91\x80\x80\x80", // 10010001
|
||||||
|
"\x9f\x80\x80\x80", // 10011111
|
||||||
|
"\xa0\x80\x80\x80", // 10100000
|
||||||
|
"\xa1\x80\x80\x80", // 10100001
|
||||||
|
"\xaf\x80\x80\x80", // 10101111
|
||||||
|
"\xb0\x80\x80\x80", // 10110000
|
||||||
|
"\xb1\x80\x80\x80", // 10110001
|
||||||
|
"\xbf\x80\x80\x80", // 10111111
|
||||||
|
"\xc0\x80\x80\x80", // 11000000
|
||||||
|
"\xc1\x80\x80\x80", // 11000001
|
||||||
|
"\xf5\x80\x80\x80", // 11110101
|
||||||
|
"\xf6\x80\x80\x80", // 11110110
|
||||||
|
"\xf7\x80\x80\x80", // 11110111
|
||||||
|
"\xf8\x80\x80\x80", // 11111000
|
||||||
|
"\xfe\x80\x80\x80", // 11111110
|
||||||
|
"\xff\x80\x80\x80", // 11111111
|
||||||
|
|
||||||
|
"\xd0", // 2-bytes, missing second byte
|
||||||
|
"\xe8\x8f", // 3-bytes, missing third byte
|
||||||
|
"\xf4\x8f\x8f", // 4-bytes, missing fourth byte
|
||||||
|
|
||||||
|
"\xd0\x7f", // 2-bytes, second byte MSB unset
|
||||||
|
"\xe8\x7f\x8f", // 3-bytes, second byte MSB unset
|
||||||
|
"\xe8\x8f\x7f", // 3-bytes, third byte MSB unset
|
||||||
|
"\xf4\x7f\x8f\x8f", // 4-bytes, second byte MSB unset
|
||||||
|
"\xf4\x8f\x7f\x8f", // 4-bytes, third byte MSB unset
|
||||||
|
"\xf4\x8f\x8f\x7f", // 4-bytes, fourth byte MSB unset
|
||||||
|
}));
|
||||||
|
|
||||||
TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) {
|
TEST_F(LexerTest, IdentifierTest_SingleUnderscoreDoesNotMatch) {
|
||||||
Source::File file("", "_");
|
Source::File file("", "_");
|
||||||
Lexer l(&file);
|
Lexer l(&file);
|
||||||
|
|
|
@ -303,8 +303,9 @@ Token ParserImpl::next() {
|
||||||
}
|
}
|
||||||
|
|
||||||
Token ParserImpl::peek(size_t idx) {
|
Token ParserImpl::peek(size_t idx) {
|
||||||
while (token_queue_.size() < (idx + 1))
|
while (token_queue_.size() < (idx + 1)) {
|
||||||
token_queue_.push_back(lexer_->next());
|
token_queue_.push_back(lexer_->next());
|
||||||
|
}
|
||||||
return token_queue_[idx];
|
return token_queue_[idx];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -448,9 +449,8 @@ Expect<bool> ParserImpl::expect_global_decl() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// The token might itself be an error.
|
// The token might itself be an error.
|
||||||
if (t.IsError()) {
|
if (handle_error(t)) {
|
||||||
next(); // Consume it.
|
return Failure::kErrored;
|
||||||
return add_error(t.source(), t.to_str());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exhausted all attempts to make sense of where we're at.
|
// Exhausted all attempts to make sense of where we're at.
|
||||||
|
@ -2746,9 +2746,6 @@ Maybe<const ast::AssignmentStatement*> ParserImpl::assignment_stmt() {
|
||||||
// | FALSE
|
// | FALSE
|
||||||
Maybe<const ast::LiteralExpression*> ParserImpl::const_literal() {
|
Maybe<const ast::LiteralExpression*> ParserImpl::const_literal() {
|
||||||
auto t = peek();
|
auto t = peek();
|
||||||
if (t.IsError()) {
|
|
||||||
return add_error(t.source(), t.to_str());
|
|
||||||
}
|
|
||||||
if (match(Token::Type::kTrue)) {
|
if (match(Token::Type::kTrue)) {
|
||||||
return create<ast::BoolLiteralExpression>(t.source(), true);
|
return create<ast::BoolLiteralExpression>(t.source(), true);
|
||||||
}
|
}
|
||||||
|
@ -2764,6 +2761,9 @@ Maybe<const ast::LiteralExpression*> ParserImpl::const_literal() {
|
||||||
if (match(Token::Type::kFloatLiteral)) {
|
if (match(Token::Type::kFloatLiteral)) {
|
||||||
return create<ast::FloatLiteralExpression>(t.source(), t.to_f32());
|
return create<ast::FloatLiteralExpression>(t.source(), t.to_f32());
|
||||||
}
|
}
|
||||||
|
if (handle_error(t)) {
|
||||||
|
return Failure::kErrored;
|
||||||
|
}
|
||||||
return Failure::kNoMatch;
|
return Failure::kNoMatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3165,13 +3165,18 @@ bool ParserImpl::expect(std::string_view use, Token::Type tok) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Error cases
|
||||||
|
synchronized_ = false;
|
||||||
|
if (handle_error(t)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
std::stringstream err;
|
std::stringstream err;
|
||||||
err << "expected '" << Token::TypeToName(tok) << "'";
|
err << "expected '" << Token::TypeToName(tok) << "'";
|
||||||
if (!use.empty()) {
|
if (!use.empty()) {
|
||||||
err << " for " << use;
|
err << " for " << use;
|
||||||
}
|
}
|
||||||
add_error(t, err.str());
|
add_error(t, err.str());
|
||||||
synchronized_ = false;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3220,6 +3225,9 @@ Expect<std::string> ParserImpl::expect_ident(std::string_view use) {
|
||||||
|
|
||||||
return {t.to_str(), t.source()};
|
return {t.to_str(), t.source()};
|
||||||
}
|
}
|
||||||
|
if (handle_error(t)) {
|
||||||
|
return Failure::kErrored;
|
||||||
|
}
|
||||||
synchronized_ = false;
|
synchronized_ = false;
|
||||||
return add_error(t.source(), "expected identifier", use);
|
return add_error(t.source(), "expected identifier", use);
|
||||||
}
|
}
|
||||||
|
@ -3342,6 +3350,16 @@ bool ParserImpl::is_sync_token(const Token& t) const {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ParserImpl::handle_error(const Token& t) {
|
||||||
|
// The token might itself be an error.
|
||||||
|
if (t.IsError()) {
|
||||||
|
synchronized_ = false;
|
||||||
|
add_error(t.source(), t.to_str());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename F, typename T>
|
template <typename F, typename T>
|
||||||
T ParserImpl::without_error(F&& body) {
|
T ParserImpl::without_error(F&& body) {
|
||||||
silence_errors_++;
|
silence_errors_++;
|
||||||
|
|
|
@ -828,6 +828,12 @@ class ParserImpl {
|
||||||
/// @see sync().
|
/// @see sync().
|
||||||
bool is_sync_token(const Token& t) const;
|
bool is_sync_token(const Token& t) const;
|
||||||
|
|
||||||
|
/// If `t` is an error token, then `synchronized_` is set to false and the
|
||||||
|
/// token's error is appended to the builder's diagnostics. If `t` is not an
|
||||||
|
/// error token, then this function does nothing and false is returned.
|
||||||
|
/// @returns true if `t` is an error, otherwise false.
|
||||||
|
bool handle_error(const Token& t);
|
||||||
|
|
||||||
/// @returns true if #synchronized_ is true and the number of reported errors
|
/// @returns true if #synchronized_ is true and the number of reported errors
|
||||||
/// is less than #max_errors_.
|
/// is less than #max_errors_.
|
||||||
bool continue_parsing() {
|
bool continue_parsing() {
|
||||||
|
|
|
@ -1689,6 +1689,12 @@ fn f() { return 1 ^ >; }
|
||||||
)");
|
)");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(ParserImplErrorTest, InvalidUTF8) {
|
||||||
|
EXPECT("fn fu\xd0nc() {}",
|
||||||
|
"test.wgsl:1:4 error: invalid UTF-8\n"
|
||||||
|
"fn fu\xD0nc() {}\n");
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace wgsl
|
} // namespace wgsl
|
||||||
} // namespace reader
|
} // namespace reader
|
||||||
|
|
|
@ -468,27 +468,35 @@ std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len) {
|
||||||
|
|
||||||
CodePoint c;
|
CodePoint c;
|
||||||
|
|
||||||
|
uint8_t valid = 0x80;
|
||||||
switch (n) {
|
switch (n) {
|
||||||
// Note: n=0 (invalid) is correctly handled without a case.
|
// Note: n=0 (invalid) is correctly handled without a case.
|
||||||
case 1:
|
case 1:
|
||||||
c = CodePoint{ptr[0]};
|
c = CodePoint{ptr[0]};
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
valid &= ptr[1];
|
||||||
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00011111) << 6) |
|
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00011111) << 6) |
|
||||||
(static_cast<uint32_t>(ptr[1] & 0b00111111))};
|
(static_cast<uint32_t>(ptr[1] & 0b00111111))};
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
|
valid &= ptr[1] & ptr[2];
|
||||||
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00001111) << 12) |
|
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00001111) << 12) |
|
||||||
(static_cast<uint32_t>(ptr[1] & 0b00111111) << 6) |
|
(static_cast<uint32_t>(ptr[1] & 0b00111111) << 6) |
|
||||||
(static_cast<uint32_t>(ptr[2] & 0b00111111))};
|
(static_cast<uint32_t>(ptr[2] & 0b00111111))};
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
|
valid &= ptr[1] & ptr[2] & ptr[3];
|
||||||
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00000111) << 18) |
|
c = CodePoint{(static_cast<uint32_t>(ptr[0] & 0b00000111) << 18) |
|
||||||
(static_cast<uint32_t>(ptr[1] & 0b00111111) << 12) |
|
(static_cast<uint32_t>(ptr[1] & 0b00111111) << 12) |
|
||||||
(static_cast<uint32_t>(ptr[2] & 0b00111111) << 6) |
|
(static_cast<uint32_t>(ptr[2] & 0b00111111) << 6) |
|
||||||
(static_cast<uint32_t>(ptr[3] & 0b00111111))};
|
(static_cast<uint32_t>(ptr[3] & 0b00111111))};
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (!valid) {
|
||||||
|
n = 0;
|
||||||
|
c = 0;
|
||||||
|
}
|
||||||
return {c, n};
|
return {c, n};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -455,7 +455,8 @@ TEST_P(DecodeUTF8InvalidTest, Invalid) {
|
||||||
EXPECT_EQ(width, 0u);
|
EXPECT_EQ(width, 0u);
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(Invalid,
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
|
Invalid,
|
||||||
DecodeUTF8InvalidTest,
|
DecodeUTF8InvalidTest,
|
||||||
::testing::ValuesIn({
|
::testing::ValuesIn({
|
||||||
"\x80\x80\x80\x80", // 10000000
|
"\x80\x80\x80\x80", // 10000000
|
||||||
|
@ -478,6 +479,17 @@ INSTANTIATE_TEST_SUITE_P(Invalid,
|
||||||
"\xf8\x80\x80\x80", // 11111000
|
"\xf8\x80\x80\x80", // 11111000
|
||||||
"\xfe\x80\x80\x80", // 11111110
|
"\xfe\x80\x80\x80", // 11111110
|
||||||
"\xff\x80\x80\x80", // 11111111
|
"\xff\x80\x80\x80", // 11111111
|
||||||
|
|
||||||
|
"\xd0", // 2-bytes, missing second byte
|
||||||
|
"\xe8\x8f", // 3-bytes, missing third byte
|
||||||
|
"\xf4\x8f\x8f", // 4-bytes, missing fourth byte
|
||||||
|
|
||||||
|
"\xd0\x7f", // 2-bytes, second byte MSB unset
|
||||||
|
"\xe8\x7f\x8f", // 3-bytes, second byte MSB unset
|
||||||
|
"\xe8\x8f\x7f", // 3-bytes, third byte MSB unset
|
||||||
|
"\xf4\x7f\x8f\x8f", // 4-bytes, second byte MSB unset
|
||||||
|
"\xf4\x8f\x7f\x8f", // 4-bytes, third byte MSB unset
|
||||||
|
"\xf4\x8f\x8f\x7f", // 4-bytes, fourth byte MSB unset
|
||||||
}));
|
}));
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
Loading…
Reference in New Issue