Check number of digits in integer during tokenization

While looking ahead to determine if a token is an integer, check the number of digits to make sure that it can actually fit in the internal representation. This is an optimization on the existing code, to cause an early exit and prevent pathological cases with huge integers from consuming too much processing time, when they will never succeed. From a functional perspective this has no effect on whether or not a token will be accepted as an integer, so almost all of the tests do not need an update. The one exception is a case where the lexer now catches the invalid integer earlier in the tokenization, so the error message is shorter. This does not handle the equivalent problem for float literals, though I believe that only exists for non-hex floats. BUG=chromium:1240715 Change-Id: I27e43711d5f5eda1d54a4128ba514f810abd0313 Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/62280 Auto-Submit: Ryan Harrison <rharrison@chromium.org> Kokoro: Kokoro <noreply+kokoro@google.com> Commit-Queue: Ben Clayton <bclayton@google.com> Reviewed-by: Ben Clayton <bclayton@google.com>
2021-08-27 08:29:37 +00:00 · 2021-08-27 08:29:37 +00:00 · 200cdd2052
parent 9021eb5594
commit 200cdd2052
2 changed files with 65 additions and 5 deletions
--- a/src/reader/wgsl/lexer.cc
+++ b/src/reader/wgsl/lexer.cc
@ -543,6 +543,7 @@ Token Lexer::build_token_from_int_if_possible(Source source,
 }

 Token Lexer::try_hex_integer() {
+  constexpr size_t kMaxDigits = 8;  // Valid for both 32-bit integer types
  auto start = pos_;
  auto end = pos_;

@ -551,13 +552,23 @@ Token Lexer::try_hex_integer() {
  if (matches(end, "-")) {
    end++;
  }
+
  if (!matches(end, "0x")) {
-    return Token();
+    return {};
  }
  end += 2;

+  auto first = end;
  while (!is_eof() && is_hex(content_->data[end])) {
-    end += 1;
+    end++;
+
+    auto digits = end - first;
+    if (digits > kMaxDigits) {
+      return {Token::Type::kError, source,
+              "integer literal (" +
+                  content_->data.substr(start, end - 1 - start) +
+                  "...) has too many digits"};
+    }
  }

  pos_ = end;
@ -567,6 +578,7 @@ Token Lexer::try_hex_integer() {
 }

 Token Lexer::try_integer() {
+  constexpr size_t kMaxDigits = 10;  // Valid for both 32-bit integer types
  auto start = pos_;
  auto end = start;

@ -575,6 +587,7 @@ Token Lexer::try_integer() {
  if (matches(end, "-")) {
    end++;
  }
+
  if (end >= len_ || !is_digit(content_->data[end])) {
    return {};
  }
@ -582,6 +595,14 @@ Token Lexer::try_integer() {
  auto first = end;
  while (end < len_ && is_digit(content_->data[end])) {
    end++;
+
+    auto digits = end - first;
+    if (digits > kMaxDigits) {
+      return {Token::Type::kError, source,
+              "integer literal (" +
+                  content_->data.substr(start, end - 1 - start) +
+                  "...) has too many digits"};
+    }
  }

  // If the first digit is a zero this must only be zero as leading zeros
--- a/src/reader/wgsl/lexer_test.cc
+++ b/src/reader/wgsl/lexer_test.cc
@ -251,6 +251,27 @@ TEST_F(LexerTest, IntegerTest_HexSignedTooSmall) {
  EXPECT_EQ(t.to_str(), "i32 (-0x8000000F) too small");
 }

+TEST_F(LexerTest, IntegerTest_HexSignedTooManyDigits) {
+  {
+    Source::FileContent content("-0x100000000000000000000000");
+    Lexer l("test.wgsl", &content);
+
+    auto t = l.next();
+    ASSERT_TRUE(t.Is(Token::Type::kError));
+    EXPECT_EQ(t.to_str(),
+              "integer literal (-0x10000000...) has too many digits");
+  }
+  {
+    Source::FileContent content("0x100000000000000");
+    Lexer l("test.wgsl", &content);
+
+    auto t = l.next();
+    ASSERT_TRUE(t.Is(Token::Type::kError));
+    EXPECT_EQ(t.to_str(),
+              "integer literal (0x10000000...) has too many digits");
+  }
+}
+
 struct HexUnsignedIntData {
  const char* input;
  uint32_t result;
@ -287,13 +308,13 @@ INSTANTIATE_TEST_SUITE_P(
                    HexUnsignedIntData{"0xFFFFFFFFu",
                                       std::numeric_limits<uint32_t>::max()}));

-TEST_F(LexerTest, IntegerTest_HexUnsignedTooLarge) {
-  Source::FileContent content("0xffffffffffu");
+TEST_F(LexerTest, IntegerTest_HexUnsignedTooManyDigits) {
+  Source::FileContent content("0x1000000000000000000000u");
  Lexer l("test.wgsl", &content);

  auto t = l.next();
  ASSERT_TRUE(t.Is(Token::Type::kError));
-  EXPECT_EQ(t.to_str(), "u32 (0xffffffffff) too large");
+  EXPECT_EQ(t.to_str(), "integer literal (0x10000000...) has too many digits");
 }

 struct UnsignedIntData {
@ -325,6 +346,15 @@ INSTANTIATE_TEST_SUITE_P(LexerTest,
                                         UnsignedIntData{"4294967295u",
                                                         4294967295u}));

+TEST_F(LexerTest, IntegerTest_UnsignedTooManyDigits) {
+  Source::FileContent content("10000000000000000000000u");
+  Lexer l("test.wgsl", &content);
+
+  auto t = l.next();
+  ASSERT_TRUE(t.Is(Token::Type::kError));
+  EXPECT_EQ(t.to_str(), "integer literal (1000000000...) has too many digits");
+}
+
 struct SignedIntData {
  const char* input;
  int32_t result;
@ -357,6 +387,15 @@ INSTANTIATE_TEST_SUITE_P(
                    SignedIntData{"2147483647", 2147483647},
                    SignedIntData{"-2147483648", -2147483648LL}));

+TEST_F(LexerTest, IntegerTest_SignedTooManyDigits) {
+  Source::FileContent content("-10000000000000000");
+  Lexer l("test.wgsl", &content);
+
+  auto t = l.next();
+  ASSERT_TRUE(t.Is(Token::Type::kError));
+  EXPECT_EQ(t.to_str(), "integer literal (-1000000000...) has too many digits");
+}
+
 using IntegerTest_Invalid = testing::TestWithParam<const char*>;
 TEST_P(IntegerTest_Invalid, Parses) {
  Source::FileContent content(GetParam());