Count the line pos, offset and size of compilation message in UTF-16

This patch counts the line position, offset and size of the compilation message in UTF-16 code units and saves them to WGPUCompilationMessage to align with the latest WebGPU spec. Bug: dawn:1357 Change-Id: If8f4026bd5b4a64a078e100762b6d1f61da50053 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/115640 Kokoro: Kokoro <noreply+kokoro@google.com> Commit-Queue: Jiawei Shao <jiawei.shao@intel.com> Reviewed-by: Corentin Wallez <cwallez@chromium.org>
2025-12-16 00:17:03 +00:00 · 2023-01-10 00:03:24 +00:00
parent 3d2caaae47
commit f7beb85fd1
10 changed files with 177 additions and 27 deletions
--- a/dawn.json
+++ b/dawn.json
@@ -749,7 +749,10 @@
            {"name": "line num", "type": "uint64_t"},
            {"name": "line pos", "type": "uint64_t"},
            {"name": "offset", "type": "uint64_t"},
-            {"name": "length", "type": "uint64_t"}
+            {"name": "length", "type": "uint64_t"},
+            {"name": "utf16 line pos", "type": "uint64_t"},
+            {"name": "utf16 offset", "type": "uint64_t"},
+            {"name": "utf16 length", "type": "uint64_t"}
        ]
    },
    "compilation message type": {
--- a/include/tint/tint.h
+++ b/include/tint/tint.h
@@ -25,6 +25,7 @@
 #include "src/tint/diagnostic/printer.h"
 #include "src/tint/inspector/inspector.h"
 #include "src/tint/reader/reader.h"
+#include "src/tint/text/unicode.h"
 #include "src/tint/transform/binding_remapper.h"
 #include "src/tint/transform/clamp_frag_depth.h"
 #include "src/tint/transform/first_index_offset.h"
--- a/src/dawn/native/CompilationMessages.cpp
+++ b/src/dawn/native/CompilationMessages.cpp
@@ -36,6 +36,39 @@ WGPUCompilationMessageType tintSeverityToMessageType(tint::diag::Severity severi

 }  // anonymous namespace

+ResultOrError<uint64_t> CountUTF16CodeUnitsFromUTF8String(const std::string_view& utf8String) {
+    if (tint::text::utf8::IsASCII(utf8String)) {
+        return utf8String.size();  // Each ASCII byte maps to exactly one UTF-16 code unit.
+    }
+
+    uint64_t numberOfUTF16CodeUnits = 0;
+    std::string_view remaining = utf8String;
+    while (!remaining.empty()) {
+        auto [codePoint, utf8CharacterByteLength] = tint::text::utf8::Decode(remaining);
+        // Decode() reports a byte length of 0 when the next code point cannot be decoded.
+        if (utf8CharacterByteLength == 0) {
+            return DAWN_INTERNAL_ERROR("Fail to decode the unicode string");
+        }
+        remaining = remaining.substr(utf8CharacterByteLength);
+
+        // Add the number of UTF-16 code units needed for this code point. See
+        // https://en.wikipedia.org/wiki/UTF-16 for more details.
+        if (codePoint.value <= 0xD7FF || (codePoint.value >= 0xE000 && codePoint.value <= 0xFFFF)) {
+            // Code points in U+0000..U+D7FF and U+E000..U+FFFF (everything below U+10000 except
+            // the surrogate range) are each encoded as a single 16-bit code unit.
+            ++numberOfUTF16CodeUnits;
+        } else if (codePoint.value >= 0x10000) {
+            // U+10000 and above: encoded as two 16-bit code units (no upper bound is checked).
+            numberOfUTF16CodeUnits += 2;
+        } else {
+            // Only U+D800..U+DFFF reaches this branch; UTF-16 cannot encode these code points.
+            return DAWN_INTERNAL_ERROR("The unicode string contains illegal unicode code point.");
+        }
+    }
+
+    return numberOfUTF16CodeUnits;
+}
+
 OwnedCompilationMessages::OwnedCompilationMessages() {
    mCompilationInfo.nextInChain = 0;
    mCompilationInfo.messageCount = 0;
@@ -53,23 +86,29 @@ void OwnedCompilationMessages::AddMessageForTesting(std::string message,
    // Cannot add messages after GetCompilationInfo has been called.
    ASSERT(mCompilationInfo.messages == nullptr);

+    // The message may only contain ASCII characters.
+    ASSERT(tint::text::utf8::IsASCII(message));
+
    mMessageStrings.push_back(message);
    mMessages.push_back({nullptr, nullptr, static_cast<WGPUCompilationMessageType>(type), lineNum,
-                         linePos, offset, length});
+                         linePos, offset, length, linePos, offset, length});
 }

-void OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnostic) {
+MaybeError OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnostic) {
    // Cannot add messages after GetCompilationInfo has been called.
    ASSERT(mCompilationInfo.messages == nullptr);

    // Tint line and column values are 1-based.
    uint64_t lineNum = diagnostic.source.range.begin.line;
-    uint64_t lineCol = diagnostic.source.range.begin.column;
+    uint64_t linePosInBytes = diagnostic.source.range.begin.column;
    // The offset is 0-based.
-    uint64_t offset = 0;
-    uint64_t length = 0;
+    uint64_t offsetInBytes = 0;
+    uint64_t lengthInBytes = 0;
+    uint64_t linePosInUTF16 = 0;
+    uint64_t offsetInUTF16 = 0;
+    uint64_t lengthInUTF16 = 0;

-    if (lineNum && lineCol && diagnostic.source.file) {
+    if (lineNum && linePosInBytes && diagnostic.source.file) {
        const tint::Source::FileContent& content = diagnostic.source.file->content;

        // Tint stores line as std::string_view in a complete source std::string that's in the
@@ -78,23 +117,38 @@ void OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnost
        // range starts at 1 while the array of lines start at 0 (hence the -1).
        const char* fileStart = content.data.data();
        const char* lineStart = content.lines[lineNum - 1].data();
-        offset = static_cast<uint64_t>(lineStart - fileStart) + lineCol - 1;
+        offsetInBytes = static_cast<uint64_t>(lineStart - fileStart) + linePosInBytes - 1;
+
+        // The linePosInBytes is 1-based.
+        uint64_t linePosOffsetInUTF16 = 0;
+        DAWN_TRY_ASSIGN(linePosOffsetInUTF16, CountUTF16CodeUnitsFromUTF8String(
+                                                  std::string_view(lineStart, linePosInBytes - 1)));
+        linePosInUTF16 = linePosOffsetInUTF16 + 1;
+
+        // The offset is 0-based.
+        uint64_t lineStartToFileStartOffsetInUTF16 = 0;
+        DAWN_TRY_ASSIGN(lineStartToFileStartOffsetInUTF16,
+                        CountUTF16CodeUnitsFromUTF8String(std::string_view(
+                            fileStart, static_cast<uint64_t>(lineStart - fileStart))));
+        offsetInUTF16 = lineStartToFileStartOffsetInUTF16 + linePosInUTF16 - 1;

        // If the range has a valid start but the end is not specified, clamp it to the start.
        uint64_t endLineNum = diagnostic.source.range.end.line;
        uint64_t endLineCol = diagnostic.source.range.end.column;
        if (endLineNum == 0 || endLineCol == 0) {
            endLineNum = lineNum;
-            endLineCol = lineCol;
+            endLineCol = linePosInBytes;
        }

        const char* endLineStart = content.lines[endLineNum - 1].data();
-        uint64_t endOffset = static_cast<uint64_t>(endLineStart - fileStart) + endLineCol - 1;
-
+        uint64_t endOffsetInBytes =
+            static_cast<uint64_t>(endLineStart - fileStart) + endLineCol - 1;
        // The length of the message is the difference between the starting offset and the
-        // ending offset. Negative ranges aren't allowed
-        ASSERT(endOffset >= offset);
-        length = endOffset - offset;
+        // ending offset. Negative ranges aren't allowed.
+        ASSERT(endOffsetInBytes >= offsetInBytes);
+        lengthInBytes = endOffsetInBytes - offsetInBytes;
+        DAWN_TRY_ASSIGN(lengthInUTF16, CountUTF16CodeUnitsFromUTF8String(std::string_view(
+                                           fileStart + offsetInBytes, lengthInBytes)));
    }

    if (diagnostic.code) {
@@ -104,18 +158,23 @@ void OwnedCompilationMessages::AddMessage(const tint::diag::Diagnostic& diagnost
    }

    mMessages.push_back({nullptr, nullptr, tintSeverityToMessageType(diagnostic.severity), lineNum,
-                         lineCol, offset, length});
+                         linePosInBytes, offsetInBytes, lengthInBytes, linePosInUTF16,
+                         offsetInUTF16, lengthInUTF16});
+
+    return {};
 }

-void OwnedCompilationMessages::AddMessages(const tint::diag::List& diagnostics) {
+MaybeError OwnedCompilationMessages::AddMessages(const tint::diag::List& diagnostics) {
    // Cannot add messages after GetCompilationInfo has been called.
    ASSERT(mCompilationInfo.messages == nullptr);

    for (const auto& diag : diagnostics) {
-        AddMessage(diag);
+        DAWN_TRY(AddMessage(diag));
    }

    AddFormattedTintMessages(diagnostics);
+
+    return {};
 }

 void OwnedCompilationMessages::ClearMessages() {
--- a/src/dawn/native/CompilationMessages.h
+++ b/src/dawn/native/CompilationMessages.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>

+#include "dawn/native/Error.h"
 #include "dawn/native/dawn_platform.h"

 #include "dawn/common/NonCopyable.h"
@@ -29,6 +30,8 @@ class List;

 namespace dawn::native {

+ResultOrError<uint64_t> CountUTF16CodeUnitsFromUTF8String(const std::string_view& utf8String);
+
 class OwnedCompilationMessages : public NonCopyable {
  public:
    OwnedCompilationMessages();
@@ -41,14 +44,14 @@ class OwnedCompilationMessages : public NonCopyable {
        uint64_t linePos = 0,
        uint64_t offset = 0,
        uint64_t length = 0);
-    void AddMessages(const tint::diag::List& diagnostics);
+    MaybeError AddMessages(const tint::diag::List& diagnostics);
    void ClearMessages();

    const WGPUCompilationInfo* GetCompilationInfo();
    const std::vector<std::string>& GetFormattedTintMessages();

  private:
-    void AddMessage(const tint::diag::Diagnostic& diagnostic);
+    MaybeError AddMessage(const tint::diag::Diagnostic& diagnostic);
    void AddFormattedTintMessages(const tint::diag::List& diagnostics);

    WGPUCompilationInfo mCompilationInfo;
--- a/src/dawn/native/ShaderModule.cpp
+++ b/src/dawn/native/ShaderModule.cpp
@@ -299,7 +299,7 @@ ResultOrError<tint::Program> ParseWGSL(const tint::Source::File* file,
 #if TINT_BUILD_WGSL_READER
    tint::Program program = tint::reader::wgsl::Parse(file);
    if (outMessages != nullptr) {
-        outMessages->AddMessages(program.Diagnostics());
+        DAWN_TRY(outMessages->AddMessages(program.Diagnostics()));
    }
    if (!program.IsValid()) {
        return DAWN_VALIDATION_ERROR("Tint WGSL reader failure: %s\n", program.Diagnostics().str());
@@ -316,7 +316,7 @@ ResultOrError<tint::Program> ParseSPIRV(const std::vector<uint32_t>& spirv,
 #if TINT_BUILD_SPV_READER
    tint::Program program = tint::reader::spirv::Parse(spirv);
    if (outMessages != nullptr) {
-        outMessages->AddMessages(program.Diagnostics());
+        DAWN_TRY(outMessages->AddMessages(program.Diagnostics()));
    }
    if (!program.IsValid()) {
        return DAWN_VALIDATION_ERROR("Tint SPIR-V reader failure:\nParser: %s\n",
@@ -789,7 +789,7 @@ MaybeError ValidateWGSLProgramExtension(const DeviceBase* device,

    if (hasDisallowedExtension) {
        if (outMessages != nullptr) {
-            outMessages->AddMessages(messages);
+            DAWN_TRY(outMessages->AddMessages(messages));
        }
        return DAWN_MAKE_ERROR(InternalErrorType::Validation,
                               "Shader module uses extension(s) not enabled for its device.");
@@ -983,7 +983,7 @@ ResultOrError<tint::Program> RunTransforms(tint::transform::Transform* transform
                                           OwnedCompilationMessages* outMessages) {
    tint::transform::Output output = transform->Run(program, inputs);
    if (outMessages != nullptr) {
-        outMessages->AddMessages(output.program.Diagnostics());
+        DAWN_TRY(outMessages->AddMessages(output.program.Diagnostics()));
    }
    DAWN_INVALID_IF(!output.program.IsValid(), "Tint program failure: %s\n",
                    output.program.Diagnostics().str());
--- a/src/dawn/tests/BUILD.gn
+++ b/src/dawn/tests/BUILD.gn
@@ -301,6 +301,7 @@ dawn_test("dawn_unittests") {
    "unittests/SystemUtilsTests.cpp",
    "unittests/ToBackendTests.cpp",
    "unittests/TypedIntegerTests.cpp",
+    "unittests/UnicodeTests.cpp",
    "unittests/native/BlobTests.cpp",
    "unittests/native/CacheRequestTests.cpp",
    "unittests/native/CommandBufferEncodingTests.cpp",
--- a/src/dawn/tests/unittests/UnicodeTests.cpp
+++ b/src/dawn/tests/unittests/UnicodeTests.cpp
@@ -0,0 +1,73 @@
+// Copyright 2022 The Dawn Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dawn/native/ShaderModule.h"
+#include "dawn/tests/unittests/validation/ValidationTest.h"
+
+class CountUTF16CodeUnitsFromUTF8StringTest : public ValidationTest {};
+
+TEST_F(CountUTF16CodeUnitsFromUTF8StringTest, ValidUnicodeString) {
+    struct TestCase {
+        const char* u8String;    // UTF-8 encoded input
+        uint64_t lengthInUTF16;  // Expected number of UTF-16 code units
+    };
+
+    // Valid UTF-8 and UTF-16 range boundaries. Referenced from src/tint/text/unicode_test.cc
+    constexpr std::array<TestCase, 12> kTestCases = {{
+        {"", 0},
+        {"abc", 3},
+        {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c", 4},
+        {"def\xf0\x9f\x91\x8b\xf0\x9f\x8c\x8e", 7},
+        {"\xed\x9f\xbf", 1},      // CodePoint == 0xD7FF
+        {"\xed\x9f\xbe", 1},      // CodePoint == 0xD7FF - 1
+        {"\xee\x80\x80", 1},      // CodePoint == 0xE000
+        {"\xee\x80\x81", 1},      // CodePoint == 0xE000 + 1
+        {"\xef\xbf\xbf", 1},      // CodePoint == 0xFFFF
+        {"\xef\xbf\xbe", 1},      // CodePoint == 0xFFFF - 1
+        {"\xf0\x90\x80\x80", 2},  // CodePoint == 0x10000
+        {"\xf0\x90\x80\x81", 2},  // CodePoint == 0x10000 + 1
+    }};
+
+    for (const TestCase& testCase : kTestCases) {
+        dawn::native::ResultOrError<uint64_t> resultOrError =
+            dawn::native::CountUTF16CodeUnitsFromUTF8String(std::string_view(testCase.u8String));
+        ASSERT_TRUE(resultOrError.IsSuccess());
+        ASSERT_EQ(testCase.lengthInUTF16, resultOrError.AcquireSuccess());
+    }
+}
+
+TEST_F(CountUTF16CodeUnitsFromUTF8StringTest, InvalidUnicodeString) {
+    // Surrogates and malformed UTF-8 sequences. Referenced from src/tint/text/unicode_test.cc
+    constexpr std::array<const char*, 12> kTestCases = {{
+        "\xed\xa0\x80",  // CodePoint == 0xD800 (0xD7FF + 1)
+        "\xed\xbf\xbf",  // CodePoint == 0xDFFF (0xE000 - 1)
+        "ab\xed\xa0\x80",
+        "\xd0",              // 2-bytes, missing second byte
+        "\xe8\x8f",          // 3-bytes, missing third byte
+        "\xf4\x8f\x8f",      // 4-bytes, missing fourth byte
+        "\xd0\x7f",          // 2-bytes, second byte MSB unset
+        "\xe8\x7f\x8f",      // 3-bytes, second byte MSB unset
+        "\xe8\x8f\x7f",      // 3-bytes, third byte MSB unset
+        "\xf4\x7f\x8f\x8f",  // 4-bytes, second byte MSB unset
+        "\xf4\x8f\x7f\x8f",  // 4-bytes, third byte MSB unset
+        "\xf4\x8f\x8f\x7f",  // 4-bytes, fourth byte MSB unset
+    }};
+
+    for (const char* testCase : kTestCases) {
+        dawn::native::ResultOrError<uint64_t> resultOrError =
+            dawn::native::CountUTF16CodeUnitsFromUTF8String(std::string_view(testCase));
+        ASSERT_TRUE(resultOrError.IsError());
+        std::ignore = resultOrError.AcquireError();
+    }
+}
--- a/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
+++ b/src/dawn/tests/unittests/wire/WireShaderModuleTests.cpp
@@ -93,7 +93,7 @@ TEST_F(WireShaderModuleTests, GetCompilationInfo) {
    wgpuShaderModuleGetCompilationInfo(shaderModule, ToMockGetCompilationInfoCallback, nullptr);

    WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
    WGPUCompilationInfo compilationInfo;
    compilationInfo.nextInChain = nullptr;
    compilationInfo.messageCount = 1;
@@ -133,7 +133,7 @@ TEST_F(WireShaderModuleTests, GetCompilationInfoBeforeDisconnect) {
    wgpuShaderModuleGetCompilationInfo(shaderModule, ToMockGetCompilationInfoCallback, nullptr);

    WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
    WGPUCompilationInfo compilationInfo;
    compilationInfo.nextInChain = nullptr;
    compilationInfo.messageCount = 1;
@@ -193,7 +193,7 @@ TEST_F(WireShaderModuleTests, GetCompilationInfoInsideCallbackBeforeDisconnect)
                                       &testData);

    WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
    WGPUCompilationInfo compilationInfo;
    compilationInfo.nextInChain = nullptr;
    compilationInfo.messageCount = 1;
@@ -220,7 +220,7 @@ TEST_F(WireShaderModuleTests, GetCompilationInfoInsideCallbackBeforeDestruction)
                                       &testData);

    WGPUCompilationMessage message = {
-        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8};
+        nullptr, "Test Message", WGPUCompilationMessageType_Info, 2, 4, 6, 8, 4, 6, 8};
    WGPUCompilationInfo compilationInfo;
    compilationInfo.nextInChain = nullptr;
    compilationInfo.messageCount = 1;
--- a/src/tint/text/unicode.cc
+++ b/src/tint/text/unicode.cc
@@ -427,6 +427,10 @@ std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len) {
    return {c, n};
 }

+std::pair<CodePoint, size_t> Decode(std::string_view utf8_string) {
+    return Decode(reinterpret_cast<const uint8_t*>(utf8_string.data()), utf8_string.size());
+}
+
 bool IsASCII(std::string_view str) {
    for (auto c : str) {
        if (c & 0x80) {
--- a/src/tint/text/unicode.h
+++ b/src/tint/text/unicode.h
@@ -69,6 +69,12 @@ namespace utf8 {
 ///          If the next code point cannot be decoded then returns [0,0].
 std::pair<CodePoint, size_t> Decode(const uint8_t* ptr, size_t len);

+/// Decodes the first code point in the utf8 string.
+/// @param utf8_string the string view that contains the utf8 sequence
+/// @returns a pair of CodePoint and width in code units (bytes).
+///          If the next code point cannot be decoded then returns [0,0].
+std::pair<CodePoint, size_t> Decode(std::string_view utf8_string);
+
 /// @returns true if all the utf-8 code points in the string are ASCII
 /// (code-points 0x00..0x7f).
 bool IsASCII(std::string_view);