QueryHelper: improve the precision of the tick->ns conversion.

Previously the conversion lost precision because of the repeated float additions and rounding of float to uint. It is reworked to operate using only integers, giving a better precision (3e-5 instead of 2e-3) for all periods between 1ns and 2^16ns (which all GPUs seem to fall in to). The QueryHelper test is reworked to test multiple periods to provide better certainty that the maths in the conversion is correct. Bug: dawn:1250 Change-Id: I43703bb3a40c4e362d78126e3bf0d830690bc1d7 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/77780 Reviewed-by: Loko Kung <lokokung@google.com> Commit-Queue: Corentin Wallez <cwallez@chromium.org>
2025-10-28 20:49:53 +00:00 · 2022-02-01 12:05:22 +00:00 · 2022-02-01 12:05:22 +00:00 · 777654e6b9
commit 777654e6b9
parent 9dfcc20750
4 changed files with 116 additions and 62 deletions
--- a/src/dawn_native/CommandEncoder.cpp
+++ b/src/dawn_native/CommandEncoder.cpp
@ -439,9 +439,8 @@ namespace dawn::native {
                                                     availability.size() * sizeof(uint32_t)));

            // Timestamp params uniform buffer
-            TimestampParams params = {firstQuery, queryCount,
-                                      static_cast<uint32_t>(destinationOffset),
-                                      device->GetTimestampPeriodInNS()};
+            TimestampParams params(firstQuery, queryCount, static_cast<uint32_t>(destinationOffset),
+                                   device->GetTimestampPeriodInNS());

            BufferDescriptor parmsDesc = {};
            parmsDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;
--- a/src/dawn_native/QueryHelper.cpp
+++ b/src/dawn_native/QueryHelper.cpp
@ -24,6 +24,8 @@
 #include "dawn_native/InternalPipelineStore.h"
 #include "dawn_native/utils/WGPUHelpers.h"

+#include <cmath>
+
 namespace dawn::native {

    namespace {
@ -32,7 +34,8 @@ namespace dawn::native {
        static_assert(offsetof(dawn::native::TimestampParams, first) == 0);
        static_assert(offsetof(dawn::native::TimestampParams, count) == 4);
        static_assert(offsetof(dawn::native::TimestampParams, offset) == 8);
-        static_assert(offsetof(dawn::native::TimestampParams, period) == 12);
+        static_assert(offsetof(dawn::native::TimestampParams, multiplier) == 12);
+        static_assert(offsetof(dawn::native::TimestampParams, rightShift) == 16);

        static const char sConvertTimestampsToNanoseconds[] = R"(
            struct Timestamp {
@ -52,14 +55,14 @@ namespace dawn::native {
                first  : u32;
                count  : u32;
                offset : u32;
-                period : f32;
+                multiplier : u32;
+                right_shift  : u32;
            };

            @group(0) @binding(0) var<storage, read_write> timestamps : TimestampArr;
            @group(0) @binding(1) var<storage, read> availability : AvailabilityArr;
            @group(0) @binding(2) var<uniform> params : TimestampParams;

-
            let sizeofTimestamp : u32 = 8u;

            @stage(compute) @workgroup_size(8, 1, 1)
@ -68,8 +71,6 @@ namespace dawn::native {

                var index = GlobalInvocationID.x + params.offset / sizeofTimestamp;

-                var timestamp = timestamps.t[index];
-
                // Return 0 for the unavailable value.
                if (availability.v[GlobalInvocationID.x + params.first] == 0u) {
                    timestamps.t[index].low = 0u;
@ -77,31 +78,40 @@ namespace dawn::native {
                    return;
                }

-                // Multiply the values in timestamps buffer by the period.
-                var period = params.period;
-                var w = 0u;
+                var timestamp = timestamps.t[index];

-                // If the product of low 32-bits and the period does not exceed the maximum of u32,
-                // directly do the multiplication, otherwise, use two u32 to represent the high
-                // 16-bits and low 16-bits of this u32, then multiply them by the period separately.
-                if (timestamp.low <= u32(f32(0xFFFFFFFFu) / period)) {
-                    timestamps.t[index].low = u32(round(f32(timestamp.low) * period));
-                } else {
-                    var lo = timestamp.low & 0xFFFFu;
-                    var hi = timestamp.low >> 16u;
+                // TODO(dawn:1250): Consider using the umulExtended and uaddCarry intrinsics once
+                // available.
+                var chunks : array<u32, 5>;
+                chunks[0] = timestamp.low & 0xFFFFu;
+                chunks[1] = timestamp.low >> 16u;
+                chunks[2] = timestamp.high & 0xFFFFu;
+                chunks[3] = timestamp.high >> 16u;
+                chunks[4] = 0u;

-                    var t0 = u32(round(f32(lo) * period));
-                    var t1 = u32(round(f32(hi) * period)) + (t0 >> 16u);
-                    w = t1 >> 16u;
-
-                    var result = t1 << 16u;
-                    result = result | (t0 & 0xFFFFu);
-                    timestamps.t[index].low = result;
+                // Multiply all the chunks with the integer period.
+                for (var i = 0u; i < 4u; i = i + 1u) {
+                    chunks[i] = chunks[i] * params.multiplier;
                }

-                // Get the nearest integer to the float result. For high 32-bits, the round
-                // function will greatly help reduce the accuracy loss of the final result.
-                timestamps.t[index].high = u32(round(f32(timestamp.high) * period)) + w;
+                // Propagate the carry
+                var carry = 0u;
+                for (var i = 0u; i < 4u; i = i + 1u) {
+                    var chunk_with_carry = chunks[i] + carry;
+                    carry = chunk_with_carry >> 16u;
+                    chunks[i] = chunk_with_carry & 0xFFFFu;
+                }
+                chunks[4] = carry;
+
+                // Apply the right shift.
+                for (var i = 0u; i < 4u; i = i + 1u) {
+                    var low = chunks[i] >> params.right_shift;
+                    var high = (chunks[i + 1u] << (16u - params.right_shift)) & 0xFFFFu;
+                    chunks[i] = low | high;
+                }
+
+                timestamps.t[index].low = chunks[0] | (chunks[1] << 16u);
+                timestamps.t[index].high = chunks[2] | (chunks[3] << 16u);
            }
        )";

@ -150,6 +160,30 @@ namespace dawn::native {

    }  // anonymous namespace

+    TimestampParams::TimestampParams(uint32_t first, uint32_t count, uint32_t offset, float period)
+        : first(first), count(count), offset(offset) {
+        // The overall conversion happening, if p is the period, m the multiplier, s the shift, is::
+        //
+        //   m = round(p * 2^s)
+        //
+        // Then in the shader we compute:
+        //
+        //   m / 2^s = round(p * 2^s) / 2*s ~= p
+        //
+        // The goal is to find the best shift to keep the precision of computations. The
+        // conversion shader uses chunks of 16 bits to compute the multiplication with the perios,
+        // so we need to keep the multiplier under 2^16. At the same time, the larger the
+        // multiplier, the better the precision, so we maximize the value of the right shift while
+        // keeping the multiplier under 2 ^ 16
+        uint32_t upperLog2 = ceil(log2(period));
+
+        // Clamp the shift to 16 because we're doing computations in 16bit chunks. The
+        // multiplication by the period will overflow the chunks, but timestamps are mostly
+        // informational so that's ok.
+        rightShift = 16u - std::min(upperLog2, 16u);
+        multiplier = uint32_t(period * (1 << rightShift));
+    }
+
    MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder,
                                                    BufferBase* timestamps,
                                                    BufferBase* availability,
--- a/src/dawn_native/QueryHelper.h
+++ b/src/dawn_native/QueryHelper.h
@ -24,10 +24,13 @@ namespace dawn::native {
    class CommandEncoder;

    struct TimestampParams {
+        TimestampParams(uint32_t first, uint32_t count, uint32_t offset, float period);
+
        uint32_t first;
        uint32_t count;
        uint32_t offset;
-        float period;
+        uint32_t multiplier;
+        uint32_t rightShift;
    };

    MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder,
--- a/src/tests/white_box/QueryInternalShaderTests.cpp
+++ b/src/tests/white_box/QueryInternalShaderTests.cpp
@ -43,22 +43,31 @@ namespace {
        // Expect the actual results are approximately equal to the expected values.
        testing::AssertionResult Check(const void* data, size_t size) override {
            DAWN_ASSERT(size == sizeof(uint64_t) * mExpected.size());
-            constexpr static float kErrorToleranceRatio = 0.002f;
+            // The computations in the shader use a multiplier that's a 16bit integer plus a shift
+            // that maximize the multiplier. This means that for the range of periods we care about
+            // (1 to 2^16-1 ns per tick), the high order bit of the multiplier will always be set.
+            // Intuitively this means that we have 15 bits of precision in the computation so we
+            // expect that for the error tolerance.
+            constexpr static float kErrorToleranceRatio = 1.0 / (1 << 15);  // about 3e-5.

            const uint64_t* actual = static_cast<const uint64_t*>(data);
            for (size_t i = 0; i < mExpected.size(); ++i) {
-                if (mExpected[i] == 0 && actual[i] != 0) {
-                    return testing::AssertionFailure()
-                           << "Expected data[" << i << "] to be 0, actual " << actual[i]
-                           << std::endl;
+                if (mExpected[i] == 0) {
+                    if (actual[i] != 0) {
+                        return testing::AssertionFailure()
+                               << "Expected data[" << i << "] to be 0, actual " << actual[i]
+                               << std::endl;
+                    }
+                    return testing::AssertionSuccess();
                }

-                if (abs(static_cast<int64_t>(mExpected[i] - actual[i])) >
-                    mExpected[i] * kErrorToleranceRatio) {
+                float errorRate =
+                    abs(static_cast<int64_t>(mExpected[i] - actual[i])) / float(mExpected[i]);
+                if (errorRate > kErrorToleranceRatio) {
                    return testing::AssertionFailure()
                           << "Expected data[" << i << "] to be " << mExpected[i] << ", actual "
-                           << actual[i] << ". Error rate is larger than " << kErrorToleranceRatio
-                           << std::endl;
+                           << actual[i] << ". Error rate " << errorRate << " is larger than "
+                           << kErrorToleranceRatio << std::endl;
                }
            }

@ -73,12 +82,6 @@ namespace {

 constexpr static uint64_t kSentinelValue = ~uint64_t(0u);

-// A gpu frequency on Intel D3D12 (ticks/second)
-constexpr uint64_t kGPUFrequency = 12000048u;
-constexpr uint64_t kNsPerSecond = 1000000000u;
-// Timestamp period in nanoseconds
-constexpr float kPeriod = static_cast<float>(kNsPerSecond) / kGPUFrequency;
-
 class QueryInternalShaderTests : public DawnTest {
  protected:
    // Original timestamp values in query set for testing
@ -103,7 +106,8 @@ class QueryInternalShaderTests : public DawnTest {
    const std::vector<uint64_t> GetExpectedResults(const std::vector<uint64_t>& origin,
                                                   uint32_t start,
                                                   uint32_t firstQuery,
-                                                   uint32_t queryCount) {
+                                                   uint32_t queryCount,
+                                                   float period) {
        std::vector<uint64_t> expected(origin.begin(), origin.end());
        for (size_t i = 0; i < queryCount; i++) {
            if (availabilities[firstQuery + i] == 0) {
@ -113,13 +117,16 @@ class QueryInternalShaderTests : public DawnTest {
                // Maybe the timestamp * period is larger than the maximum of uint64, so cast the
                // delta value to double (higher precision than float)
                expected[start + i] =
-                    static_cast<uint64_t>(static_cast<double>(origin[start + i]) * kPeriod);
+                    static_cast<uint64_t>(static_cast<double>(origin[start + i]) * period);
            }
        }
        return expected;
    }

-    void RunTest(uint32_t firstQuery, uint32_t queryCount, uint32_t destinationOffset) {
+    void RunTest(uint32_t firstQuery,
+                 uint32_t queryCount,
+                 uint32_t destinationOffset,
+                 float period) {
        ASSERT(destinationOffset % 256 == 0);

        uint64_t size = queryCount * sizeof(uint64_t) + destinationOffset;
@ -147,7 +154,7 @@ class QueryInternalShaderTests : public DawnTest {
                                        kQueryCount * sizeof(uint32_t), wgpu::BufferUsage::Storage);

        // The params uniform buffer
-        dawn::native::TimestampParams params = {firstQuery, queryCount, destinationOffset, kPeriod};
+        dawn::native::TimestampParams params(firstQuery, queryCount, destinationOffset, period);
        wgpu::Buffer paramsBuffer = utils::CreateBufferFromData(device, &params, sizeof(params),
                                                                wgpu::BufferUsage::Uniform);

@ -158,13 +165,13 @@ class QueryInternalShaderTests : public DawnTest {
        queue.Submit(1, &commands);

        const std::vector<uint64_t> expected =
-            GetExpectedResults(timestampValues, start, firstQuery, queryCount);
+            GetExpectedResults(timestampValues, start, firstQuery, queryCount, period);

        EXPECT_BUFFER(timestampsBuffer, 0, size,
-                      new InternalShaderExpectation(expected.data(), size / sizeof(uint64_t)));
+                      new InternalShaderExpectation(expected.data(), size / sizeof(uint64_t)))
+            << "Conversion test for period:" << period << " firstQuery:" << firstQuery
+            << " queryCount:" << queryCount << " destinationOffset:" << destinationOffset;
    }
-
-  private:
 };

 // Test the accuracy of timestamp compute shader which uses unsigned 32-bit integers to simulate
@ -185,17 +192,28 @@ TEST_P(QueryInternalShaderTests, TimestampComputeShader) {

    DAWN_TEST_UNSUPPORTED_IF(UsesWire());

-    // Convert timestamps in timestamps buffer with offset 0
-    // Test for ResolveQuerySet(querySet, 0, kQueryCount, timestampsBuffer, 0)
-    RunTest(0, kQueryCount, 0);
+    constexpr std::array<float, 5> kPeriodsToTest = {
+        1,
+        7,
+        // A gpu frequency on Intel D3D12 (ticks/second)
+        83.33,
+        1042,
+        65535,
+    };

-    // Convert timestamps in timestamps buffer with offset 256
-    // Test for ResolveQuerySet(querySet, 1, kQueryCount - 1, timestampsBuffer, 256)
-    RunTest(1, kQueryCount - 1, 256);
+    for (float period : kPeriodsToTest) {
+        // Convert timestamps in timestamps buffer with offset 0
+        // Test for ResolveQuerySet(querySet, 0, kQueryCount, timestampsBuffer, 0)
+        RunTest(0, kQueryCount, 0, period);

-    // Convert partial timestamps in timestamps buffer with offset 256
-    // Test for ResolveQuerySet(querySet, 1, 4, timestampsBuffer, 256)
-    RunTest(1, 4, 256);
+        // Convert timestamps in timestamps buffer with offset 256
+        // Test for ResolveQuerySet(querySet, 1, kQueryCount - 1, timestampsBuffer, 256)
+        RunTest(1, kQueryCount - 1, 256, period);
+
+        // Convert partial timestamps in timestamps buffer with offset 256
+        // Test for ResolveQuerySet(querySet, 1, 4, timestampsBuffer, 256)
+        RunTest(1, 4, 256, period);
+    }
 }

 DAWN_INSTANTIATE_TEST(QueryInternalShaderTests, D3D12Backend(), MetalBackend(), VulkanBackend());