diff --git a/src/dawn_native/CommandEncoder.cpp b/src/dawn_native/CommandEncoder.cpp index def1f4ef5e..67a3d4d774 100644 --- a/src/dawn_native/CommandEncoder.cpp +++ b/src/dawn_native/CommandEncoder.cpp @@ -439,9 +439,8 @@ namespace dawn::native { availability.size() * sizeof(uint32_t))); // Timestamp params uniform buffer - TimestampParams params = {firstQuery, queryCount, - static_cast(destinationOffset), - device->GetTimestampPeriodInNS()}; + TimestampParams params(firstQuery, queryCount, static_cast(destinationOffset), + device->GetTimestampPeriodInNS()); BufferDescriptor parmsDesc = {}; parmsDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst; diff --git a/src/dawn_native/QueryHelper.cpp b/src/dawn_native/QueryHelper.cpp index 7c8e9425ec..403552c4b3 100644 --- a/src/dawn_native/QueryHelper.cpp +++ b/src/dawn_native/QueryHelper.cpp @@ -24,6 +24,8 @@ #include "dawn_native/InternalPipelineStore.h" #include "dawn_native/utils/WGPUHelpers.h" +#include + namespace dawn::native { namespace { @@ -32,7 +34,8 @@ namespace dawn::native { static_assert(offsetof(dawn::native::TimestampParams, first) == 0); static_assert(offsetof(dawn::native::TimestampParams, count) == 4); static_assert(offsetof(dawn::native::TimestampParams, offset) == 8); - static_assert(offsetof(dawn::native::TimestampParams, period) == 12); + static_assert(offsetof(dawn::native::TimestampParams, multiplier) == 12); + static_assert(offsetof(dawn::native::TimestampParams, rightShift) == 16); static const char sConvertTimestampsToNanoseconds[] = R"( struct Timestamp { @@ -52,14 +55,14 @@ namespace dawn::native { first : u32; count : u32; offset : u32; - period : f32; + multiplier : u32; + right_shift : u32; }; @group(0) @binding(0) var timestamps : TimestampArr; @group(0) @binding(1) var availability : AvailabilityArr; @group(0) @binding(2) var params : TimestampParams; - let sizeofTimestamp : u32 = 8u; @stage(compute) @workgroup_size(8, 1, 1) @@ -68,8 +71,6 @@ namespace dawn::native { var index = GlobalInvocationID.x + params.offset / sizeofTimestamp; - var timestamp = timestamps.t[index]; - // Return 0 for the unavailable value. if (availability.v[GlobalInvocationID.x + params.first] == 0u) { timestamps.t[index].low = 0u; @@ -77,31 +78,40 @@ namespace dawn::native { return; } - // Multiply the values in timestamps buffer by the period. - var period = params.period; - var w = 0u; + var timestamp = timestamps.t[index]; - // If the product of low 32-bits and the period does not exceed the maximum of u32, - // directly do the multiplication, otherwise, use two u32 to represent the high - // 16-bits and low 16-bits of this u32, then multiply them by the period separately. - if (timestamp.low <= u32(f32(0xFFFFFFFFu) / period)) { - timestamps.t[index].low = u32(round(f32(timestamp.low) * period)); - } else { - var lo = timestamp.low & 0xFFFFu; - var hi = timestamp.low >> 16u; + // TODO(dawn:1250): Consider using the umulExtended and uaddCarry intrinsics once + // available. + var chunks : array; + chunks[0] = timestamp.low & 0xFFFFu; + chunks[1] = timestamp.low >> 16u; + chunks[2] = timestamp.high & 0xFFFFu; + chunks[3] = timestamp.high >> 16u; + chunks[4] = 0u; - var t0 = u32(round(f32(lo) * period)); - var t1 = u32(round(f32(hi) * period)) + (t0 >> 16u); - w = t1 >> 16u; - - var result = t1 << 16u; - result = result | (t0 & 0xFFFFu); - timestamps.t[index].low = result; + // Multiply all the chunks with the integer period. + for (var i = 0u; i < 4u; i = i + 1u) { + chunks[i] = chunks[i] * params.multiplier; } - // Get the nearest integer to the float result. For high 32-bits, the round - // function will greatly help reduce the accuracy loss of the final result. - timestamps.t[index].high = u32(round(f32(timestamp.high) * period)) + w; + // Propagate the carry + var carry = 0u; + for (var i = 0u; i < 4u; i = i + 1u) { + var chunk_with_carry = chunks[i] + carry; + carry = chunk_with_carry >> 16u; + chunks[i] = chunk_with_carry & 0xFFFFu; + } + chunks[4] = carry; + + // Apply the right shift. + for (var i = 0u; i < 4u; i = i + 1u) { + var low = chunks[i] >> params.right_shift; + var high = (chunks[i + 1u] << (16u - params.right_shift)) & 0xFFFFu; + chunks[i] = low | high; + } + + timestamps.t[index].low = chunks[0] | (chunks[1] << 16u); + timestamps.t[index].high = chunks[2] | (chunks[3] << 16u); } )"; @@ -150,6 +160,30 @@ namespace dawn::native { } // anonymous namespace + TimestampParams::TimestampParams(uint32_t first, uint32_t count, uint32_t offset, float period) + : first(first), count(count), offset(offset) { + // The overall conversion happening, if p is the period, m the multiplier, s the shift, is:: + // + // m = round(p * 2^s) + // + // Then in the shader we compute: + // + // m / 2^s = round(p * 2^s) / 2*s ~= p + // + // The goal is to find the best shift to keep the precision of computations. The + // conversion shader uses chunks of 16 bits to compute the multiplication with the perios, + // so we need to keep the multiplier under 2^16. At the same time, the larger the + // multiplier, the better the precision, so we maximize the value of the right shift while + // keeping the multiplier under 2 ^ 16 + uint32_t upperLog2 = ceil(log2(period)); + + // Clamp the shift to 16 because we're doing computations in 16bit chunks. The + // multiplication by the period will overflow the chunks, but timestamps are mostly + // informational so that's ok. + rightShift = 16u - std::min(upperLog2, 16u); + multiplier = uint32_t(period * (1 << rightShift)); + } + MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder, BufferBase* timestamps, BufferBase* availability, diff --git a/src/dawn_native/QueryHelper.h b/src/dawn_native/QueryHelper.h index 6f8362ceee..2c64754f2b 100644 --- a/src/dawn_native/QueryHelper.h +++ b/src/dawn_native/QueryHelper.h @@ -24,10 +24,13 @@ namespace dawn::native { class CommandEncoder; struct TimestampParams { + TimestampParams(uint32_t first, uint32_t count, uint32_t offset, float period); + uint32_t first; uint32_t count; uint32_t offset; - float period; + uint32_t multiplier; + uint32_t rightShift; }; MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder, diff --git a/src/tests/white_box/QueryInternalShaderTests.cpp b/src/tests/white_box/QueryInternalShaderTests.cpp index 5217998735..2c10f45cc9 100644 --- a/src/tests/white_box/QueryInternalShaderTests.cpp +++ b/src/tests/white_box/QueryInternalShaderTests.cpp @@ -43,22 +43,31 @@ namespace { // Expect the actual results are approximately equal to the expected values. testing::AssertionResult Check(const void* data, size_t size) override { DAWN_ASSERT(size == sizeof(uint64_t) * mExpected.size()); - constexpr static float kErrorToleranceRatio = 0.002f; + // The computations in the shader use a multiplier that's a 16bit integer plus a shift + // that maximize the multiplier. This means that for the range of periods we care about + // (1 to 2^16-1 ns per tick), the high order bit of the multiplier will always be set. + // Intuitively this means that we have 15 bits of precision in the computation so we + // expect that for the error tolerance. + constexpr static float kErrorToleranceRatio = 1.0 / (1 << 15); // about 3e-5. const uint64_t* actual = static_cast(data); for (size_t i = 0; i < mExpected.size(); ++i) { - if (mExpected[i] == 0 && actual[i] != 0) { - return testing::AssertionFailure() - << "Expected data[" << i << "] to be 0, actual " << actual[i] - << std::endl; + if (mExpected[i] == 0) { + if (actual[i] != 0) { + return testing::AssertionFailure() + << "Expected data[" << i << "] to be 0, actual " << actual[i] + << std::endl; + } + return testing::AssertionSuccess(); } - if (abs(static_cast(mExpected[i] - actual[i])) > - mExpected[i] * kErrorToleranceRatio) { + float errorRate = + abs(static_cast(mExpected[i] - actual[i])) / float(mExpected[i]); + if (errorRate > kErrorToleranceRatio) { return testing::AssertionFailure() << "Expected data[" << i << "] to be " << mExpected[i] << ", actual " - << actual[i] << ". Error rate is larger than " << kErrorToleranceRatio - << std::endl; + << actual[i] << ". Error rate " << errorRate << " is larger than " + << kErrorToleranceRatio << std::endl; } } @@ -73,12 +82,6 @@ namespace { constexpr static uint64_t kSentinelValue = ~uint64_t(0u); -// A gpu frequency on Intel D3D12 (ticks/second) -constexpr uint64_t kGPUFrequency = 12000048u; -constexpr uint64_t kNsPerSecond = 1000000000u; -// Timestamp period in nanoseconds -constexpr float kPeriod = static_cast(kNsPerSecond) / kGPUFrequency; - class QueryInternalShaderTests : public DawnTest { protected: // Original timestamp values in query set for testing @@ -103,7 +106,8 @@ class QueryInternalShaderTests : public DawnTest { const std::vector GetExpectedResults(const std::vector& origin, uint32_t start, uint32_t firstQuery, - uint32_t queryCount) { + uint32_t queryCount, + float period) { std::vector expected(origin.begin(), origin.end()); for (size_t i = 0; i < queryCount; i++) { if (availabilities[firstQuery + i] == 0) { @@ -113,13 +117,16 @@ class QueryInternalShaderTests : public DawnTest { // Maybe the timestamp * period is larger than the maximum of uint64, so cast the // delta value to double (higher precision than float) expected[start + i] = - static_cast(static_cast(origin[start + i]) * kPeriod); + static_cast(static_cast(origin[start + i]) * period); } } return expected; } - void RunTest(uint32_t firstQuery, uint32_t queryCount, uint32_t destinationOffset) { + void RunTest(uint32_t firstQuery, + uint32_t queryCount, + uint32_t destinationOffset, + float period) { ASSERT(destinationOffset % 256 == 0); uint64_t size = queryCount * sizeof(uint64_t) + destinationOffset; @@ -147,7 +154,7 @@ class QueryInternalShaderTests : public DawnTest { kQueryCount * sizeof(uint32_t), wgpu::BufferUsage::Storage); // The params uniform buffer - dawn::native::TimestampParams params = {firstQuery, queryCount, destinationOffset, kPeriod}; + dawn::native::TimestampParams params(firstQuery, queryCount, destinationOffset, period); wgpu::Buffer paramsBuffer = utils::CreateBufferFromData(device, ¶ms, sizeof(params), wgpu::BufferUsage::Uniform); @@ -158,13 +165,13 @@ class QueryInternalShaderTests : public DawnTest { queue.Submit(1, &commands); const std::vector expected = - GetExpectedResults(timestampValues, start, firstQuery, queryCount); + GetExpectedResults(timestampValues, start, firstQuery, queryCount, period); EXPECT_BUFFER(timestampsBuffer, 0, size, - new InternalShaderExpectation(expected.data(), size / sizeof(uint64_t))); + new InternalShaderExpectation(expected.data(), size / sizeof(uint64_t))) + << "Conversion test for period:" << period << " firstQuery:" << firstQuery + << " queryCount:" << queryCount << " destinationOffset:" << destinationOffset; } - - private: }; // Test the accuracy of timestamp compute shader which uses unsigned 32-bit integers to simulate @@ -185,17 +192,28 @@ TEST_P(QueryInternalShaderTests, TimestampComputeShader) { DAWN_TEST_UNSUPPORTED_IF(UsesWire()); - // Convert timestamps in timestamps buffer with offset 0 - // Test for ResolveQuerySet(querySet, 0, kQueryCount, timestampsBuffer, 0) - RunTest(0, kQueryCount, 0); + constexpr std::array kPeriodsToTest = { + 1, + 7, + // A gpu frequency on Intel D3D12 (ticks/second) + 83.33, + 1042, + 65535, + }; - // Convert timestamps in timestamps buffer with offset 256 - // Test for ResolveQuerySet(querySet, 1, kQueryCount - 1, timestampsBuffer, 256) - RunTest(1, kQueryCount - 1, 256); + for (float period : kPeriodsToTest) { + // Convert timestamps in timestamps buffer with offset 0 + // Test for ResolveQuerySet(querySet, 0, kQueryCount, timestampsBuffer, 0) + RunTest(0, kQueryCount, 0, period); - // Convert partial timestamps in timestamps buffer with offset 256 - // Test for ResolveQuerySet(querySet, 1, 4, timestampsBuffer, 256) - RunTest(1, 4, 256); + // Convert timestamps in timestamps buffer with offset 256 + // Test for ResolveQuerySet(querySet, 1, kQueryCount - 1, timestampsBuffer, 256) + RunTest(1, kQueryCount - 1, 256, period); + + // Convert partial timestamps in timestamps buffer with offset 256 + // Test for ResolveQuerySet(querySet, 1, 4, timestampsBuffer, 256) + RunTest(1, 4, 256, period); + } } DAWN_INSTANTIATE_TEST(QueryInternalShaderTests, D3D12Backend(), MetalBackend(), VulkanBackend());