QueryHelper: improve the precision of the tick->ns conversion.

Previously the conversion lost precision because of the repeated
float additions and rounding of float to uint. It is reworked to operate
using only integers, giving a better precision (3e-5 instead of 2e-3)
for all periods between 1ns and 2^16ns (which all GPUs seem to fall in
to).

The QueryHelper test is reworked to test multiple periods to provide
better certainty that the maths in the conversion is correct.

Bug: dawn:1250
Change-Id: I43703bb3a40c4e362d78126e3bf0d830690bc1d7
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/77780
Reviewed-by: Loko Kung <lokokung@google.com>
Commit-Queue: Corentin Wallez <cwallez@chromium.org>
This commit is contained in:
Corentin Wallez 2022-02-01 12:05:22 +00:00 committed by Dawn LUCI CQ
parent 9dfcc20750
commit 777654e6b9
4 changed files with 116 additions and 62 deletions

View File

@ -439,9 +439,8 @@ namespace dawn::native {
availability.size() * sizeof(uint32_t))); availability.size() * sizeof(uint32_t)));
// Timestamp params uniform buffer // Timestamp params uniform buffer
TimestampParams params = {firstQuery, queryCount, TimestampParams params(firstQuery, queryCount, static_cast<uint32_t>(destinationOffset),
static_cast<uint32_t>(destinationOffset), device->GetTimestampPeriodInNS());
device->GetTimestampPeriodInNS()};
BufferDescriptor parmsDesc = {}; BufferDescriptor parmsDesc = {};
parmsDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst; parmsDesc.usage = wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst;

View File

@ -24,6 +24,8 @@
#include "dawn_native/InternalPipelineStore.h" #include "dawn_native/InternalPipelineStore.h"
#include "dawn_native/utils/WGPUHelpers.h" #include "dawn_native/utils/WGPUHelpers.h"
#include <cmath>
namespace dawn::native { namespace dawn::native {
namespace { namespace {
@ -32,7 +34,8 @@ namespace dawn::native {
static_assert(offsetof(dawn::native::TimestampParams, first) == 0); static_assert(offsetof(dawn::native::TimestampParams, first) == 0);
static_assert(offsetof(dawn::native::TimestampParams, count) == 4); static_assert(offsetof(dawn::native::TimestampParams, count) == 4);
static_assert(offsetof(dawn::native::TimestampParams, offset) == 8); static_assert(offsetof(dawn::native::TimestampParams, offset) == 8);
static_assert(offsetof(dawn::native::TimestampParams, period) == 12); static_assert(offsetof(dawn::native::TimestampParams, multiplier) == 12);
static_assert(offsetof(dawn::native::TimestampParams, rightShift) == 16);
static const char sConvertTimestampsToNanoseconds[] = R"( static const char sConvertTimestampsToNanoseconds[] = R"(
struct Timestamp { struct Timestamp {
@ -52,14 +55,14 @@ namespace dawn::native {
first : u32; first : u32;
count : u32; count : u32;
offset : u32; offset : u32;
period : f32; multiplier : u32;
right_shift : u32;
}; };
@group(0) @binding(0) var<storage, read_write> timestamps : TimestampArr; @group(0) @binding(0) var<storage, read_write> timestamps : TimestampArr;
@group(0) @binding(1) var<storage, read> availability : AvailabilityArr; @group(0) @binding(1) var<storage, read> availability : AvailabilityArr;
@group(0) @binding(2) var<uniform> params : TimestampParams; @group(0) @binding(2) var<uniform> params : TimestampParams;
let sizeofTimestamp : u32 = 8u; let sizeofTimestamp : u32 = 8u;
@stage(compute) @workgroup_size(8, 1, 1) @stage(compute) @workgroup_size(8, 1, 1)
@ -68,8 +71,6 @@ namespace dawn::native {
var index = GlobalInvocationID.x + params.offset / sizeofTimestamp; var index = GlobalInvocationID.x + params.offset / sizeofTimestamp;
var timestamp = timestamps.t[index];
// Return 0 for the unavailable value. // Return 0 for the unavailable value.
if (availability.v[GlobalInvocationID.x + params.first] == 0u) { if (availability.v[GlobalInvocationID.x + params.first] == 0u) {
timestamps.t[index].low = 0u; timestamps.t[index].low = 0u;
@ -77,31 +78,40 @@ namespace dawn::native {
return; return;
} }
// Multiply the values in timestamps buffer by the period. var timestamp = timestamps.t[index];
var period = params.period;
var w = 0u;
// If the product of low 32-bits and the period does not exceed the maximum of u32, // TODO(dawn:1250): Consider using the umulExtended and uaddCarry intrinsics once
// directly do the multiplication, otherwise, use two u32 to represent the high // available.
// 16-bits and low 16-bits of this u32, then multiply them by the period separately. var chunks : array<u32, 5>;
if (timestamp.low <= u32(f32(0xFFFFFFFFu) / period)) { chunks[0] = timestamp.low & 0xFFFFu;
timestamps.t[index].low = u32(round(f32(timestamp.low) * period)); chunks[1] = timestamp.low >> 16u;
} else { chunks[2] = timestamp.high & 0xFFFFu;
var lo = timestamp.low & 0xFFFFu; chunks[3] = timestamp.high >> 16u;
var hi = timestamp.low >> 16u; chunks[4] = 0u;
var t0 = u32(round(f32(lo) * period)); // Multiply all the chunks with the integer period.
var t1 = u32(round(f32(hi) * period)) + (t0 >> 16u); for (var i = 0u; i < 4u; i = i + 1u) {
w = t1 >> 16u; chunks[i] = chunks[i] * params.multiplier;
var result = t1 << 16u;
result = result | (t0 & 0xFFFFu);
timestamps.t[index].low = result;
} }
// Get the nearest integer to the float result. For high 32-bits, the round // Propagate the carry
// function will greatly help reduce the accuracy loss of the final result. var carry = 0u;
timestamps.t[index].high = u32(round(f32(timestamp.high) * period)) + w; for (var i = 0u; i < 4u; i = i + 1u) {
var chunk_with_carry = chunks[i] + carry;
carry = chunk_with_carry >> 16u;
chunks[i] = chunk_with_carry & 0xFFFFu;
}
chunks[4] = carry;
// Apply the right shift.
for (var i = 0u; i < 4u; i = i + 1u) {
var low = chunks[i] >> params.right_shift;
var high = (chunks[i + 1u] << (16u - params.right_shift)) & 0xFFFFu;
chunks[i] = low | high;
}
timestamps.t[index].low = chunks[0] | (chunks[1] << 16u);
timestamps.t[index].high = chunks[2] | (chunks[3] << 16u);
} }
)"; )";
@ -150,6 +160,30 @@ namespace dawn::native {
} // anonymous namespace } // anonymous namespace
TimestampParams::TimestampParams(uint32_t first, uint32_t count, uint32_t offset, float period)
: first(first), count(count), offset(offset) {
// The overall conversion happening, if p is the period, m the multiplier, s the shift, is::
//
// m = round(p * 2^s)
//
// Then in the shader we compute:
//
// m / 2^s = round(p * 2^s) / 2*s ~= p
//
// The goal is to find the best shift to keep the precision of computations. The
// conversion shader uses chunks of 16 bits to compute the multiplication with the perios,
// so we need to keep the multiplier under 2^16. At the same time, the larger the
// multiplier, the better the precision, so we maximize the value of the right shift while
// keeping the multiplier under 2 ^ 16
uint32_t upperLog2 = ceil(log2(period));
// Clamp the shift to 16 because we're doing computations in 16bit chunks. The
// multiplication by the period will overflow the chunks, but timestamps are mostly
// informational so that's ok.
rightShift = 16u - std::min(upperLog2, 16u);
multiplier = uint32_t(period * (1 << rightShift));
}
MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder, MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder,
BufferBase* timestamps, BufferBase* timestamps,
BufferBase* availability, BufferBase* availability,

View File

@ -24,10 +24,13 @@ namespace dawn::native {
class CommandEncoder; class CommandEncoder;
struct TimestampParams { struct TimestampParams {
TimestampParams(uint32_t first, uint32_t count, uint32_t offset, float period);
uint32_t first; uint32_t first;
uint32_t count; uint32_t count;
uint32_t offset; uint32_t offset;
float period; uint32_t multiplier;
uint32_t rightShift;
}; };
MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder, MaybeError EncodeConvertTimestampsToNanoseconds(CommandEncoder* encoder,

View File

@ -43,22 +43,31 @@ namespace {
// Expect the actual results are approximately equal to the expected values. // Expect the actual results are approximately equal to the expected values.
testing::AssertionResult Check(const void* data, size_t size) override { testing::AssertionResult Check(const void* data, size_t size) override {
DAWN_ASSERT(size == sizeof(uint64_t) * mExpected.size()); DAWN_ASSERT(size == sizeof(uint64_t) * mExpected.size());
constexpr static float kErrorToleranceRatio = 0.002f; // The computations in the shader use a multiplier that's a 16bit integer plus a shift
// that maximize the multiplier. This means that for the range of periods we care about
// (1 to 2^16-1 ns per tick), the high order bit of the multiplier will always be set.
// Intuitively this means that we have 15 bits of precision in the computation so we
// expect that for the error tolerance.
constexpr static float kErrorToleranceRatio = 1.0 / (1 << 15); // about 3e-5.
const uint64_t* actual = static_cast<const uint64_t*>(data); const uint64_t* actual = static_cast<const uint64_t*>(data);
for (size_t i = 0; i < mExpected.size(); ++i) { for (size_t i = 0; i < mExpected.size(); ++i) {
if (mExpected[i] == 0 && actual[i] != 0) { if (mExpected[i] == 0) {
return testing::AssertionFailure() if (actual[i] != 0) {
<< "Expected data[" << i << "] to be 0, actual " << actual[i] return testing::AssertionFailure()
<< std::endl; << "Expected data[" << i << "] to be 0, actual " << actual[i]
<< std::endl;
}
return testing::AssertionSuccess();
} }
if (abs(static_cast<int64_t>(mExpected[i] - actual[i])) > float errorRate =
mExpected[i] * kErrorToleranceRatio) { abs(static_cast<int64_t>(mExpected[i] - actual[i])) / float(mExpected[i]);
if (errorRate > kErrorToleranceRatio) {
return testing::AssertionFailure() return testing::AssertionFailure()
<< "Expected data[" << i << "] to be " << mExpected[i] << ", actual " << "Expected data[" << i << "] to be " << mExpected[i] << ", actual "
<< actual[i] << ". Error rate is larger than " << kErrorToleranceRatio << actual[i] << ". Error rate " << errorRate << " is larger than "
<< std::endl; << kErrorToleranceRatio << std::endl;
} }
} }
@ -73,12 +82,6 @@ namespace {
constexpr static uint64_t kSentinelValue = ~uint64_t(0u); constexpr static uint64_t kSentinelValue = ~uint64_t(0u);
// A gpu frequency on Intel D3D12 (ticks/second)
constexpr uint64_t kGPUFrequency = 12000048u;
constexpr uint64_t kNsPerSecond = 1000000000u;
// Timestamp period in nanoseconds
constexpr float kPeriod = static_cast<float>(kNsPerSecond) / kGPUFrequency;
class QueryInternalShaderTests : public DawnTest { class QueryInternalShaderTests : public DawnTest {
protected: protected:
// Original timestamp values in query set for testing // Original timestamp values in query set for testing
@ -103,7 +106,8 @@ class QueryInternalShaderTests : public DawnTest {
const std::vector<uint64_t> GetExpectedResults(const std::vector<uint64_t>& origin, const std::vector<uint64_t> GetExpectedResults(const std::vector<uint64_t>& origin,
uint32_t start, uint32_t start,
uint32_t firstQuery, uint32_t firstQuery,
uint32_t queryCount) { uint32_t queryCount,
float period) {
std::vector<uint64_t> expected(origin.begin(), origin.end()); std::vector<uint64_t> expected(origin.begin(), origin.end());
for (size_t i = 0; i < queryCount; i++) { for (size_t i = 0; i < queryCount; i++) {
if (availabilities[firstQuery + i] == 0) { if (availabilities[firstQuery + i] == 0) {
@ -113,13 +117,16 @@ class QueryInternalShaderTests : public DawnTest {
// Maybe the timestamp * period is larger than the maximum of uint64, so cast the // Maybe the timestamp * period is larger than the maximum of uint64, so cast the
// delta value to double (higher precision than float) // delta value to double (higher precision than float)
expected[start + i] = expected[start + i] =
static_cast<uint64_t>(static_cast<double>(origin[start + i]) * kPeriod); static_cast<uint64_t>(static_cast<double>(origin[start + i]) * period);
} }
} }
return expected; return expected;
} }
void RunTest(uint32_t firstQuery, uint32_t queryCount, uint32_t destinationOffset) { void RunTest(uint32_t firstQuery,
uint32_t queryCount,
uint32_t destinationOffset,
float period) {
ASSERT(destinationOffset % 256 == 0); ASSERT(destinationOffset % 256 == 0);
uint64_t size = queryCount * sizeof(uint64_t) + destinationOffset; uint64_t size = queryCount * sizeof(uint64_t) + destinationOffset;
@ -147,7 +154,7 @@ class QueryInternalShaderTests : public DawnTest {
kQueryCount * sizeof(uint32_t), wgpu::BufferUsage::Storage); kQueryCount * sizeof(uint32_t), wgpu::BufferUsage::Storage);
// The params uniform buffer // The params uniform buffer
dawn::native::TimestampParams params = {firstQuery, queryCount, destinationOffset, kPeriod}; dawn::native::TimestampParams params(firstQuery, queryCount, destinationOffset, period);
wgpu::Buffer paramsBuffer = utils::CreateBufferFromData(device, &params, sizeof(params), wgpu::Buffer paramsBuffer = utils::CreateBufferFromData(device, &params, sizeof(params),
wgpu::BufferUsage::Uniform); wgpu::BufferUsage::Uniform);
@ -158,13 +165,13 @@ class QueryInternalShaderTests : public DawnTest {
queue.Submit(1, &commands); queue.Submit(1, &commands);
const std::vector<uint64_t> expected = const std::vector<uint64_t> expected =
GetExpectedResults(timestampValues, start, firstQuery, queryCount); GetExpectedResults(timestampValues, start, firstQuery, queryCount, period);
EXPECT_BUFFER(timestampsBuffer, 0, size, EXPECT_BUFFER(timestampsBuffer, 0, size,
new InternalShaderExpectation(expected.data(), size / sizeof(uint64_t))); new InternalShaderExpectation(expected.data(), size / sizeof(uint64_t)))
<< "Conversion test for period:" << period << " firstQuery:" << firstQuery
<< " queryCount:" << queryCount << " destinationOffset:" << destinationOffset;
} }
private:
}; };
// Test the accuracy of timestamp compute shader which uses unsigned 32-bit integers to simulate // Test the accuracy of timestamp compute shader which uses unsigned 32-bit integers to simulate
@ -185,17 +192,28 @@ TEST_P(QueryInternalShaderTests, TimestampComputeShader) {
DAWN_TEST_UNSUPPORTED_IF(UsesWire()); DAWN_TEST_UNSUPPORTED_IF(UsesWire());
// Convert timestamps in timestamps buffer with offset 0 constexpr std::array<float, 5> kPeriodsToTest = {
// Test for ResolveQuerySet(querySet, 0, kQueryCount, timestampsBuffer, 0) 1,
RunTest(0, kQueryCount, 0); 7,
// A gpu frequency on Intel D3D12 (ticks/second)
83.33,
1042,
65535,
};
// Convert timestamps in timestamps buffer with offset 256 for (float period : kPeriodsToTest) {
// Test for ResolveQuerySet(querySet, 1, kQueryCount - 1, timestampsBuffer, 256) // Convert timestamps in timestamps buffer with offset 0
RunTest(1, kQueryCount - 1, 256); // Test for ResolveQuerySet(querySet, 0, kQueryCount, timestampsBuffer, 0)
RunTest(0, kQueryCount, 0, period);
// Convert partial timestamps in timestamps buffer with offset 256 // Convert timestamps in timestamps buffer with offset 256
// Test for ResolveQuerySet(querySet, 1, 4, timestampsBuffer, 256) // Test for ResolveQuerySet(querySet, 1, kQueryCount - 1, timestampsBuffer, 256)
RunTest(1, 4, 256); RunTest(1, kQueryCount - 1, 256, period);
// Convert partial timestamps in timestamps buffer with offset 256
// Test for ResolveQuerySet(querySet, 1, 4, timestampsBuffer, 256)
RunTest(1, 4, 256, period);
}
} }
DAWN_INSTANTIATE_TEST(QueryInternalShaderTests, D3D12Backend(), MetalBackend(), VulkanBackend()); DAWN_INSTANTIATE_TEST(QueryInternalShaderTests, D3D12Backend(), MetalBackend(), VulkanBackend());