Workaround empty occlusion queries on Apple GPUs

Fixed: dawn:1707
Change-Id: I73f2b595c57830266e7d34ca9e483d803a00d331
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/125180
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Loko Kung <lokokung@google.com>
Reviewed-by: Kai Ninomiya <kainino@chromium.org>
Commit-Queue: Kai Ninomiya <kainino@chromium.org>
This commit is contained in:
Austin Eng 2023-03-23 07:15:03 +00:00 committed by Dawn LUCI CQ
parent fa2d3851a8
commit 0dc09c01a4
6 changed files with 129 additions and 10 deletions

View File

@ -358,6 +358,12 @@ static constexpr ToggleEnumAndInfoList kToggleNameAndInfoList = {{
"subresource are completely initialized, and StoreOp::Discard is always translated as a "
"Store.",
"https://crbug.com/dawn/838", ToggleStage::Device}},
{Toggle::MetalFillEmptyOcclusionQueriesWithZero,
{"metal_fill_empty_occlusion_queries_with_zero",
"Apple GPUs leave stale results in the visibility result buffer instead of writing zero if "
"an occlusion query is empty. Workaround this by explicitly filling it with zero if there "
"are no draw calls.",
"https://crbug.com/dawn/1707", ToggleStage::Device}},
{Toggle::UseBlitForBufferToDepthTextureCopy,
{"use_blit_for_buffer_to_depth_texture_copy",
"Use a blit instead of a copy command to copy buffer data to the depth aspect of a "

View File

@ -87,6 +87,7 @@ enum class Toggle {
MetalUseCombinedDepthStencilFormatForStencil8,
MetalUseBothDepthAndStencilAttachmentsForCombinedDepthStencilFormats,
MetalKeepMultisubresourceDepthStencilTexturesInitialized,
MetalFillEmptyOcclusionQueriesWithZero,
UseBlitForBufferToDepthTextureCopy,
UseBlitForBufferToStencilTextureCopy,
UseBlitForDepthTextureToTextureCopyToNonzeroSubresource,

View File

@ -404,6 +404,10 @@ class Adapter : public AdapterBase {
true);
}
if (gpu_info::IsApple(vendorId)) {
deviceToggles->Default(Toggle::MetalFillEmptyOcclusionQueriesWithZero, true);
}
// Local testing shows the workaround is needed on AMD Radeon HD 8870M (gcn-1) MacOS 12.1;
// not on AMD Radeon Pro 555 (gcn-4) MacOS 13.1.
// Conservatively enable the workaround on AMD unless the system is MacOS 13.1+

View File

@ -15,6 +15,9 @@
#ifndef SRC_DAWN_NATIVE_METAL_COMMANDBUFFERMTL_H_
#define SRC_DAWN_NATIVE_METAL_COMMANDBUFFERMTL_H_
#include <set>
#include <utility>
#include "dawn/native/CommandBuffer.h"
#include "dawn/native/Commands.h"
#include "dawn/native/Error.h"
@ -32,6 +35,7 @@ namespace dawn::native::metal {
class CommandRecordingContext;
class Device;
class Texture;
class QuerySet;
void RecordCopyBufferToTexture(CommandRecordingContext* commandContext,
id<MTLBuffer> mtlBuffer,
@ -60,8 +64,15 @@ class CommandBuffer final : public CommandBufferBase {
MaybeError EncodeComputePass(CommandRecordingContext* commandContext,
BeginComputePassCmd* computePassCmd);
// Empty occlusion queries aren't filled to zero on Apple GPUs. This set is used to
// track which results should be explicitly zero'ed as a workaround. Use of empty queries
// *should* mostly be a degenerate scenario, so this std::set shouldn't be performance-critical.
// The set is passed as nullptr to `EncodeRenderPass` if the workaround is not in use.
using EmptyOcclusionQueries = std::set<std::pair<QuerySet*, uint32_t>>;
MaybeError EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
BeginRenderPassCmd* renderPassCmd);
BeginRenderPassCmd* renderPassCmd,
EmptyOcclusionQueries* emptyOcclusionQueries);
};
} // namespace dawn::native::metal

View File

@ -812,12 +812,25 @@ MaybeError CommandBuffer::FillCommands(CommandRecordingContext* commandContext)
Device* device = ToBackend(GetDevice());
NSRef<MTLRenderPassDescriptor> descriptor = CreateMTLRenderPassDescriptor(
device, cmd, device->UseCounterSamplingAtStageBoundary());
EmptyOcclusionQueries emptyOcclusionQueries;
DAWN_TRY(EncodeMetalRenderPass(
device, commandContext, descriptor.Get(), cmd->width, cmd->height,
[this](id<MTLRenderCommandEncoder> encoder, BeginRenderPassCmd* cmd)
-> MaybeError { return this->EncodeRenderPass(encoder, cmd); },
[&](id<MTLRenderCommandEncoder> encoder,
BeginRenderPassCmd* cmd) -> MaybeError {
return this->EncodeRenderPass(
encoder, cmd,
device->IsToggleEnabled(Toggle::MetalFillEmptyOcclusionQueriesWithZero)
? &emptyOcclusionQueries
: nullptr);
},
cmd));
for (const auto& [querySet, queryIndex] : emptyOcclusionQueries) {
[commandContext->EnsureBlit()
fillBuffer:querySet->GetVisibilityBuffer()
range:NSMakeRange(queryIndex * sizeof(uint64_t), sizeof(uint64_t))
value:0u];
}
nextRenderPassNumber++;
break;
}
@ -1347,7 +1360,8 @@ MaybeError CommandBuffer::EncodeComputePass(CommandRecordingContext* commandCont
}
MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
BeginRenderPassCmd* renderPassCmd) {
BeginRenderPassCmd* renderPassCmd,
EmptyOcclusionQueries* emptyOcclusionQueries) {
bool enableVertexPulling = GetDevice()->IsToggleEnabled(Toggle::MetalEnableVertexPulling);
RenderPipeline* lastPipeline = nullptr;
id<MTLBuffer> indexBuffer = nullptr;
@ -1355,6 +1369,8 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
MTLIndexType indexBufferType;
uint64_t indexFormatSize = 0;
bool didDrawInCurrentOcclusionQuery = false;
StorageBufferLengthTracker storageBufferLengths = {};
VertexBufferTracker vertexBuffers(&storageBufferLengths);
BindGroupTracker bindGroups(&storageBufferLengths);
@ -1390,12 +1406,14 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
vertexStart:draw->firstVertex
vertexCount:draw->vertexCount
instanceCount:draw->instanceCount];
didDrawInCurrentOcclusionQuery = true;
} else {
[encoder drawPrimitives:lastPipeline->GetMTLPrimitiveTopology()
vertexStart:draw->firstVertex
vertexCount:draw->vertexCount
instanceCount:draw->instanceCount
baseInstance:draw->firstInstance];
didDrawInCurrentOcclusionQuery = true;
}
}
break;
@ -1420,6 +1438,7 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
indexBufferOffset:indexBufferBaseOffset +
draw->firstIndex * indexFormatSize
instanceCount:draw->instanceCount];
didDrawInCurrentOcclusionQuery = true;
} else {
[encoder drawIndexedPrimitives:lastPipeline->GetMTLPrimitiveTopology()
indexCount:draw->indexCount
@ -1430,6 +1449,7 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
instanceCount:draw->instanceCount
baseVertex:draw->baseVertex
baseInstance:draw->firstInstance];
didDrawInCurrentOcclusionQuery = true;
}
}
break;
@ -1448,6 +1468,7 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
[encoder drawPrimitives:lastPipeline->GetMTLPrimitiveTopology()
indirectBuffer:indirectBuffer
indirectBufferOffset:draw->indirectOffset];
didDrawInCurrentOcclusionQuery = true;
break;
}
@ -1469,6 +1490,7 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
indexBufferOffset:indexBufferBaseOffset
indirectBuffer:indirectBuffer
indirectBufferOffset:draw->indirectOffset];
didDrawInCurrentOcclusionQuery = true;
break;
}
@ -1649,6 +1671,7 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
[encoder setVisibilityResultMode:MTLVisibilityResultModeBoolean
offset:cmd->queryIndex * sizeof(uint64_t)];
didDrawInCurrentOcclusionQuery = false;
break;
}
@ -1657,6 +1680,20 @@ MaybeError CommandBuffer::EncodeRenderPass(id<MTLRenderCommandEncoder> encoder,
[encoder setVisibilityResultMode:MTLVisibilityResultModeDisabled
offset:cmd->queryIndex * sizeof(uint64_t)];
if (emptyOcclusionQueries) {
// Empty occlusion queries aren't filled to zero on Apple GPUs.
// Keep track of them so we can clear them if necessary.
auto key = std::make_pair(ToBackend(renderPassCmd->occlusionQuerySet.Get()),
cmd->queryIndex);
if (!didDrawInCurrentOcclusionQuery) {
emptyOcclusionQueries->insert(std::move(key));
} else {
auto it = emptyOcclusionQueries->find(std::move(key));
if (it != emptyOcclusionQueries->end()) {
emptyOcclusionQueries->erase(it);
}
}
}
break;
}

View File

@ -366,8 +366,6 @@ TEST_P(OcclusionQueryTests, ResolveWithoutWritten) {
// Test setting an occlusion query to non-zero, then rewriting it without drawing, resolves to 0.
TEST_P(OcclusionQueryTests, RewriteNoDrawToZero) {
// TODO(crbug.com/dawn/1707): The second query does not reset it to 0.
DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());
constexpr uint32_t kQueryCount = 1;
wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
@ -406,8 +404,6 @@ TEST_P(OcclusionQueryTests, RewriteNoDrawToZero) {
// Test setting an occlusion query to non-zero, then rewriting it without drawing, resolves to 0.
// Do the two queries+resolves in separate submits.
TEST_P(OcclusionQueryTests, RewriteNoDrawToZeroSeparateSubmit) {
// TODO(crbug.com/dawn/1707): The second query does not reset it to 0.
DAWN_SUPPRESS_TEST_IF(IsMacOS() && IsMetal() && IsApple());
constexpr uint32_t kQueryCount = 1;
wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
@ -445,6 +441,66 @@ TEST_P(OcclusionQueryTests, RewriteNoDrawToZeroSeparateSubmit) {
new OcclusionExpectation(OcclusionExpectation::Result::Zero));
}
// Test that resetting an occlusion query to zero works when a draw is done where all primitives
// fail the depth test.
TEST_P(OcclusionQueryTests, RewriteToZeroWithDraw) {
constexpr uint32_t kQueryCount = 1;
utils::ComboRenderPipelineDescriptor descriptor;
descriptor.vertex.module = vsModule;
descriptor.cFragment.module = fsModule;
// Enable depth and stencil tests and set comparison tests to never pass.
wgpu::DepthStencilState* depthStencil = descriptor.EnableDepthStencil(kDepthStencilFormat);
depthStencil->depthCompare = wgpu::CompareFunction::Never;
depthStencil->stencilFront.compare = wgpu::CompareFunction::Never;
depthStencil->stencilBack.compare = wgpu::CompareFunction::Never;
wgpu::RenderPipeline renderPipeline = device.CreateRenderPipeline(&descriptor);
wgpu::Texture renderTarget = CreateRenderTexture(kColorFormat);
wgpu::TextureView renderTargetView = renderTarget.CreateView();
wgpu::Texture depthTexture = CreateRenderTexture(kDepthStencilFormat);
wgpu::TextureView depthTextureView = depthTexture.CreateView();
wgpu::QuerySet querySet = CreateOcclusionQuerySet(kQueryCount);
wgpu::Buffer destination = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
// Set all bits in buffer to check 0 is correctly written if there is no sample passed the
// occlusion testing
queue.WriteBuffer(destination, 0, &kSentinelValue, sizeof(kSentinelValue));
wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
{
utils::BasicRenderPass renderPass = utils::CreateBasicRenderPass(device, kRTSize, kRTSize);
renderPass.renderPassInfo.occlusionQuerySet = querySet;
wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass.renderPassInfo);
pass.SetPipeline(pipeline);
pass.BeginOcclusionQuery(0);
pass.Draw(3);
pass.EndOcclusionQuery();
pass.End();
}
{
utils::ComboRenderPassDescriptor renderPass({renderTargetView}, depthTextureView);
renderPass.occlusionQuerySet = querySet;
wgpu::RenderPassEncoder rewritePass = encoder.BeginRenderPass(&renderPass);
rewritePass.SetPipeline(renderPipeline);
rewritePass.BeginOcclusionQuery(0);
rewritePass.Draw(3);
rewritePass.EndOcclusionQuery();
rewritePass.End();
}
encoder.ResolveQuerySet(querySet, 0, kQueryCount, destination, 0);
wgpu::CommandBuffer commands = encoder.Finish();
queue.Submit(1, &commands);
EXPECT_BUFFER(destination, 0, sizeof(uint64_t),
new OcclusionExpectation(OcclusionExpectation::Result::Zero));
}
// Test resolving occlusion query to the destination buffer with offset
TEST_P(OcclusionQueryTests, ResolveToBufferWithOffset) {
constexpr uint32_t kQueryCount = 2;
@ -1227,7 +1283,11 @@ TEST_P(TimestampQueryInsidePassesTests, FromComputePass) {
}
}
DAWN_INSTANTIATE_TEST(OcclusionQueryTests, D3D12Backend(), MetalBackend(), VulkanBackend());
DAWN_INSTANTIATE_TEST(OcclusionQueryTests,
D3D12Backend(),
MetalBackend(),
MetalBackend({"metal_fill_empty_occlusion_queries_with_zero"}),
VulkanBackend());
DAWN_INSTANTIATE_TEST(PipelineStatisticsQueryTests,
D3D12Backend(),
MetalBackend(),