398 lines
19 KiB
C++
398 lines
19 KiB
C++
|
// Copyright 2021 The Dawn Authors
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
#include "dawn_native/IndirectDrawValidationEncoder.h"
|
||
|
|
||
|
#include "common/Constants.h"
|
||
|
#include "common/Math.h"
|
||
|
#include "dawn_native/BindGroup.h"
|
||
|
#include "dawn_native/BindGroupLayout.h"
|
||
|
#include "dawn_native/CommandEncoder.h"
|
||
|
#include "dawn_native/ComputePassEncoder.h"
|
||
|
#include "dawn_native/ComputePipeline.h"
|
||
|
#include "dawn_native/Device.h"
|
||
|
#include "dawn_native/InternalPipelineStore.h"
|
||
|
#include "dawn_native/Queue.h"
|
||
|
|
||
|
#include <cstdlib>
|
||
|
#include <limits>
|
||
|
|
||
|
namespace dawn_native {
|
||
|
|
||
|
namespace {
|
||
|
// NOTE: This must match the workgroup_size attribute on the compute entry point below.
|
||
|
constexpr uint64_t kWorkgroupSize = 64;
|
||
|
|
||
|
// Equivalent to the BatchInfo struct defined in the shader below.
|
||
|
struct BatchInfo {
|
||
|
uint64_t numIndexBufferElements;
|
||
|
uint32_t numDraws;
|
||
|
uint32_t padding;
|
||
|
};
|
||
|
|
||
|
// TODO(https://crbug.com/dawn/1108): Propagate validation feedback from this shader in
|
||
|
// various failure modes.
|
||
|
static const char sRenderValidationShaderSource[] = R"(
|
||
|
let kNumIndirectParamsPerDrawCall = 5u;
|
||
|
|
||
|
let kIndexCountEntry = 0u;
|
||
|
let kInstanceCountEntry = 1u;
|
||
|
let kFirstIndexEntry = 2u;
|
||
|
let kBaseVertexEntry = 3u;
|
||
|
let kFirstInstanceEntry = 4u;
|
||
|
|
||
|
[[block]] struct BatchInfo {
|
||
|
numIndexBufferElementsLow: u32;
|
||
|
numIndexBufferElementsHigh: u32;
|
||
|
numDraws: u32;
|
||
|
padding: u32;
|
||
|
indirectOffsets: array<u32>;
|
||
|
};
|
||
|
|
||
|
[[block]] struct IndirectParams {
|
||
|
data: array<u32>;
|
||
|
};
|
||
|
|
||
|
[[group(0), binding(0)]] var<storage, read> batch: BatchInfo;
|
||
|
[[group(0), binding(1)]] var<storage, read_write> clientParams: IndirectParams;
|
||
|
[[group(0), binding(2)]] var<storage, write> validatedParams: IndirectParams;
|
||
|
|
||
|
fn fail(drawIndex: u32) {
|
||
|
let index = drawIndex * kNumIndirectParamsPerDrawCall;
|
||
|
validatedParams.data[index + kIndexCountEntry] = 0u;
|
||
|
validatedParams.data[index + kInstanceCountEntry] = 0u;
|
||
|
validatedParams.data[index + kFirstIndexEntry] = 0u;
|
||
|
validatedParams.data[index + kBaseVertexEntry] = 0u;
|
||
|
validatedParams.data[index + kFirstInstanceEntry] = 0u;
|
||
|
}
|
||
|
|
||
|
fn pass(drawIndex: u32) {
|
||
|
let vIndex = drawIndex * kNumIndirectParamsPerDrawCall;
|
||
|
let cIndex = batch.indirectOffsets[drawIndex];
|
||
|
validatedParams.data[vIndex + kIndexCountEntry] =
|
||
|
clientParams.data[cIndex + kIndexCountEntry];
|
||
|
validatedParams.data[vIndex + kInstanceCountEntry] =
|
||
|
clientParams.data[cIndex + kInstanceCountEntry];
|
||
|
validatedParams.data[vIndex + kFirstIndexEntry] =
|
||
|
clientParams.data[cIndex + kFirstIndexEntry];
|
||
|
validatedParams.data[vIndex + kBaseVertexEntry] =
|
||
|
clientParams.data[cIndex + kBaseVertexEntry];
|
||
|
validatedParams.data[vIndex + kFirstInstanceEntry] =
|
||
|
clientParams.data[cIndex + kFirstInstanceEntry];
|
||
|
}
|
||
|
|
||
|
[[stage(compute), workgroup_size(64, 1, 1)]]
|
||
|
fn main([[builtin(global_invocation_id)]] id : vec3<u32>) {
|
||
|
if (id.x >= batch.numDraws) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
let clientIndex = batch.indirectOffsets[id.x];
|
||
|
let firstInstance = clientParams.data[clientIndex + kFirstInstanceEntry];
|
||
|
if (firstInstance != 0u) {
|
||
|
fail(id.x);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (batch.numIndexBufferElementsHigh >= 2u) {
|
||
|
// firstIndex and indexCount are both u32. The maximum possible sum of these
|
||
|
// values is 0x1fffffffe, which is less than 0x200000000. Nothing to validate.
|
||
|
pass(id.x);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
let firstIndex = clientParams.data[clientIndex + kFirstIndexEntry];
|
||
|
if (batch.numIndexBufferElementsHigh == 0u &&
|
||
|
batch.numIndexBufferElementsLow < firstIndex) {
|
||
|
fail(id.x);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Note that this subtraction may underflow, but only when
|
||
|
// numIndexBufferElementsHigh is 1u. The result is still correct in that case.
|
||
|
let maxIndexCount = batch.numIndexBufferElementsLow - firstIndex;
|
||
|
let indexCount = clientParams.data[clientIndex + kIndexCountEntry];
|
||
|
if (indexCount > maxIndexCount) {
|
||
|
fail(id.x);
|
||
|
return;
|
||
|
}
|
||
|
pass(id.x);
|
||
|
}
|
||
|
)";
|
||
|
|
||
|
ResultOrError<ComputePipelineBase*> GetOrCreateRenderValidationPipeline(
|
||
|
DeviceBase* device) {
|
||
|
InternalPipelineStore* store = device->GetInternalPipelineStore();
|
||
|
|
||
|
if (store->renderValidationPipeline == nullptr) {
|
||
|
// Create compute shader module if not cached before.
|
||
|
if (store->renderValidationShader == nullptr) {
|
||
|
ShaderModuleDescriptor descriptor;
|
||
|
ShaderModuleWGSLDescriptor wgslDesc;
|
||
|
wgslDesc.source = sRenderValidationShaderSource;
|
||
|
descriptor.nextInChain = reinterpret_cast<ChainedStruct*>(&wgslDesc);
|
||
|
DAWN_TRY_ASSIGN(store->renderValidationShader,
|
||
|
device->CreateShaderModule(&descriptor));
|
||
|
}
|
||
|
|
||
|
BindGroupLayoutEntry entries[3];
|
||
|
entries[0].binding = 0;
|
||
|
entries[0].visibility = wgpu::ShaderStage::Compute;
|
||
|
entries[0].buffer.type = wgpu::BufferBindingType::ReadOnlyStorage;
|
||
|
entries[1].binding = 1;
|
||
|
entries[1].visibility = wgpu::ShaderStage::Compute;
|
||
|
entries[1].buffer.type = kInternalStorageBufferBinding;
|
||
|
entries[2].binding = 2;
|
||
|
entries[2].visibility = wgpu::ShaderStage::Compute;
|
||
|
entries[2].buffer.type = wgpu::BufferBindingType::Storage;
|
||
|
|
||
|
BindGroupLayoutDescriptor bindGroupLayoutDescriptor;
|
||
|
bindGroupLayoutDescriptor.entryCount = 3;
|
||
|
bindGroupLayoutDescriptor.entries = entries;
|
||
|
Ref<BindGroupLayoutBase> bindGroupLayout;
|
||
|
DAWN_TRY_ASSIGN(bindGroupLayout,
|
||
|
device->CreateBindGroupLayout(&bindGroupLayoutDescriptor, true));
|
||
|
|
||
|
PipelineLayoutDescriptor pipelineDescriptor;
|
||
|
pipelineDescriptor.bindGroupLayoutCount = 1;
|
||
|
pipelineDescriptor.bindGroupLayouts = &bindGroupLayout.Get();
|
||
|
Ref<PipelineLayoutBase> pipelineLayout;
|
||
|
DAWN_TRY_ASSIGN(pipelineLayout, device->CreatePipelineLayout(&pipelineDescriptor));
|
||
|
|
||
|
ComputePipelineDescriptor computePipelineDescriptor = {};
|
||
|
computePipelineDescriptor.layout = pipelineLayout.Get();
|
||
|
computePipelineDescriptor.compute.module = store->renderValidationShader.Get();
|
||
|
computePipelineDescriptor.compute.entryPoint = "main";
|
||
|
|
||
|
DAWN_TRY_ASSIGN(store->renderValidationPipeline,
|
||
|
device->CreateComputePipeline(&computePipelineDescriptor));
|
||
|
}
|
||
|
|
||
|
return store->renderValidationPipeline.Get();
|
||
|
}
|
||
|
|
||
|
size_t GetBatchDataSize(uint32_t numDraws) {
|
||
|
return sizeof(BatchInfo) + numDraws * sizeof(uint32_t);
|
||
|
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
const uint32_t kBatchDrawCallLimitByDispatchSize =
|
||
|
kMaxComputePerDimensionDispatchSize * kWorkgroupSize;
|
||
|
const uint32_t kBatchDrawCallLimitByStorageBindingSize =
|
||
|
(kMaxStorageBufferBindingSize - sizeof(BatchInfo)) / sizeof(uint32_t);
|
||
|
const uint32_t kMaxDrawCallsPerIndirectValidationBatch =
|
||
|
std::min(kBatchDrawCallLimitByDispatchSize, kBatchDrawCallLimitByStorageBindingSize);
|
||
|
|
||
|
MaybeError EncodeIndirectDrawValidationCommands(DeviceBase* device,
|
||
|
CommandEncoder* commandEncoder,
|
||
|
RenderPassResourceUsageTracker* usageTracker,
|
||
|
IndirectDrawMetadata* indirectDrawMetadata) {
|
||
|
struct Batch {
|
||
|
const IndirectDrawMetadata::IndexedIndirectValidationBatch* metadata;
|
||
|
uint64_t numIndexBufferElements;
|
||
|
uint64_t dataBufferOffset;
|
||
|
uint64_t dataSize;
|
||
|
uint64_t clientIndirectOffset;
|
||
|
uint64_t clientIndirectSize;
|
||
|
uint64_t validatedParamsOffset;
|
||
|
uint64_t validatedParamsSize;
|
||
|
BatchInfo* batchInfo;
|
||
|
};
|
||
|
|
||
|
struct Pass {
|
||
|
BufferBase* clientIndirectBuffer;
|
||
|
uint64_t validatedParamsSize = 0;
|
||
|
uint64_t batchDataSize = 0;
|
||
|
std::unique_ptr<void, void (*)(void*)> batchData{nullptr, std::free};
|
||
|
std::vector<Batch> batches;
|
||
|
};
|
||
|
|
||
|
// First stage is grouping all batches into passes. We try to pack as many batches into a
|
||
|
// single pass as possible. Batches can be grouped together as long as they're validating
|
||
|
// data from the same indirect buffer, but they may still be split into multiple passes if
|
||
|
// the number of draw calls in a pass would exceed some (very high) upper bound.
|
||
|
uint64_t numTotalDrawCalls = 0;
|
||
|
size_t validatedParamsSize = 0;
|
||
|
std::vector<Pass> passes;
|
||
|
IndirectDrawMetadata::IndexedIndirectBufferValidationInfoMap& bufferInfoMap =
|
||
|
*indirectDrawMetadata->GetIndexedIndirectBufferValidationInfo();
|
||
|
if (bufferInfoMap.empty()) {
|
||
|
return {};
|
||
|
}
|
||
|
|
||
|
for (auto& entry : bufferInfoMap) {
|
||
|
const IndirectDrawMetadata::IndexedIndirectConfig& config = entry.first;
|
||
|
BufferBase* clientIndirectBuffer = config.first;
|
||
|
for (const IndirectDrawMetadata::IndexedIndirectValidationBatch& batch :
|
||
|
entry.second.GetBatches()) {
|
||
|
const uint64_t minOffsetFromAlignedBoundary =
|
||
|
batch.minOffset % kMinStorageBufferOffsetAlignment;
|
||
|
const uint64_t minOffsetAlignedDown =
|
||
|
batch.minOffset - minOffsetFromAlignedBoundary;
|
||
|
|
||
|
Batch newBatch;
|
||
|
newBatch.metadata = &batch;
|
||
|
newBatch.numIndexBufferElements = config.second;
|
||
|
newBatch.dataSize = GetBatchDataSize(batch.draws.size());
|
||
|
newBatch.clientIndirectOffset = minOffsetAlignedDown;
|
||
|
newBatch.clientIndirectSize =
|
||
|
batch.maxOffset + kDrawIndexedIndirectSize - minOffsetAlignedDown;
|
||
|
numTotalDrawCalls += batch.draws.size();
|
||
|
|
||
|
newBatch.validatedParamsSize = batch.draws.size() * kDrawIndexedIndirectSize;
|
||
|
newBatch.validatedParamsOffset =
|
||
|
Align(validatedParamsSize, kMinStorageBufferOffsetAlignment);
|
||
|
validatedParamsSize = newBatch.validatedParamsOffset + newBatch.validatedParamsSize;
|
||
|
if (validatedParamsSize > kMaxStorageBufferBindingSize) {
|
||
|
return DAWN_INTERNAL_ERROR("Too many drawIndexedIndirect calls to validate");
|
||
|
}
|
||
|
|
||
|
Pass* currentPass = passes.empty() ? nullptr : &passes.back();
|
||
|
if (currentPass && currentPass->clientIndirectBuffer == clientIndirectBuffer) {
|
||
|
uint64_t nextBatchDataOffset =
|
||
|
Align(currentPass->batchDataSize, kMinStorageBufferOffsetAlignment);
|
||
|
uint64_t newPassBatchDataSize = nextBatchDataOffset + newBatch.dataSize;
|
||
|
if (newPassBatchDataSize <= kMaxStorageBufferBindingSize) {
|
||
|
// We can fit this batch in the current pass.
|
||
|
newBatch.dataBufferOffset = nextBatchDataOffset;
|
||
|
currentPass->batchDataSize = newPassBatchDataSize;
|
||
|
currentPass->batches.push_back(newBatch);
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// We need to start a new pass for this batch.
|
||
|
newBatch.dataBufferOffset = 0;
|
||
|
|
||
|
Pass newPass;
|
||
|
newPass.clientIndirectBuffer = clientIndirectBuffer;
|
||
|
newPass.batchDataSize = newBatch.dataSize;
|
||
|
newPass.batches.push_back(newBatch);
|
||
|
passes.push_back(std::move(newPass));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
auto* const store = device->GetInternalPipelineStore();
|
||
|
ScratchBuffer& validatedParamsBuffer = store->scratchIndirectStorage;
|
||
|
ScratchBuffer& batchDataBuffer = store->scratchStorage;
|
||
|
|
||
|
uint64_t requiredBatchDataBufferSize = 0;
|
||
|
for (const Pass& pass : passes) {
|
||
|
requiredBatchDataBufferSize = std::max(requiredBatchDataBufferSize, pass.batchDataSize);
|
||
|
}
|
||
|
DAWN_TRY(batchDataBuffer.EnsureCapacity(requiredBatchDataBufferSize));
|
||
|
usageTracker->BufferUsedAs(batchDataBuffer.GetBuffer(), wgpu::BufferUsage::Storage);
|
||
|
|
||
|
DAWN_TRY(validatedParamsBuffer.EnsureCapacity(validatedParamsSize));
|
||
|
usageTracker->BufferUsedAs(validatedParamsBuffer.GetBuffer(), wgpu::BufferUsage::Indirect);
|
||
|
|
||
|
// Now we allocate and populate host-side batch data to be copied to the GPU, and prepare to
|
||
|
// update all DrawIndexedIndirectCmd buffer references.
|
||
|
std::vector<DeferredBufferLocationUpdate> deferredBufferLocationUpdates;
|
||
|
deferredBufferLocationUpdates.reserve(numTotalDrawCalls);
|
||
|
for (Pass& pass : passes) {
|
||
|
// We use std::malloc here because it guarantees maximal scalar alignment.
|
||
|
pass.batchData = {std::malloc(pass.batchDataSize), std::free};
|
||
|
memset(pass.batchData.get(), 0, pass.batchDataSize);
|
||
|
uint8_t* batchData = static_cast<uint8_t*>(pass.batchData.get());
|
||
|
for (Batch& batch : pass.batches) {
|
||
|
batch.batchInfo = new (&batchData[batch.dataBufferOffset]) BatchInfo();
|
||
|
batch.batchInfo->numIndexBufferElements = batch.numIndexBufferElements;
|
||
|
batch.batchInfo->numDraws = static_cast<uint32_t>(batch.metadata->draws.size());
|
||
|
|
||
|
uint32_t* indirectOffsets = reinterpret_cast<uint32_t*>(batch.batchInfo + 1);
|
||
|
uint64_t validatedParamsOffset = batch.validatedParamsOffset;
|
||
|
for (const auto& draw : batch.metadata->draws) {
|
||
|
// The shader uses this to index an array of u32, hence the division by 4 bytes.
|
||
|
*indirectOffsets++ = static_cast<uint32_t>(
|
||
|
(draw.clientBufferOffset - batch.clientIndirectOffset) / 4);
|
||
|
|
||
|
DeferredBufferLocationUpdate deferredUpdate;
|
||
|
deferredUpdate.location = draw.bufferLocation;
|
||
|
deferredUpdate.buffer = validatedParamsBuffer.GetBuffer();
|
||
|
deferredUpdate.offset = validatedParamsOffset;
|
||
|
deferredBufferLocationUpdates.push_back(std::move(deferredUpdate));
|
||
|
|
||
|
validatedParamsOffset += kDrawIndexedIndirectSize;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
ComputePipelineBase* pipeline;
|
||
|
DAWN_TRY_ASSIGN(pipeline, GetOrCreateRenderValidationPipeline(device));
|
||
|
|
||
|
Ref<BindGroupLayoutBase> layout;
|
||
|
DAWN_TRY_ASSIGN(layout, pipeline->GetBindGroupLayout(0));
|
||
|
|
||
|
BindGroupEntry bindings[3];
|
||
|
BindGroupEntry& bufferDataBinding = bindings[0];
|
||
|
bufferDataBinding.binding = 0;
|
||
|
bufferDataBinding.buffer = batchDataBuffer.GetBuffer();
|
||
|
|
||
|
BindGroupEntry& clientIndirectBinding = bindings[1];
|
||
|
clientIndirectBinding.binding = 1;
|
||
|
|
||
|
BindGroupEntry& validatedParamsBinding = bindings[2];
|
||
|
validatedParamsBinding.binding = 2;
|
||
|
validatedParamsBinding.buffer = validatedParamsBuffer.GetBuffer();
|
||
|
|
||
|
BindGroupDescriptor bindGroupDescriptor = {};
|
||
|
bindGroupDescriptor.layout = layout.Get();
|
||
|
bindGroupDescriptor.entryCount = 3;
|
||
|
bindGroupDescriptor.entries = bindings;
|
||
|
|
||
|
// Finally, we can now encode our validation passes. Each pass first does a single
|
||
|
// WriteBuffer to get batch data over to the GPU, followed by a single compute pass. The
|
||
|
// compute pass encodes a separate SetBindGroup and Dispatch command for each batch.
|
||
|
commandEncoder->EncodeSetValidatedBufferLocationsInternal(
|
||
|
std::move(deferredBufferLocationUpdates));
|
||
|
for (const Pass& pass : passes) {
|
||
|
commandEncoder->APIWriteBuffer(batchDataBuffer.GetBuffer(), 0,
|
||
|
static_cast<const uint8_t*>(pass.batchData.get()),
|
||
|
pass.batchDataSize);
|
||
|
|
||
|
// TODO(dawn:723): change to not use AcquireRef for reentrant object creation.
|
||
|
ComputePassDescriptor descriptor = {};
|
||
|
Ref<ComputePassEncoder> passEncoder =
|
||
|
AcquireRef(commandEncoder->APIBeginComputePass(&descriptor));
|
||
|
passEncoder->APISetPipeline(pipeline);
|
||
|
|
||
|
clientIndirectBinding.buffer = pass.clientIndirectBuffer;
|
||
|
|
||
|
for (const Batch& batch : pass.batches) {
|
||
|
bufferDataBinding.offset = batch.dataBufferOffset;
|
||
|
bufferDataBinding.size = batch.dataSize;
|
||
|
clientIndirectBinding.offset = batch.clientIndirectOffset;
|
||
|
clientIndirectBinding.size = batch.clientIndirectSize;
|
||
|
validatedParamsBinding.offset = batch.validatedParamsOffset;
|
||
|
validatedParamsBinding.size = batch.validatedParamsSize;
|
||
|
|
||
|
Ref<BindGroupBase> bindGroup;
|
||
|
DAWN_TRY_ASSIGN(bindGroup, device->CreateBindGroup(&bindGroupDescriptor));
|
||
|
|
||
|
const uint32_t numDrawsRoundedUp =
|
||
|
(batch.batchInfo->numDraws + kWorkgroupSize - 1) / kWorkgroupSize;
|
||
|
passEncoder->APISetBindGroup(0, bindGroup.Get());
|
||
|
passEncoder->APIDispatch(numDrawsRoundedUp);
|
||
|
}
|
||
|
|
||
|
passEncoder->APIEndPass();
|
||
|
}
|
||
|
|
||
|
return {};
|
||
|
}
|
||
|
|
||
|
} // namespace dawn_native
|