// Copyright 2021 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dawn/native/IndirectDrawValidationEncoder.h"

#include <algorithm>
#include <cstdlib>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

#include "dawn/common/Constants.h"
#include "dawn/common/Math.h"
#include "dawn/native/BindGroup.h"
#include "dawn/native/BindGroupLayout.h"
#include "dawn/native/CommandEncoder.h"
#include "dawn/native/ComputePassEncoder.h"
#include "dawn/native/ComputePipeline.h"
#include "dawn/native/Device.h"
#include "dawn/native/InternalPipelineStore.h"
#include "dawn/native/Queue.h"
#include "dawn/native/utils/WGPUHelpers.h"

namespace dawn::native {

namespace {

// NOTE: This must match the workgroup_size attribute on the compute entry point below.
constexpr uint64_t kWorkgroupSize = 64;

// Bitmasks for BatchInfo::flags
constexpr uint32_t kDuplicateBaseVertexInstance = 1;
constexpr uint32_t kIndexedDraw = 2;
constexpr uint32_t kValidationEnabled = 4;

// Equivalent to the BatchInfo struct defined in the shader below.
struct BatchInfo {
    uint64_t numIndexBufferElements;
    uint32_t numDraws;
    uint32_t flags;
};

// TODO(https://crbug.com/dawn/1108): Propagate validation feedback from this shader in
// various failure modes.
static const char sRenderValidationShaderSource[] = R"(
    let kNumDrawIndirectParams = 4u;

    let kIndexCountEntry = 0u;
    let kFirstIndexEntry = 2u;

    // Bitmasks for BatchInfo::flags
    let kDuplicateBaseVertexInstance = 1u;
    let kIndexedDraw = 2u;
    let kValidationEnabled = 4u;

    struct BatchInfo {
        numIndexBufferElementsLow: u32,
        numIndexBufferElementsHigh: u32,
        numDraws: u32,
        flags: u32,
        indirectOffsets: array<u32>,
    }

    struct IndirectParams {
        data: array<u32>,
    }

    @group(0) @binding(0) var<storage, read> batch: BatchInfo;
    @group(0) @binding(1) var<storage, read_write> inputParams: IndirectParams;
    @group(0) @binding(2) var<storage, read_write> outputParams: IndirectParams;

    fn numIndirectParamsPerDrawCallInput() -> u32 {
        var numParams = kNumDrawIndirectParams;
        // An indexed draw has one extra parameter (firstIndex).
        if (bool(batch.flags & kIndexedDraw)) {
            numParams = numParams + 1u;
        }
        return numParams;
    }

    fn numIndirectParamsPerDrawCallOutput() -> u32 {
        var numParams = numIndirectParamsPerDrawCallInput();
        // 2 extra parameters for the duplicated first/baseVertex and firstInstance.
        if (bool(batch.flags & kDuplicateBaseVertexInstance)) {
            numParams = numParams + 2u;
        }
        return numParams;
    }

    fn fail(drawIndex: u32) {
        let numParams = numIndirectParamsPerDrawCallOutput();
        let index = drawIndex * numParams;
        for(var i = 0u; i < numParams; i = i + 1u) {
            outputParams.data[index + i] = 0u;
        }
    }

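    // pass() copies this draw's indirect parameters from the input buffer to its slot in the
    // output buffer. When duplication is requested, the first/baseVertex and firstInstance
    // values are additionally written ahead of the copied parameters.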
    fn pass(drawIndex: u32) {
        let numInputParams = numIndirectParamsPerDrawCallInput();
        var outIndex = drawIndex * numIndirectParamsPerDrawCallOutput();
        let inIndex = batch.indirectOffsets[drawIndex];

        // The first 2 parameters are reserved for the duplicated first/baseVertex and firstInstance.
        if (bool(batch.flags & kDuplicateBaseVertexInstance)) {
            // first/baseVertex and firstInstance are always the last two input parameters.
            let dupIndex = inIndex + numInputParams - 2u;
            outputParams.data[outIndex] = inputParams.data[dupIndex];
            outputParams.data[outIndex + 1u] = inputParams.data[dupIndex + 1u];

            outIndex = outIndex + 2u;
        }

        for(var i = 0u; i < numInputParams; i = i + 1u) {
            outputParams.data[outIndex + i] = inputParams.data[inIndex + i];
        }
    }

    @stage(compute) @workgroup_size(64, 1, 1)
    fn main(@builtin(global_invocation_id) id : vec3<u32>) {
        if (id.x >= batch.numDraws) {
            return;
        }

        if(!bool(batch.flags & kValidationEnabled)) {
            pass(id.x);
            return;
        }

        let inputIndex = batch.indirectOffsets[id.x];

        // firstInstance is always the last parameter.
        let firstInstance = inputParams.data[inputIndex + numIndirectParamsPerDrawCallInput() - 1u];
        if (firstInstance != 0u) {
            fail(id.x);
            return;
        }

        if (!bool(batch.flags & kIndexedDraw)) {
            pass(id.x);
            return;
        }

        if (batch.numIndexBufferElementsHigh >= 2u) {
            // firstIndex and indexCount are both u32. The maximum possible sum of these
            // values is 0x1fffffffe, which is less than 0x200000000. Nothing to validate.
            pass(id.x);
            return;
        }

        let firstIndex = inputParams.data[inputIndex + kFirstIndexEntry];
        if (batch.numIndexBufferElementsHigh == 0u &&
            batch.numIndexBufferElementsLow < firstIndex) {
            fail(id.x);
            return;
        }

        // Note that this subtraction may underflow, but only when
        // numIndexBufferElementsHigh is 1u. The result is still correct in that case.
        let maxIndexCount = batch.numIndexBufferElementsLow - firstIndex;
        let indexCount = inputParams.data[inputIndex + kIndexCountEntry];
        if (indexCount > maxIndexCount) {
            fail(id.x);
            return;
        }
        pass(id.x);
    }
)";

ResultOrError<ComputePipelineBase*> GetOrCreateRenderValidationPipeline(DeviceBase* device) {
    InternalPipelineStore* store = device->GetInternalPipelineStore();

    if (store->renderValidationPipeline == nullptr) {
        // Create the compute shader module if it has not been cached before.
        if (store->renderValidationShader == nullptr) {
            DAWN_TRY_ASSIGN(store->renderValidationShader,
                            utils::CreateShaderModule(device, sRenderValidationShaderSource));
        }

        Ref<BindGroupLayoutBase> bindGroupLayout;
        DAWN_TRY_ASSIGN(
            bindGroupLayout,
            utils::MakeBindGroupLayout(
                device,
                {
                    {0, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::ReadOnlyStorage},
                    {1, wgpu::ShaderStage::Compute, kInternalStorageBufferBinding},
                    {2, wgpu::ShaderStage::Compute, wgpu::BufferBindingType::Storage},
                },
                /* allowInternalBinding */ true));

        Ref<PipelineLayoutBase> pipelineLayout;
        DAWN_TRY_ASSIGN(pipelineLayout, utils::MakeBasicPipelineLayout(device, bindGroupLayout));

        ComputePipelineDescriptor computePipelineDescriptor = {};
        computePipelineDescriptor.layout = pipelineLayout.Get();
        computePipelineDescriptor.compute.module = store->renderValidationShader.Get();
        computePipelineDescriptor.compute.entryPoint = "main";

        DAWN_TRY_ASSIGN(store->renderValidationPipeline,
                        device->CreateComputePipeline(&computePipelineDescriptor));
    }

    return store->renderValidationPipeline.Get();
}

size_t GetBatchDataSize(uint32_t numDraws) {
    // A BatchInfo header followed by one u32 indirect offset per draw.
    return sizeof(BatchInfo) + numDraws * sizeof(uint32_t);
}

}  // namespace

uint32_t ComputeMaxDrawCallsPerIndirectValidationBatch(const CombinedLimits& limits) {
    const uint64_t batchDrawCallLimitByDispatchSize =
        static_cast<uint64_t>(limits.v1.maxComputeWorkgroupsPerDimension) * kWorkgroupSize;
    const uint64_t batchDrawCallLimitByStorageBindingSize =
        (limits.v1.maxStorageBufferBindingSize - sizeof(BatchInfo)) / sizeof(uint32_t);
    return static_cast<uint32_t>(
        std::min({batchDrawCallLimitByDispatchSize, batchDrawCallLimitByStorageBindingSize,
                  uint64_t(std::numeric_limits<uint32_t>::max())}));
}

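// EncodeIndirectDrawValidationCommands proceeds in three stages: (1) group the metadata's
// validation batches into passes, one input indirect buffer per pass; (2) allocate scratch
// buffers and populate host-side BatchInfo data, pointing each draw command at its slot in the
// output params buffer; (3) for each pass, encode a WriteBuffer of the batch data followed by a
// compute pass that dispatches the validation shader once per batch.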
MaybeError EncodeIndirectDrawValidationCommands(DeviceBase* device,
                                                CommandEncoder* commandEncoder,
                                                RenderPassResourceUsageTracker* usageTracker,
                                                IndirectDrawMetadata* indirectDrawMetadata) {
    struct Batch {
        const IndirectDrawMetadata::IndirectValidationBatch* metadata;
        uint64_t numIndexBufferElements;
        uint64_t dataBufferOffset;
        uint64_t dataSize;
        uint64_t inputIndirectOffset;
        uint64_t inputIndirectSize;
        uint64_t outputParamsOffset;
        uint64_t outputParamsSize;
        BatchInfo* batchInfo;
    };

    struct Pass {
        uint32_t flags;
        BufferBase* inputIndirectBuffer;
        uint64_t outputParamsSize = 0;
        uint64_t batchDataSize = 0;
        std::unique_ptr<void, void (*)(void*)> batchData{nullptr, std::free};
        std::vector<Batch> batches;
    };

    // First stage is grouping all batches into passes. We try to pack as many batches into a
    // single pass as possible. Batches can be grouped together as long as they're validating
    // data from the same indirect buffer, but they may still be split into multiple passes if
    // the number of draw calls in a pass would exceed some (very high) upper bound.
    uint64_t outputParamsSize = 0;
    std::vector<Pass> passes;
    IndirectDrawMetadata::IndexedIndirectBufferValidationInfoMap& bufferInfoMap =
        *indirectDrawMetadata->GetIndexedIndirectBufferValidationInfo();
    if (bufferInfoMap.empty()) {
        return {};
    }

    const uint64_t maxStorageBufferBindingSize =
        device->GetLimits().v1.maxStorageBufferBindingSize;
    const uint32_t minStorageBufferOffsetAlignment =
        device->GetLimits().v1.minStorageBufferOffsetAlignment;

    for (auto& [config, validationInfo] : bufferInfoMap) {
        const uint64_t indirectDrawCommandSize =
            config.drawType == IndirectDrawMetadata::DrawType::Indexed ? kDrawIndexedIndirectSize
                                                                       : kDrawIndirectSize;

        uint64_t outputIndirectSize = indirectDrawCommandSize;
        if (config.duplicateBaseVertexInstance) {
            outputIndirectSize += 2 * sizeof(uint32_t);
        }

        for (const IndirectDrawMetadata::IndirectValidationBatch& batch :
             validationInfo.GetBatches()) {
            const uint64_t minOffsetFromAlignedBoundary =
                batch.minOffset % minStorageBufferOffsetAlignment;
            const uint64_t minOffsetAlignedDown = batch.minOffset - minOffsetFromAlignedBoundary;

            Batch newBatch;
            newBatch.metadata = &batch;
            newBatch.numIndexBufferElements = config.numIndexBufferElements;
            newBatch.dataSize = GetBatchDataSize(batch.draws.size());
            newBatch.inputIndirectOffset = minOffsetAlignedDown;
            newBatch.inputIndirectSize =
                batch.maxOffset + indirectDrawCommandSize - minOffsetAlignedDown;

            newBatch.outputParamsSize = batch.draws.size() * outputIndirectSize;
            newBatch.outputParamsOffset = Align(outputParamsSize, minStorageBufferOffsetAlignment);
            outputParamsSize = newBatch.outputParamsOffset + newBatch.outputParamsSize;
            if (outputParamsSize > maxStorageBufferBindingSize) {
                return DAWN_INTERNAL_ERROR("Too many drawIndexedIndirect calls to validate");
            }

            Pass* currentPass = passes.empty() ? nullptr : &passes.back();
            if (currentPass && currentPass->inputIndirectBuffer == config.inputIndirectBuffer) {
                uint64_t nextBatchDataOffset =
                    Align(currentPass->batchDataSize, minStorageBufferOffsetAlignment);
                uint64_t newPassBatchDataSize = nextBatchDataOffset + newBatch.dataSize;
                if (newPassBatchDataSize <= maxStorageBufferBindingSize) {
                    // We can fit this batch in the current pass.
                    newBatch.dataBufferOffset = nextBatchDataOffset;
                    currentPass->batchDataSize = newPassBatchDataSize;
                    currentPass->batches.push_back(newBatch);
                    continue;
                }
            }

            // We need to start a new pass for this batch.
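            // This happens when there is no current pass, the input indirect buffer changed, or
            // appending this batch's data to the current pass would exceed
            // maxStorageBufferBindingSize.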
            newBatch.dataBufferOffset = 0;

            Pass newPass{};
            newPass.inputIndirectBuffer = config.inputIndirectBuffer;
            newPass.batchDataSize = newBatch.dataSize;
            newPass.batches.push_back(newBatch);

            newPass.flags = 0;
            if (config.duplicateBaseVertexInstance) {
                newPass.flags |= kDuplicateBaseVertexInstance;
            }
            if (config.drawType == IndirectDrawMetadata::DrawType::Indexed) {
                newPass.flags |= kIndexedDraw;
            }
            if (device->IsValidationEnabled()) {
                newPass.flags |= kValidationEnabled;
            }

            passes.push_back(std::move(newPass));
        }
    }

    auto* const store = device->GetInternalPipelineStore();
    ScratchBuffer& outputParamsBuffer = store->scratchIndirectStorage;
    ScratchBuffer& batchDataBuffer = store->scratchStorage;

    uint64_t requiredBatchDataBufferSize = 0;
    for (const Pass& pass : passes) {
        requiredBatchDataBufferSize = std::max(requiredBatchDataBufferSize, pass.batchDataSize);
    }
    DAWN_TRY(batchDataBuffer.EnsureCapacity(requiredBatchDataBufferSize));
    usageTracker->BufferUsedAs(batchDataBuffer.GetBuffer(), wgpu::BufferUsage::Storage);

    DAWN_TRY(outputParamsBuffer.EnsureCapacity(outputParamsSize));
    usageTracker->BufferUsedAs(outputParamsBuffer.GetBuffer(), wgpu::BufferUsage::Indirect);

    // Now we allocate and populate host-side batch data to be copied to the GPU.
    for (Pass& pass : passes) {
        // We use std::malloc here because it guarantees maximal scalar alignment.
        pass.batchData = {std::malloc(pass.batchDataSize), std::free};
        memset(pass.batchData.get(), 0, pass.batchDataSize);
        uint8_t* batchData = static_cast<uint8_t*>(pass.batchData.get());
        for (Batch& batch : pass.batches) {
            // Placement-new a BatchInfo header at the batch's offset; the per-draw u32 indirect
            // offsets follow it (see GetBatchDataSize).
            batch.batchInfo = new (&batchData[batch.dataBufferOffset]) BatchInfo();
            batch.batchInfo->numIndexBufferElements = batch.numIndexBufferElements;
            batch.batchInfo->numDraws = static_cast<uint32_t>(batch.metadata->draws.size());
            batch.batchInfo->flags = pass.flags;

            uint32_t* indirectOffsets = reinterpret_cast<uint32_t*>(batch.batchInfo + 1);
            uint64_t outputParamsOffset = batch.outputParamsOffset;
            for (auto& draw : batch.metadata->draws) {
                // The shader uses this to index an array of u32, hence the division by 4 bytes.
                *indirectOffsets++ = static_cast<uint32_t>(
                    (draw.inputBufferOffset - batch.inputIndirectOffset) / 4);

                draw.cmd->indirectBuffer = outputParamsBuffer.GetBuffer();
                draw.cmd->indirectOffset = outputParamsOffset;
                if (pass.flags & kIndexedDraw) {
                    outputParamsOffset += kDrawIndexedIndirectSize;
                } else {
                    outputParamsOffset += kDrawIndirectSize;
                }
            }
        }
    }

    ComputePipelineBase* pipeline;
    DAWN_TRY_ASSIGN(pipeline, GetOrCreateRenderValidationPipeline(device));

    Ref<BindGroupLayoutBase> layout;
    DAWN_TRY_ASSIGN(layout, pipeline->GetBindGroupLayout(0));

    BindGroupEntry bindings[3];
    BindGroupEntry& bufferDataBinding = bindings[0];
    bufferDataBinding.binding = 0;
    bufferDataBinding.buffer = batchDataBuffer.GetBuffer();

    BindGroupEntry& inputIndirectBinding = bindings[1];
    inputIndirectBinding.binding = 1;

    BindGroupEntry& outputParamsBinding = bindings[2];
    outputParamsBinding.binding = 2;
    outputParamsBinding.buffer = outputParamsBuffer.GetBuffer();

    BindGroupDescriptor bindGroupDescriptor = {};
    bindGroupDescriptor.layout = layout.Get();
    bindGroupDescriptor.entryCount = 3;
    bindGroupDescriptor.entries = bindings;

    // Finally, we can now encode our validation and duplication passes. Each pass first does a
    // single WriteBuffer to get its batch data over to the GPU, followed by a single compute
    // pass. The compute pass encodes a separate SetBindGroup and Dispatch command for each
    // batch.
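    // Each batch gets its own bind group so that every dispatch only sees that batch's region
    // of the batch-data, input-indirect, and output-params buffers.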
    for (const Pass& pass : passes) {
        commandEncoder->APIWriteBuffer(batchDataBuffer.GetBuffer(), 0,
                                       static_cast<const uint8_t*>(pass.batchData.get()),
                                       pass.batchDataSize);

        Ref<ComputePassEncoder> passEncoder = commandEncoder->BeginComputePass();
        passEncoder->APISetPipeline(pipeline);

        inputIndirectBinding.buffer = pass.inputIndirectBuffer;

        for (const Batch& batch : pass.batches) {
            bufferDataBinding.offset = batch.dataBufferOffset;
            bufferDataBinding.size = batch.dataSize;
            inputIndirectBinding.offset = batch.inputIndirectOffset;
            inputIndirectBinding.size = batch.inputIndirectSize;
            outputParamsBinding.offset = batch.outputParamsOffset;
            outputParamsBinding.size = batch.outputParamsSize;

            Ref<BindGroupBase> bindGroup;
            DAWN_TRY_ASSIGN(bindGroup, device->CreateBindGroup(&bindGroupDescriptor));

            const uint32_t numDrawsRoundedUp =
                (batch.batchInfo->numDraws + kWorkgroupSize - 1) / kWorkgroupSize;
            passEncoder->APISetBindGroup(0, bindGroup.Get());
            passEncoder->APIDispatchWorkgroups(numDrawsRoundedUp);
        }

        passEncoder->APIEnd();
    }

    return {};
}

}  // namespace dawn::native