// Copyright 2021 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dawn_native/IndirectDrawValidationEncoder.h"

#include "common/Constants.h"
#include "common/Math.h"
#include "dawn_native/BindGroup.h"
#include "dawn_native/BindGroupLayout.h"
#include "dawn_native/CommandEncoder.h"
#include "dawn_native/ComputePassEncoder.h"
#include "dawn_native/ComputePipeline.h"
#include "dawn_native/Device.h"
#include "dawn_native/InternalPipelineStore.h"
#include "dawn_native/Queue.h"

#include <algorithm>
#include <cstdlib>

namespace dawn_native {

    namespace {
        // NOTE: This must match the workgroup_size attribute on the compute entry point below.
        constexpr uint64_t kWorkgroupSize = 64;

        // Equivalent to the BatchInfo struct defined in the shader below.
        struct BatchInfo {
            uint64_t numIndexBufferElements;
            uint32_t numDraws;
            uint32_t padding;
        };

        // TODO(https://crbug.com/dawn/1108): Propagate validation feedback from this shader in
        // various failure modes.
        static const char sRenderValidationShaderSource[] = R"(
            let kNumIndirectParamsPerDrawCall = 5u;

            let kIndexCountEntry = 0u;
            let kInstanceCountEntry = 1u;
            let kFirstIndexEntry = 2u;
            let kBaseVertexEntry = 3u;
            let kFirstInstanceEntry = 4u;

            [[block]] struct BatchInfo {
                numIndexBufferElementsLow: u32;
                numIndexBufferElementsHigh: u32;
                numDraws: u32;
                padding: u32;
                indirectOffsets: array<u32>;
            };

            [[block]] struct IndirectParams {
                data: array<u32>;
            };

            [[group(0), binding(0)]] var<storage, read> batch: BatchInfo;
            [[group(0), binding(1)]] var<storage, read> clientParams: IndirectParams;
            [[group(0), binding(2)]] var<storage, write> validatedParams: IndirectParams;

            fn fail(drawIndex: u32) {
                let index = drawIndex * kNumIndirectParamsPerDrawCall;
                validatedParams.data[index + kIndexCountEntry] = 0u;
                validatedParams.data[index + kInstanceCountEntry] = 0u;
                validatedParams.data[index + kFirstIndexEntry] = 0u;
                validatedParams.data[index + kBaseVertexEntry] = 0u;
                validatedParams.data[index + kFirstInstanceEntry] = 0u;
            }

            fn pass(drawIndex: u32) {
                let vIndex = drawIndex * kNumIndirectParamsPerDrawCall;
                let cIndex = batch.indirectOffsets[drawIndex];
                validatedParams.data[vIndex + kIndexCountEntry] =
                    clientParams.data[cIndex + kIndexCountEntry];
                validatedParams.data[vIndex + kInstanceCountEntry] =
                    clientParams.data[cIndex + kInstanceCountEntry];
                validatedParams.data[vIndex + kFirstIndexEntry] =
                    clientParams.data[cIndex + kFirstIndexEntry];
                validatedParams.data[vIndex + kBaseVertexEntry] =
                    clientParams.data[cIndex + kBaseVertexEntry];
                validatedParams.data[vIndex + kFirstInstanceEntry] =
                    clientParams.data[cIndex + kFirstInstanceEntry];
            }

            [[stage(compute), workgroup_size(64, 1, 1)]]
            fn main([[builtin(global_invocation_id)]] id : vec3<u32>) {
                if (id.x >= batch.numDraws) {
                    return;
                }

                let clientIndex = batch.indirectOffsets[id.x];
                let firstInstance = clientParams.data[clientIndex + kFirstInstanceEntry];
                if (firstInstance != 0u) {
                    fail(id.x);
                    return;
                }
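
                // The index buffer element count is a 64-bit value on the CPU side, but WGSL has
                // no 64-bit integers, so BatchInfo carries it as two u32 words (low/high). The
                // checks below validate firstIndex + indexCount against that split value.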
                if (batch.numIndexBufferElementsHigh >= 2u) {
                    // firstIndex and indexCount are both u32. The maximum possible sum of these
                    // values is 0x1fffffffe, which is less than 0x200000000. Nothing to validate.
                    pass(id.x);
                    return;
                }

                let firstIndex = clientParams.data[clientIndex + kFirstIndexEntry];
                if (batch.numIndexBufferElementsHigh == 0u &&
                    batch.numIndexBufferElementsLow < firstIndex) {
                    fail(id.x);
                    return;
                }

                // Note that this subtraction may underflow, but only when
                // numIndexBufferElementsHigh is 1u. The result is still correct in that case.
                let maxIndexCount = batch.numIndexBufferElementsLow - firstIndex;
                let indexCount = clientParams.data[clientIndex + kIndexCountEntry];
                if (indexCount > maxIndexCount) {
                    fail(id.x);
                    return;
                }
                pass(id.x);
            }
        )";

        ResultOrError<ComputePipelineBase*> GetOrCreateRenderValidationPipeline(
            DeviceBase* device) {
            InternalPipelineStore* store = device->GetInternalPipelineStore();

            if (store->renderValidationPipeline == nullptr) {
                // Create compute shader module if not cached before.
                if (store->renderValidationShader == nullptr) {
                    ShaderModuleDescriptor descriptor;
                    ShaderModuleWGSLDescriptor wgslDesc;
                    wgslDesc.source = sRenderValidationShaderSource;
                    descriptor.nextInChain = reinterpret_cast<ChainedStruct*>(&wgslDesc);
                    DAWN_TRY_ASSIGN(store->renderValidationShader,
                                    device->CreateShaderModule(&descriptor));
                }

                BindGroupLayoutEntry entries[3];
                entries[0].binding = 0;
                entries[0].visibility = wgpu::ShaderStage::Compute;
                entries[0].buffer.type = wgpu::BufferBindingType::ReadOnlyStorage;
                entries[1].binding = 1;
                entries[1].visibility = wgpu::ShaderStage::Compute;
                entries[1].buffer.type = kInternalStorageBufferBinding;
                entries[2].binding = 2;
                entries[2].visibility = wgpu::ShaderStage::Compute;
                entries[2].buffer.type = wgpu::BufferBindingType::Storage;

                BindGroupLayoutDescriptor bindGroupLayoutDescriptor;
                bindGroupLayoutDescriptor.entryCount = 3;
                bindGroupLayoutDescriptor.entries = entries;
                Ref<BindGroupLayoutBase> bindGroupLayout;
                DAWN_TRY_ASSIGN(bindGroupLayout,
                                device->CreateBindGroupLayout(&bindGroupLayoutDescriptor, true));

                PipelineLayoutDescriptor pipelineDescriptor;
                pipelineDescriptor.bindGroupLayoutCount = 1;
                pipelineDescriptor.bindGroupLayouts = &bindGroupLayout.Get();
                Ref<PipelineLayoutBase> pipelineLayout;
                DAWN_TRY_ASSIGN(pipelineLayout, device->CreatePipelineLayout(&pipelineDescriptor));

                ComputePipelineDescriptor computePipelineDescriptor = {};
                computePipelineDescriptor.layout = pipelineLayout.Get();
                computePipelineDescriptor.compute.module = store->renderValidationShader.Get();
                computePipelineDescriptor.compute.entryPoint = "main";

                DAWN_TRY_ASSIGN(store->renderValidationPipeline,
                                device->CreateComputePipeline(&computePipelineDescriptor));
            }

            return store->renderValidationPipeline.Get();
        }

        // The GPU-side batch data is a BatchInfo header followed by one u32 indirect offset per
        // draw, matching the runtime-sized indirectOffsets array in the shader's BatchInfo.
        size_t GetBatchDataSize(uint32_t numDraws) {
            return sizeof(BatchInfo) + numDraws * sizeof(uint32_t);
        }

    }  // namespace

    const uint32_t kBatchDrawCallLimitByDispatchSize =
        kMaxComputePerDimensionDispatchSize * kWorkgroupSize;
    const uint32_t kBatchDrawCallLimitByStorageBindingSize =
        (kMaxStorageBufferBindingSize - sizeof(BatchInfo)) / sizeof(uint32_t);
    const uint32_t kMaxDrawCallsPerIndirectValidationBatch =
        std::min(kBatchDrawCallLimitByDispatchSize, kBatchDrawCallLimitByStorageBindingSize);

    MaybeError EncodeIndirectDrawValidationCommands(DeviceBase* device,
                                                    CommandEncoder* commandEncoder,
                                                    RenderPassResourceUsageTracker* usageTracker,
                                                    IndirectDrawMetadata* indirectDrawMetadata) {
        struct Batch {
            const IndirectDrawMetadata::IndexedIndirectValidationBatch* metadata;
            uint64_t numIndexBufferElements;
            uint64_t dataBufferOffset;
            uint64_t dataSize;

            uint64_t clientIndirectOffset;
            uint64_t clientIndirectSize;

            uint64_t validatedParamsOffset;
            uint64_t validatedParamsSize;

            BatchInfo* batchInfo;
        };
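
        // Each Pass groups batches that read their indirect parameters from the same client
        // buffer, so they can share a single upload of batch data and a single compute pass.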
        struct Pass {
            BufferBase* clientIndirectBuffer;
            uint64_t validatedParamsSize = 0;
            uint64_t batchDataSize = 0;
            std::unique_ptr<void, void (*)(void*)> batchData{nullptr, std::free};
            std::vector<Batch> batches;
        };

        // First stage is grouping all batches into passes. We try to pack as many batches into a
        // single pass as possible. Batches can be grouped together as long as they're validating
        // data from the same indirect buffer, but they may still be split into multiple passes if
        // the number of draw calls in a pass would exceed some (very high) upper bound.
        uint64_t numTotalDrawCalls = 0;
        size_t validatedParamsSize = 0;
        std::vector<Pass> passes;
        IndirectDrawMetadata::IndexedIndirectBufferValidationInfoMap& bufferInfoMap =
            *indirectDrawMetadata->GetIndexedIndirectBufferValidationInfo();
        if (bufferInfoMap.empty()) {
            return {};
        }

        for (auto& entry : bufferInfoMap) {
            const IndirectDrawMetadata::IndexedIndirectConfig& config = entry.first;
            BufferBase* clientIndirectBuffer = config.first;
            for (const IndirectDrawMetadata::IndexedIndirectValidationBatch& batch :
                 entry.second.GetBatches()) {
                const uint64_t minOffsetFromAlignedBoundary =
                    batch.minOffset % kMinStorageBufferOffsetAlignment;
                const uint64_t minOffsetAlignedDown =
                    batch.minOffset - minOffsetFromAlignedBoundary;

                Batch newBatch;
                newBatch.metadata = &batch;
                newBatch.numIndexBufferElements = config.second;
                newBatch.dataSize = GetBatchDataSize(batch.draws.size());
                newBatch.clientIndirectOffset = minOffsetAlignedDown;
                newBatch.clientIndirectSize =
                    batch.maxOffset + kDrawIndexedIndirectSize - minOffsetAlignedDown;

                numTotalDrawCalls += batch.draws.size();

                newBatch.validatedParamsSize = batch.draws.size() * kDrawIndexedIndirectSize;
                newBatch.validatedParamsOffset =
                    Align(validatedParamsSize, kMinStorageBufferOffsetAlignment);
                validatedParamsSize = newBatch.validatedParamsOffset + newBatch.validatedParamsSize;
                if (validatedParamsSize > kMaxStorageBufferBindingSize) {
                    return DAWN_INTERNAL_ERROR("Too many drawIndexedIndirect calls to validate");
                }
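
                // Append this batch to the current pass if it reads from the same client
                // indirect buffer and its batch data still fits under the storage binding size
                // limit; otherwise start a new pass below.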
                Pass* currentPass = passes.empty() ? nullptr : &passes.back();
                if (currentPass && currentPass->clientIndirectBuffer == clientIndirectBuffer) {
                    uint64_t nextBatchDataOffset =
                        Align(currentPass->batchDataSize, kMinStorageBufferOffsetAlignment);
                    uint64_t newPassBatchDataSize = nextBatchDataOffset + newBatch.dataSize;
                    if (newPassBatchDataSize <= kMaxStorageBufferBindingSize) {
                        // We can fit this batch in the current pass.
                        newBatch.dataBufferOffset = nextBatchDataOffset;
                        currentPass->batchDataSize = newPassBatchDataSize;
                        currentPass->batches.push_back(newBatch);
                        continue;
                    }
                }

                // We need to start a new pass for this batch.
                newBatch.dataBufferOffset = 0;

                Pass newPass;
                newPass.clientIndirectBuffer = clientIndirectBuffer;
                newPass.batchDataSize = newBatch.dataSize;
                newPass.batches.push_back(newBatch);
                passes.push_back(std::move(newPass));
            }
        }

        auto* const store = device->GetInternalPipelineStore();
        ScratchBuffer& validatedParamsBuffer = store->scratchIndirectStorage;
        ScratchBuffer& batchDataBuffer = store->scratchStorage;

        uint64_t requiredBatchDataBufferSize = 0;
        for (const Pass& pass : passes) {
            requiredBatchDataBufferSize = std::max(requiredBatchDataBufferSize, pass.batchDataSize);
        }
        DAWN_TRY(batchDataBuffer.EnsureCapacity(requiredBatchDataBufferSize));
        usageTracker->BufferUsedAs(batchDataBuffer.GetBuffer(), wgpu::BufferUsage::Storage);

        DAWN_TRY(validatedParamsBuffer.EnsureCapacity(validatedParamsSize));
        usageTracker->BufferUsedAs(validatedParamsBuffer.GetBuffer(), wgpu::BufferUsage::Indirect);

        // Now we allocate and populate host-side batch data to be copied to the GPU, and prepare
        // to update all DrawIndexedIndirectCmd buffer references.
        std::vector<DeferredBufferLocationUpdate> deferredBufferLocationUpdates;
        deferredBufferLocationUpdates.reserve(numTotalDrawCalls);
        for (Pass& pass : passes) {
            // We use std::malloc here because it guarantees maximal scalar alignment.
            pass.batchData = {std::malloc(pass.batchDataSize), std::free};
            memset(pass.batchData.get(), 0, pass.batchDataSize);
            uint8_t* batchData = static_cast<uint8_t*>(pass.batchData.get());
            for (Batch& batch : pass.batches) {
                batch.batchInfo = new (&batchData[batch.dataBufferOffset]) BatchInfo();
                batch.batchInfo->numIndexBufferElements = batch.numIndexBufferElements;
                batch.batchInfo->numDraws = static_cast<uint32_t>(batch.metadata->draws.size());

                uint32_t* indirectOffsets = reinterpret_cast<uint32_t*>(batch.batchInfo + 1);
                uint64_t validatedParamsOffset = batch.validatedParamsOffset;
                for (const auto& draw : batch.metadata->draws) {
                    // The shader uses this to index an array of u32, hence the division by 4
                    // bytes.
                    *indirectOffsets++ = static_cast<uint32_t>(
                        (draw.clientBufferOffset - batch.clientIndirectOffset) / 4);

                    DeferredBufferLocationUpdate deferredUpdate;
                    deferredUpdate.location = draw.bufferLocation;
                    deferredUpdate.buffer = validatedParamsBuffer.GetBuffer();
                    deferredUpdate.offset = validatedParamsOffset;
                    deferredBufferLocationUpdates.push_back(std::move(deferredUpdate));

                    validatedParamsOffset += kDrawIndexedIndirectSize;
                }
            }
        }

        ComputePipelineBase* pipeline;
        DAWN_TRY_ASSIGN(pipeline, GetOrCreateRenderValidationPipeline(device));

        Ref<BindGroupLayoutBase> layout;
        DAWN_TRY_ASSIGN(layout, pipeline->GetBindGroupLayout(0));

        BindGroupEntry bindings[3];
        BindGroupEntry& bufferDataBinding = bindings[0];
        bufferDataBinding.binding = 0;
        bufferDataBinding.buffer = batchDataBuffer.GetBuffer();

        BindGroupEntry& clientIndirectBinding = bindings[1];
        clientIndirectBinding.binding = 1;

        BindGroupEntry& validatedParamsBinding = bindings[2];
        validatedParamsBinding.binding = 2;
        validatedParamsBinding.buffer = validatedParamsBuffer.GetBuffer();

        BindGroupDescriptor bindGroupDescriptor = {};
        bindGroupDescriptor.layout = layout.Get();
        bindGroupDescriptor.entryCount = 3;
        bindGroupDescriptor.entries = bindings;

        // Finally, we can now encode our validation passes. Each pass first does a single
        // WriteBuffer to get batch data over to the GPU, followed by a single compute pass. The
        // compute pass encodes a separate SetBindGroup and Dispatch command for each batch.
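
        // The deferred updates recorded above repoint each DrawIndexedIndirectCmd at its
        // validated parameters in the scratch indirect buffer.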
        commandEncoder->EncodeSetValidatedBufferLocationsInternal(
            std::move(deferredBufferLocationUpdates));

        for (const Pass& pass : passes) {
            commandEncoder->APIWriteBuffer(batchDataBuffer.GetBuffer(), 0,
                                           static_cast<const uint8_t*>(pass.batchData.get()),
                                           pass.batchDataSize);

            // TODO(dawn:723): change to not use AcquireRef for reentrant object creation.
            ComputePassDescriptor descriptor = {};
            Ref<ComputePassEncoder> passEncoder =
                AcquireRef(commandEncoder->APIBeginComputePass(&descriptor));
            passEncoder->APISetPipeline(pipeline);

            clientIndirectBinding.buffer = pass.clientIndirectBuffer;

            for (const Batch& batch : pass.batches) {
                bufferDataBinding.offset = batch.dataBufferOffset;
                bufferDataBinding.size = batch.dataSize;
                clientIndirectBinding.offset = batch.clientIndirectOffset;
                clientIndirectBinding.size = batch.clientIndirectSize;
                validatedParamsBinding.offset = batch.validatedParamsOffset;
                validatedParamsBinding.size = batch.validatedParamsSize;

                Ref<BindGroupBase> bindGroup;
                DAWN_TRY_ASSIGN(bindGroup, device->CreateBindGroup(&bindGroupDescriptor));

                // Each workgroup validates kWorkgroupSize draws, so round the dispatch up.
                const uint32_t numDrawsRoundedUp =
                    (batch.batchInfo->numDraws + kWorkgroupSize - 1) / kWorkgroupSize;
                passEncoder->APISetBindGroup(0, bindGroup.Get());
                passEncoder->APIDispatch(numDrawsRoundedUp);
            }

            passEncoder->APIEndPass();
        }

        return {};
    }

}  // namespace dawn_native