// Copyright 2019 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tests/DawnTest.h"

#include "utils/WGPUHelpers.h"

class ComputeStorageBufferBarrierTests : public DawnTest {
  protected:
    static constexpr uint32_t kNumValues = 100;
    static constexpr uint32_t kIterations = 100;
};

// Test that multiple dispatches to increment values in a storage buffer are synchronized.
TEST_P(ComputeStorageBufferBarrierTests, AddIncrement) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expected(kNumValues, 0x1234 * kIterations);

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));
    wgpu::Buffer buffer = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<u32, 100>;
        };

        [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

        [[stage(compute), workgroup_size(1)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            buf.data[GlobalInvocationID.x] = buf.data[GlobalInvocationID.x] + 0x1234u;
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                     {{0, buffer, 0, bufferSize}});

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    pass.SetPipeline(pipeline);
    pass.SetBindGroup(0, bindGroup);

    // Every dispatch reads and writes the same buffer, so each dispatch must be synchronized
    // with the previous one.
    for (uint32_t i = 0; i < kIterations; ++i) {
        pass.Dispatch(kNumValues);
    }
    pass.EndPass();

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expected.data(), buffer, 0, kNumValues);
}

// Test that multiple dispatches to increment values by ping-ponging between two storage buffers
// are synchronized.
TEST_P(ComputeStorageBufferBarrierTests, AddPingPong) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        // TODO(crbug.com/tint/386): Use the same struct.
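        // Src and Dst have identical layouts; they are declared as two structs only to work
        // around the Tint bug referenced above.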
        [[block]] struct Src {
            data : array<u32, 100>;
        };

        [[block]] struct Dst {
            data : array<u32, 100>;
        };

        [[group(0), binding(0)]] var<storage, read_write> src : Src;
        [[group(0), binding(1)]] var<storage, read_write> dst : Dst;

        [[stage(compute), workgroup_size(1)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    pass.SetPipeline(pipeline);

    // Alternate the source and destination buffers every dispatch, all within a single pass.
    for (uint32_t i = 0; i < kIterations / 2; ++i) {
        pass.SetBindGroup(0, bindGroups[0]);
        pass.Dispatch(kNumValues);
        pass.SetBindGroup(0, bindGroups[1]);
        pass.Dispatch(kNumValues);
    }
    pass.EndPass();

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that multiple dispatches to increment values by ping-ponging between storage buffers and
// read-only storage buffers are synchronized in one compute pass.
TEST_P(ComputeStorageBufferBarrierTests, StorageAndReadonlyStoragePingPongInOnePass) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        // TODO(crbug.com/tint/386): Use the same struct.
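        // Unlike AddPingPong, src is bound as read-only storage, so each dispatch also
        // transitions one buffer from read-only to read-write usage and vice versa.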
        [[block]] struct Src {
            data : array<u32, 100>;
        };

        [[block]] struct Dst {
            data : array<u32, 100>;
        };

        [[group(0), binding(0)]] var<storage, read> src : Src;
        [[group(0), binding(1)]] var<storage, read_write> dst : Dst;

        [[stage(compute), workgroup_size(1)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    pass.SetPipeline(pipeline);

    for (uint32_t i = 0; i < kIterations / 2; ++i) {
        pass.SetBindGroup(0, bindGroups[0]);
        pass.Dispatch(kNumValues);
        pass.SetBindGroup(0, bindGroups[1]);
        pass.Dispatch(kNumValues);
    }
    pass.EndPass();

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that Storage to Uniform buffer transitions work and synchronize correctly
// by ping-ponging between Storage/Uniform usage in sequential compute passes.
TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPong) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    // The shader operates on vec4<u32> elements so the array is also valid as a uniform
    // buffer, whose arrays need a 16-byte element stride; 25 vec4s hold the 100 values.
    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<vec4<u32>, 25>;
        };

        [[group(0), binding(0)]] var<uniform> src : Buf;
        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

        [[stage(compute), workgroup_size(1)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
                vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    for (uint32_t i = 0, b = 0; i < kIterations; ++i, b = 1 - b) {
        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
        pass.SetPipeline(pipeline);
        pass.SetBindGroup(0, bindGroups[b]);
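        // Each invocation consumes one vec4<u32>, i.e. 4 of the kNumValues values, so
        // kNumValues / 4 workgroups cover the whole buffer.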
        pass.Dispatch(kNumValues / 4);
        pass.EndPass();
    }

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that Storage to Uniform buffer transitions work and synchronize correctly
// by ping-ponging between Storage/Uniform usage in one compute pass.
TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPongInOnePass) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<vec4<u32>, 25>;
        };

        [[group(0), binding(0)]] var<uniform> src : Buf;
        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

        [[stage(compute), workgroup_size(1)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
                vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();

    // Same ping-pong as above, but all dispatches are recorded into a single compute pass.
    for (uint32_t i = 0, b = 0; i < kIterations; ++i, b = 1 - b) {
        pass.SetPipeline(pipeline);
        pass.SetBindGroup(0, bindGroups[b]);
        pass.Dispatch(kNumValues / 4);
    }
    pass.EndPass();

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that barriers for dispatches correctly combine Indirect | Storage in backends with explicit
// barriers. Do this by:
//  1 - Initializing an indirect buffer with zeros.
//  2 - Writing ones into it with a compute shader.
//  3 - Using the indirect buffer in a Dispatch while also reading its data.
TEST_P(ComputeStorageBufferBarrierTests, IndirectBufferCorrectBarrier) {
    // For some reason SPIRV-Cross crashes when translating the step3 shader to HLSL. Suppress the
    // failure since we'll remove SPIRV-Cross at some point.
    DAWN_SUPPRESS_TEST_IF(IsD3D12() && !HasToggleEnabled("use_tint_generator"));

    wgpu::ComputePipelineDescriptor step2PipelineDesc;
    step2PipelineDesc.compute.entryPoint = "main";
    step2PipelineDesc.compute.module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<u32, 3>;
        };

        [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

        [[stage(compute), workgroup_size(1)]]
        fn main() {
            buf.data = array<u32, 3>(1u, 1u, 1u);
        }
    )");
    wgpu::ComputePipeline step2Pipeline = device.CreateComputePipeline(&step2PipelineDesc);

    wgpu::ComputePipelineDescriptor step3PipelineDesc;
    step3PipelineDesc.compute.entryPoint = "main";
    step3PipelineDesc.compute.module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<u32, 3>;
        };

        [[group(0), binding(0)]] var<storage, read> buf : Buf;

        [[block]] struct Result {
            data : u32;
        };

        [[group(0), binding(1)]] var<storage, read_write> result : Result;

        [[stage(compute), workgroup_size(1)]]
        fn main() {
            result.data = 2u;
            if (buf.data[0] == 1u && buf.data[1] == 1u && buf.data[2] == 1u) {
                result.data = 1u;
            }
        }
    )");
    wgpu::ComputePipeline step3Pipeline = device.CreateComputePipeline(&step3PipelineDesc);

    // 1 - Initialize the indirect buffer with zeros.
    wgpu::Buffer buf = utils::CreateBufferFromData(
        device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::Indirect, {0u, 0u, 0u});

    // 2 - Write ones into it with a compute shader.
    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();

    wgpu::BindGroup step2Group =
        utils::MakeBindGroup(device, step2Pipeline.GetBindGroupLayout(0), {{0, buf}});

    pass.SetPipeline(step2Pipeline);
    pass.SetBindGroup(0, step2Group);
    pass.Dispatch(1);

    // 3 - Use the indirect buffer in a Dispatch while also reading its data.
    wgpu::Buffer resultBuffer = utils::CreateBufferFromData(
        device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, {0u});

    wgpu::BindGroup step3Group = utils::MakeBindGroup(device, step3Pipeline.GetBindGroupLayout(0),
                                                      {{0, buf}, {1, resultBuffer}});

    pass.SetPipeline(step3Pipeline);
    pass.SetBindGroup(0, step3Group);
    pass.DispatchIndirect(buf, 0);
    pass.EndPass();

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_EQ(1u, resultBuffer, 0);
}

DAWN_INSTANTIATE_TEST(ComputeStorageBufferBarrierTests,
                      D3D12Backend(),
                      MetalBackend(),
                      OpenGLBackend(),
                      OpenGLESBackend(),
                      VulkanBackend());