dawn-cmake/src/tests/end2end/ComputeStorageBufferBarrierTests.cpp

// Copyright 2019 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tests/DawnTest.h"

#include "utils/WGPUHelpers.h"

class ComputeStorageBufferBarrierTests : public DawnTest {
  protected:
    static constexpr uint32_t kNumValues = 100;
    static constexpr uint32_t kIterations = 100;
};

// Test that multiple dispatches to increment values in a storage buffer are synchronized.
TEST_P(ComputeStorageBufferBarrierTests, AddIncrement) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expected(kNumValues, 0x1234 * kIterations);

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));
    wgpu::Buffer buffer = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<u32, 100>;
        };

        [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

        [[stage(compute)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            buf.data[GlobalInvocationID.x] = buf.data[GlobalInvocationID.x] + 0x1234u;
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroup =
        utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0), {{0, buffer, 0, bufferSize}});

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    pass.SetPipeline(pipeline);
    pass.SetBindGroup(0, bindGroup);
    for (uint32_t i = 0; i < kIterations; ++i) {
        pass.Dispatch(kNumValues);
    }
    pass.EndPass();
    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expected.data(), buffer, 0, kNumValues);
}

// Test that multiple dispatches to increment values by ping-ponging between two storage buffers
// are synchronized.
TEST_P(ComputeStorageBufferBarrierTests, AddPingPong) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        // TODO(crbug.com/tint/386):  Use the same struct.
        [[block]] struct Src {
            data : array<u32, 100>;
        };

        [[block]] struct Dst {
            data : array<u32, 100>;
        };

        [[group(0), binding(0)]] var<storage, read_write> src : Src;
        [[group(0), binding(1)]] var<storage, read_write> dst : Dst;

        [[stage(compute)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    pass.SetPipeline(pipeline);

    for (uint32_t i = 0; i < kIterations / 2; ++i) {
        pass.SetBindGroup(0, bindGroups[0]);
        pass.Dispatch(kNumValues);
        pass.SetBindGroup(0, bindGroups[1]);
        pass.Dispatch(kNumValues);
    }
    pass.EndPass();
    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that multiple dispatches to increment values by ping-ponging between storage buffers and
// read-only storage buffers are synchronized in one compute pass.
TEST_P(ComputeStorageBufferBarrierTests, StorageAndReadonlyStoragePingPongInOnePass) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        // TODO(crbug.com/tint/386):  Use the same struct.
        [[block]] struct Src {
            data : array<u32, 100>;
        };

        [[block]] struct Dst {
            data : array<u32, 100>;
        };

        [[group(0), binding(0)]] var<storage, read> src : Src;
        [[group(0), binding(1)]] var<storage, read_write> dst : Dst;

        [[stage(compute)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] + 0x1234u;
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    pass.SetPipeline(pipeline);

    for (uint32_t i = 0; i < kIterations / 2; ++i) {
        pass.SetBindGroup(0, bindGroups[0]);
        pass.Dispatch(kNumValues);
        pass.SetBindGroup(0, bindGroups[1]);
        pass.Dispatch(kNumValues);
    }
    pass.EndPass();
    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that Storage to Uniform buffer transitions work and synchronize correctly
// by ping-ponging between Storage/Uniform usage in sequential compute passes.
TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPong) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<vec4<u32>, 25>;
        };

        [[group(0), binding(0)]] var<uniform> src : Buf;
        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

        [[stage(compute)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
                vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();

    for (uint32_t i = 0, b = 0; i < kIterations; ++i, b = 1 - b) {
        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
        pass.SetPipeline(pipeline);
        pass.SetBindGroup(0, bindGroups[b]);
        pass.Dispatch(kNumValues / 4);
        pass.EndPass();
    }

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that Storage to Uniform buffer transitions work and synchronize correctly
// by ping-ponging between Storage/Uniform usage in one compute pass.
TEST_P(ComputeStorageBufferBarrierTests, UniformToStorageAddPingPongInOnePass) {
    std::vector<uint32_t> data(kNumValues, 0);
    std::vector<uint32_t> expectedA(kNumValues, 0x1234 * kIterations);
    std::vector<uint32_t> expectedB(kNumValues, 0x1234 * (kIterations - 1));

    uint64_t bufferSize = static_cast<uint64_t>(data.size() * sizeof(uint32_t));

    wgpu::Buffer bufferA = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::Buffer bufferB = utils::CreateBufferFromData(
        device, data.data(), bufferSize,
        wgpu::BufferUsage::Storage | wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopySrc);

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<vec4<u32>, 25>;
        };

        [[group(0), binding(0)]] var<uniform> src : Buf;
        [[group(0), binding(1)]] var<storage, read_write> dst : Buf;

        [[stage(compute)]]
        fn main([[builtin(global_invocation_id)]] GlobalInvocationID : vec3<u32>) {
            dst.data[GlobalInvocationID.x] = src.data[GlobalInvocationID.x] +
                vec4<u32>(0x1234u, 0x1234u, 0x1234u, 0x1234u);
        }
    )");

    wgpu::ComputePipelineDescriptor pipelineDesc = {};
    pipelineDesc.compute.module = module;
    pipelineDesc.compute.entryPoint = "main";
    wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&pipelineDesc);

    wgpu::BindGroup bindGroupA = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferA, 0, bufferSize},
                                                          {1, bufferB, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroupB = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                      {
                                                          {0, bufferB, 0, bufferSize},
                                                          {1, bufferA, 0, bufferSize},
                                                      });

    wgpu::BindGroup bindGroups[2] = {bindGroupA, bindGroupB};

    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
    for (uint32_t i = 0, b = 0; i < kIterations; ++i, b = 1 - b) {
        pass.SetPipeline(pipeline);
        pass.SetBindGroup(0, bindGroups[b]);
        pass.Dispatch(kNumValues / 4);
    }
    pass.EndPass();

    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_RANGE_EQ(expectedA.data(), bufferA, 0, kNumValues);
    EXPECT_BUFFER_U32_RANGE_EQ(expectedB.data(), bufferB, 0, kNumValues);
}

// Test that barriers for dispatches correctly combine Indirect | Storage in backends with explicit
// barriers. Do this by:
//  1 - Initializing an indirect buffer with zeros.
//  2 - Write ones into it with a compute shader.
//  3 - Use the indirect buffer in a Dispatch while also reading its data.
TEST_P(ComputeStorageBufferBarrierTests, IndirectBufferCorrectBarrier) {
    // For some reason SPIRV-Cross crashes when translating the step3 shader to HLSL. Suppress the
    // failure since we'll remove SPIRV-Cross at some point.
    DAWN_SUPPRESS_TEST_IF(IsD3D12() && !HasToggleEnabled("use_tint_generator"));

    wgpu::ComputePipelineDescriptor step2PipelineDesc;
    step2PipelineDesc.compute.entryPoint = "main";
    step2PipelineDesc.compute.module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<u32, 3>;
        };
        [[group(0), binding(0)]] var<storage, read_write> buf : Buf;

        [[stage(compute)]] fn main() {
            buf.data = array<u32, 3>(1u, 1u, 1u);
        }
    )");
    wgpu::ComputePipeline step2Pipeline = device.CreateComputePipeline(&step2PipelineDesc);

    wgpu::ComputePipelineDescriptor step3PipelineDesc;
    step3PipelineDesc.compute.entryPoint = "main";
    step3PipelineDesc.compute.module = utils::CreateShaderModule(device, R"(
        [[block]] struct Buf {
            data : array<u32, 3>;
        };
        [[group(0), binding(0)]] var<storage, read> buf : Buf;

        [[block]] struct Result {
            data : u32;
        };
        [[group(0), binding(1)]] var<storage, read_write> result : Result;

        [[stage(compute)]] fn main() {
            result.data = 2u;
            if (buf.data[0] == 1u && buf.data[1] == 1u && buf.data[2] == 1u) {
                result.data = 1u;
            }
        }
    )");
    wgpu::ComputePipeline step3Pipeline = device.CreateComputePipeline(&step3PipelineDesc);

    //  1 - Initializing an indirect buffer with zeros.
    wgpu::Buffer buf = utils::CreateBufferFromData<uint32_t>(
        device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::Indirect, {0u, 0u, 0u});

    //  2 - Write ones into it with a compute shader.
    wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();

    wgpu::BindGroup step2Group =
        utils::MakeBindGroup(device, step2Pipeline.GetBindGroupLayout(0), {{0, buf}});

    pass.SetPipeline(step2Pipeline);
    pass.SetBindGroup(0, step2Group);
    pass.Dispatch(1);

    //  3 - Use the indirect buffer in a Dispatch while also reading its data.
    wgpu::Buffer resultBuffer = utils::CreateBufferFromData<uint32_t>(
        device, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, {0u});
    wgpu::BindGroup step3Group = utils::MakeBindGroup(device, step3Pipeline.GetBindGroupLayout(0),
                                                      {{0, buf}, {1, resultBuffer}});

    pass.SetPipeline(step3Pipeline);
    pass.SetBindGroup(0, step3Group);
    pass.DispatchIndirect(buf, 0);

    pass.EndPass();
    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    EXPECT_BUFFER_U32_EQ(1u, resultBuffer, 0);
}

DAWN_INSTANTIATE_TEST(ComputeStorageBufferBarrierTests,
                      D3D12Backend(),
                      MetalBackend(),
                      OpenGLBackend(),
                      OpenGLESBackend(),
                      VulkanBackend());