Add missing lock to APIBeginRender/ComputePass.
Add tests to verify multithreading behaviors of encoding render/compute passes.

Bug: dawn:1662
Change-Id: I9bc6a0dd5d94b53b59e7e49a4611d4d55cc36e60
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/126980
Reviewed-by: Austin Eng <enga@chromium.org>
Commit-Queue: Quyen Le <lehoangquyen@chromium.org>
Kokoro: Kokoro <noreply+kokoro@google.com>
parent 3dba94c1d4
commit 3e7a114a6e
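For context when reading the diff below: API-facing methods that create new objects now acquire the device-wide lock through an RAII scope, while the internal helpers they forward to only assert that the lock is already held. The following is a minimal standalone sketch of that pattern; FakeDevice and FakeCommandEncoder are illustrative stand-ins, not Dawn's actual types, and a plain std::mutex stands in for Dawn's Mutex (which, per the Device.h comment further down, must not be re-acquired on the same thread).

#include <mutex>

// Illustrative stand-in for DeviceBase: owns a mutex and hands out RAII lock scopes.
struct FakeDevice {
    std::mutex mutex;

    std::unique_lock<std::mutex> GetScopedLock() {
        return std::unique_lock<std::mutex>(mutex);
    }
};

// Illustrative stand-in for CommandEncoder.
struct FakeCommandEncoder {
    FakeDevice* device = nullptr;

    // API entry point: it creates a new object, so it takes the device lock.
    void APIBeginComputePass() {
        auto deviceLock = device->GetScopedLock();
        BeginComputePass();
    }

    // Internal helper: relies on the caller already holding the lock and must not
    // try to lock again. In Dawn this precondition is checked with
    // ASSERT(device->IsLockedByCurrentThreadIfNeeded()).
    void BeginComputePass() {
        // ... record the compute pass ...
    }
};

Only the API boundary locks; everything reachable from it assumes the lock is held, which is what the ASSERTs added in this change enforce.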
@@ -771,11 +771,15 @@ void CommandEncoder::TrackQueryAvailability(QuerySetBase* querySet, uint32_t que

 // Implementation of the API's command recording methods

 ComputePassEncoder* CommandEncoder::APIBeginComputePass(const ComputePassDescriptor* descriptor) {
+    // This function will create a new object, so it needs to lock the Device.
+    auto deviceLock(GetDevice()->GetScopedLock());
+
     return BeginComputePass(descriptor).Detach();
 }

 Ref<ComputePassEncoder> CommandEncoder::BeginComputePass(const ComputePassDescriptor* descriptor) {
     DeviceBase* device = GetDevice();
+    ASSERT(device->IsLockedByCurrentThreadIfNeeded());

     bool success = mEncodingContext.TryEncode(
         this,
@@ -830,11 +834,15 @@ Ref<ComputePassEncoder> CommandEncoder::BeginComputePass(const ComputePassDescri
 }

 RenderPassEncoder* CommandEncoder::APIBeginRenderPass(const RenderPassDescriptor* descriptor) {
+    // This function will create a new object, so it needs to lock the Device.
+    auto deviceLock(GetDevice()->GetScopedLock());
+
     return BeginRenderPass(descriptor).Detach();
 }

 Ref<RenderPassEncoder> CommandEncoder::BeginRenderPass(const RenderPassDescriptor* descriptor) {
     DeviceBase* device = GetDevice();
+    ASSERT(device->IsLockedByCurrentThreadIfNeeded());

     RenderPassResourceUsageTracker usageTracker;
@@ -1047,13 +1055,14 @@ ResultOrError<std::function<void()>> CommandEncoder::ApplyRenderPassWorkarounds(
             descriptor.dimension = wgpu::TextureDimension::e2D;
             descriptor.mipLevelCount = 1;

-            // We are creating new resources. Need to lock the Device.
+            // We are creating new resources. Device must already be locked via
+            // APIBeginRenderPass -> ApplyRenderPassWorkarounds.
             // TODO(crbug.com/dawn/1618): In future, all temp resources should be created at
             // Command Submit time, so the locking would be removed from here at that point.
             Ref<TextureBase> temporaryResolveTexture;
             Ref<TextureViewBase> temporaryResolveView;
             {
-                auto deviceLock(GetDevice()->GetScopedLock());
+                ASSERT(device->IsLockedByCurrentThreadIfNeeded());

                 DAWN_TRY_ASSIGN(temporaryResolveTexture, device->CreateTexture(&descriptor));
@@ -92,10 +92,6 @@ struct DeviceBase::DeprecationWarnings {
 };

 namespace {
-bool IsMutexLockedByCurrentThreadIfNeeded(const Ref<Mutex>& mutex) {
-    return mutex == nullptr || mutex->IsLockedByCurrentThread();
-}
-
 struct LoggingCallbackTask : CallbackTask {
   public:
     LoggingCallbackTask() = delete;
@@ -873,7 +869,7 @@ Ref<RenderPipelineBase> DeviceBase::GetCachedRenderPipeline(

 Ref<ComputePipelineBase> DeviceBase::AddOrGetCachedComputePipeline(
     Ref<ComputePipelineBase> computePipeline) {
-    ASSERT(IsMutexLockedByCurrentThreadIfNeeded(mMutex));
+    ASSERT(IsLockedByCurrentThreadIfNeeded());
     auto [cachedPipeline, inserted] = mCaches->computePipelines.insert(computePipeline.Get());
     if (inserted) {
         computePipeline->SetIsCachedReference();
@@ -885,7 +881,7 @@ Ref<ComputePipelineBase> DeviceBase::AddOrGetCachedComputePipeline(

 Ref<RenderPipelineBase> DeviceBase::AddOrGetCachedRenderPipeline(
     Ref<RenderPipelineBase> renderPipeline) {
-    ASSERT(IsMutexLockedByCurrentThreadIfNeeded(mMutex));
+    ASSERT(IsLockedByCurrentThreadIfNeeded());
     auto [cachedPipeline, inserted] = mCaches->renderPipelines.insert(renderPipeline.Get());
     if (inserted) {
         renderPipeline->SetIsCachedReference();
@@ -2048,6 +2044,10 @@ Mutex::AutoLock DeviceBase::GetScopedLock() {
     return Mutex::AutoLock(mMutex.Get());
 }

+bool DeviceBase::IsLockedByCurrentThreadIfNeeded() const {
+    return mMutex == nullptr || mMutex->IsLockedByCurrentThread();
+}
+
 IgnoreLazyClearCountScope::IgnoreLazyClearCountScope(DeviceBase* device)
     : mDevice(device), mLazyClearCountForTesting(device->mLazyClearCountForTesting) {}
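Mutex::IsLockedByCurrentThread(), which the new DeviceBase::IsLockedByCurrentThreadIfNeeded() delegates to above, is not part of this diff. A common way to support that kind of query is to record the owning thread's id while the lock is held; the sketch below shows that technique with standard-library types and is purely illustrative, not Dawn's actual Mutex.

#include <atomic>
#include <mutex>
#include <thread>

// Hypothetical mutex that can answer "does the current thread hold me?".
class OwnershipTrackingMutex {
  public:
    void Lock() {
        mMutex.lock();
        // Only the owner writes mOwner, and only while holding mMutex.
        mOwner.store(std::this_thread::get_id(), std::memory_order_relaxed);
    }

    void Unlock() {
        mOwner.store(std::thread::id(), std::memory_order_relaxed);
        mMutex.unlock();
    }

    bool IsLockedByCurrentThread() const {
        return mOwner.load(std::memory_order_relaxed) == std::this_thread::get_id();
    }

  private:
    std::mutex mMutex;
    std::atomic<std::thread::id> mOwner{std::thread::id()};
};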
@@ -431,6 +431,12 @@ class DeviceBase : public RefCountedWithExternalCount {
     // AutoLock. It would crash if such thing happens.
     [[nodiscard]] Mutex::AutoLock GetScopedLock();

+    // This method returns true if Feature::ImplicitDeviceSynchronization is turned off, or if it
+    // is turned on and the device is locked by the current thread. The check is only enabled when
+    // DAWN_ENABLE_ASSERTS is turned on, so it should only be wrapped inside the ASSERT() macro,
+    // i.e. ASSERT(device.IsLockedByCurrentThreadIfNeeded()).
+    bool IsLockedByCurrentThreadIfNeeded() const;
+
     // In the 'Normal' mode, currently recorded commands in the backend normally will be actually
     // submitted in the next Tick. However in the 'Passive' mode, the submission will be postponed
     // as late as possible, for example, until the client has explicitly issued a submission.
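The usage note added above means the method should only ever appear inside an assertion, never drive control flow. A small illustrative example of the intended style (DeviceLike is a made-up stand-in and assert() stands in for Dawn's ASSERT macro):

#include <cassert>

// Made-up stand-in exposing only the method under discussion.
struct DeviceLike {
    bool IsLockedByCurrentThreadIfNeeded() const { return true; }  // placeholder body
};

void EncodeSomething(const DeviceLike& device) {
    // Intended: document and enforce the locking precondition in builds with asserts.
    assert(device.IsLockedByCurrentThreadIfNeeded());

    // Not intended: branching on the result, since the check is only meaningful when
    // assertions are enabled and must never change program behavior.
    // if (device.IsLockedByCurrentThreadIfNeeded()) { ... }
}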
@@ -531,6 +531,7 @@ source_set("end2end_tests_sources") {
     "end2end/MemoryAllocationStressTests.cpp",
     "end2end/MultisampledRenderingTests.cpp",
     "end2end/MultisampledSamplingTests.cpp",
+    "end2end/MultithreadTests.cpp",
     "end2end/NonzeroBufferCreationTests.cpp",
     "end2end/NonzeroTextureCreationTests.cpp",
     "end2end/ObjectCachingTests.cpp",
@@ -0,0 +1,180 @@
// Copyright 2023 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <functional>
#include <limits>
#include <memory>
#include <thread>
#include <vector>

#include "dawn/common/Constants.h"
#include "dawn/common/Math.h"
#include "dawn/tests/DawnTest.h"
#include "dawn/utils/TestUtils.h"
#include "dawn/utils/TextureUtils.h"
#include "dawn/utils/WGPUHelpers.h"

class MultithreadTests : public DawnTest {
  protected:
    std::vector<wgpu::FeatureName> GetRequiredFeatures() override {
        std::vector<wgpu::FeatureName> features;
        // TODO(crbug.com/dawn/1678): DawnWire doesn't support the thread-safe API yet.
        if (!UsesWire()) {
            features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
        }
        return features;
    }

    void SetUp() override {
        DawnTest::SetUp();
        // TODO(crbug.com/dawn/1678): DawnWire doesn't support the thread-safe API yet.
        DAWN_TEST_UNSUPPORTED_IF(UsesWire());

        // TODO(crbug.com/dawn/1679): OpenGL/D3D11 backends don't support the thread-safe API yet.
        DAWN_TEST_UNSUPPORTED_IF(IsOpenGL() || IsOpenGLES() || IsD3D11());
    }

    wgpu::Buffer CreateBuffer(uint32_t size, wgpu::BufferUsage usage) {
        wgpu::BufferDescriptor descriptor;
        descriptor.size = size;
        descriptor.usage = usage;
        return device.CreateBuffer(&descriptor);
    }

    wgpu::Texture CreateTexture(uint32_t width,
                                uint32_t height,
                                wgpu::TextureFormat format,
                                wgpu::TextureUsage usage,
                                uint32_t mipLevelCount = 1,
                                uint32_t sampleCount = 1) {
        wgpu::TextureDescriptor texDescriptor = {};
        texDescriptor.size = {width, height, 1};
        texDescriptor.format = format;
        texDescriptor.usage = usage;
        texDescriptor.mipLevelCount = mipLevelCount;
        texDescriptor.sampleCount = sampleCount;
        return device.CreateTexture(&texDescriptor);
    }

    void RunInParallel(uint32_t numThreads, const std::function<void(uint32_t)>& workerFunc) {
        std::vector<std::unique_ptr<std::thread>> threads(numThreads);

        for (uint32_t i = 0; i < threads.size(); ++i) {
            threads[i] = std::make_unique<std::thread>([i, workerFunc] { workerFunc(i); });
        }

        for (auto& thread : threads) {
            thread->join();
        }
    }
};

class MultithreadEncodingTests : public MultithreadTests {};

// Test that encoding render passes in parallel works.
TEST_P(MultithreadEncodingTests, RenderPassEncodersInParallel) {
    constexpr uint32_t kRTSize = 16;
    constexpr uint32_t kNumThreads = 10;

    wgpu::Texture msaaRenderTarget =
        CreateTexture(kRTSize, kRTSize, wgpu::TextureFormat::RGBA8Unorm,
                      wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc,
                      /*mipLevelCount=*/1, /*sampleCount=*/4);
    wgpu::TextureView msaaRenderTargetView = msaaRenderTarget.CreateView();

    wgpu::Texture resolveTarget =
        CreateTexture(kRTSize, kRTSize, wgpu::TextureFormat::RGBA8Unorm,
                      wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc);
    wgpu::TextureView resolveTargetView = resolveTarget.CreateView();

    std::vector<wgpu::CommandBuffer> commandBuffers(kNumThreads);

    RunInParallel(kNumThreads, [=, &commandBuffers](uint32_t index) {
        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();

        // Clear the render target to red.
        utils::ComboRenderPassDescriptor renderPass({msaaRenderTargetView});
        renderPass.cColorAttachments[0].resolveTarget = resolveTargetView;
        renderPass.cColorAttachments[0].clearValue = {1.0f, 0.0f, 0.0f, 1.0f};

        wgpu::RenderPassEncoder pass = encoder.BeginRenderPass(&renderPass);
        pass.End();

        commandBuffers[index] = encoder.Finish();
    });

    // Verify that the command buffers executed correctly.
    for (auto& commandBuffer : commandBuffers) {
        queue.Submit(1, &commandBuffer);

        EXPECT_TEXTURE_EQ(utils::RGBA8::kRed, resolveTarget, {0, 0});
        EXPECT_TEXTURE_EQ(utils::RGBA8::kRed, resolveTarget, {kRTSize - 1, kRTSize - 1});
    }
}

// Test that encoding compute passes in parallel works.
TEST_P(MultithreadEncodingTests, ComputePassEncodersInParallel) {
    constexpr uint32_t kNumThreads = 10;
    constexpr uint32_t kExpected = 0xFFFFFFFFu;

    wgpu::ShaderModule module = utils::CreateShaderModule(device, R"(
        @group(0) @binding(0) var<storage, read_write> output : u32;

        @compute @workgroup_size(1, 1, 1)
        fn main(@builtin(global_invocation_id) GlobalInvocationID : vec3u) {
            output = 0xFFFFFFFFu;
        })");
    wgpu::ComputePipelineDescriptor csDesc;
    csDesc.compute.module = module;
    csDesc.compute.entryPoint = "main";
    auto pipeline = device.CreateComputePipeline(&csDesc);

    wgpu::Buffer dstBuffer =
        CreateBuffer(sizeof(uint32_t), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc |
                                           wgpu::BufferUsage::CopyDst);
    wgpu::BindGroup bindGroup = utils::MakeBindGroup(device, pipeline.GetBindGroupLayout(0),
                                                     {
                                                         {0, dstBuffer, 0, sizeof(uint32_t)},
                                                     });

    std::vector<wgpu::CommandBuffer> commandBuffers(kNumThreads);

    RunInParallel(kNumThreads, [=, &commandBuffers](uint32_t index) {
        wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
        wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
        pass.SetPipeline(pipeline);
        pass.SetBindGroup(0, bindGroup);
        pass.DispatchWorkgroups(1, 1, 1);
        pass.End();

        commandBuffers[index] = encoder.Finish();
    });

    // Verify that the command buffers executed correctly.
    for (auto& commandBuffer : commandBuffers) {
        constexpr uint32_t kSentinelData = 0;
        queue.WriteBuffer(dstBuffer, 0, &kSentinelData, sizeof(kSentinelData));
        queue.Submit(1, &commandBuffer);

        EXPECT_BUFFER_U32_EQ(kExpected, dstBuffer, 0);
    }
}

DAWN_INSTANTIATE_TEST(MultithreadEncodingTests,
                      D3D11Backend(),
                      D3D12Backend(),
                      MetalBackend(),
                      OpenGLBackend(),
                      OpenGLESBackend(),
                      VulkanBackend());