From 23f43961779d0dea6f39aaa87fb2366b1052f6ee Mon Sep 17 00:00:00 2001
From: Le Hoang Quyen <lehoangquyen@chromium.org>
Date: Mon, 24 Apr 2023 20:12:00 +0000
Subject: [PATCH] Add more multithread tests.

Bug: dawn:1662
Change-Id: I2b2c66c6f9a7b512ae9f8010a082e7306feaa6f3
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/122060
Commit-Queue: Quyen Le <lehoangquyen@chromium.org>
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Austin Eng <enga@chromium.org>
---
 src/dawn/tests/DawnTest.cpp                   |  30 +-
 src/dawn/tests/DawnTest.h                     |   6 +-
 .../end2end/D3D12ResourceWrappingTests.cpp    | 156 +++-
 .../tests/end2end/DeviceLifetimeTests.cpp     |  41 +
 .../tests/end2end/IOSurfaceWrappingTests.cpp  |  65 ++
 src/dawn/tests/end2end/MultithreadTests.cpp   | 879 +++++++++++++++++-
 src/dawn/utils/TestUtils.cpp                  |  20 +
 src/dawn/utils/TestUtils.h                    |   5 +
 8 files changed, 1141 insertions(+), 61 deletions(-)
diff --git a/src/dawn/tests/DawnTest.cpp b/src/dawn/tests/DawnTest.cpp
index 97f930130b..3ed7ebd232 100644
--- a/src/dawn/tests/DawnTest.cpp
+++ b/src/dawn/tests/DawnTest.cpp
@@ -15,6 +15,7 @@
 #include "dawn/tests/DawnTest.h"
 
 #include <algorithm>
+#include <atomic>
 #include <fstream>
 #include <iomanip>
 #include <regex>
@@ -1146,6 +1147,9 @@ std::ostringstream& DawnTestBase::AddBufferExpectation(const char* file,
     deferred.size = size;
     deferred.expectation.reset(expectation);
 
+    // This expectation might be called from multiple threads
+    dawn::Mutex::AutoLock lg(&mMutex);
+
     mDeferredExpectations.push_back(std::move(deferred));
     mDeferredExpectations.back().message = std::make_unique<std::ostringstream>();
     return *(mDeferredExpectations.back().message.get());
@@ -1200,6 +1204,9 @@ std::ostringstream& DawnTestBase::AddTextureExpectationImpl(const char* file,
     deferred.bytesPerRow = bytesPerRow;
     deferred.expectation.reset(expectation);
 
+    // This expectation might be called from multiple threads
+    dawn::Mutex::AutoLock lg(&mMutex);
+
     mDeferredExpectations.push_back(std::move(deferred));
     mDeferredExpectations.back().message = std::make_unique<std::ostringstream>();
     return *(mDeferredExpectations.back().message.get());
@@ -1504,11 +1511,16 @@ void DawnTestBase::FlushWire() {
 }
 
 void DawnTestBase::WaitForAllOperations() {
-    bool done = false;
+    // Callback might be invoked on another thread that calls the same WaitABit() method, not
+    // necessarily the current thread. So we need to use atomic here.
+    std::atomic<bool> done(false);
     device.GetQueue().OnSubmittedWorkDone(
-        0u, [](WGPUQueueWorkDoneStatus, void* userdata) { *static_cast<bool*>(userdata) = true; },
+        0u,
+        [](WGPUQueueWorkDoneStatus, void* userdata) {
+            *static_cast<std::atomic<bool>*>(userdata) = true;
+        },
         &done);
-    while (!done) {
+    while (!done.load()) {
         WaitABit();
     }
 }
@@ -1526,6 +1538,9 @@ DawnTestBase::ReadbackReservation DawnTestBase::ReserveReadback(wgpu::Device tar
         utils::CreateBufferFromData(targetDevice, initialBufferData.data(), readbackSize,
                                     wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
 
+    // This readback might be called from multiple threads
+    dawn::Mutex::AutoLock lg(&mMutex);
+
     ReadbackReservation reservation;
     reservation.device = targetDevice;
     reservation.buffer = slot.buffer;
@@ -1551,7 +1566,7 @@ void DawnTestBase::MapSlotsSynchronously() {
     }
 
     // Busy wait until all map operations are done.
-    while (mNumPendingMapOperations != 0) {
+    while (mNumPendingMapOperations.load(std::memory_order_acquire) != 0) {
         WaitABit();
     }
 }
@@ -1562,7 +1577,8 @@ void DawnTestBase::SlotMapCallback(WGPUBufferMapAsyncStatus status, void* userda
                 status == WGPUBufferMapAsyncStatus_DeviceLost);
     std::unique_ptr<MapReadUserdata> userdata(static_cast<MapReadUserdata*>(userdata_));
     DawnTestBase* test = userdata->test;
-    test->mNumPendingMapOperations--;
+
+    dawn::Mutex::AutoLock lg(&test->mMutex);
 
     ReadbackSlot* slot = &test->mReadbackSlots[userdata->slot];
     if (status == WGPUBufferMapAsyncStatus_Success) {
@@ -1571,6 +1587,8 @@ void DawnTestBase::SlotMapCallback(WGPUBufferMapAsyncStatus status, void* userda
     } else {
         slot->mappedData = nullptr;
     }
+
+    test->mNumPendingMapOperations.fetch_sub(1, std::memory_order_release);
 }
 
 void DawnTestBase::ResolveExpectations() {
@@ -1629,6 +1647,8 @@ void DawnTestBase::ResolveDeferredExpectationsNow() {
     FlushWire();
 
     MapSlotsSynchronously();
+
+    dawn::Mutex::AutoLock lg(&mMutex);
     ResolveExpectations();
 
     mDeferredExpectations.clear();
diff --git a/src/dawn/tests/DawnTest.h b/src/dawn/tests/DawnTest.h
index 8482c7c9ae..9c917c1328 100644
--- a/src/dawn/tests/DawnTest.h
+++ b/src/dawn/tests/DawnTest.h
@@ -15,6 +15,7 @@
 #ifndef SRC_DAWN_TESTS_DAWNTEST_H_
 #define SRC_DAWN_TESTS_DAWNTEST_H_
 
+#include <atomic>
 #include <memory>
 #include <queue>
 #include <string>
@@ -23,6 +24,7 @@
 #include <vector>
 
 #include "dawn/common/Log.h"
+#include "dawn/common/Mutex.h"
 #include "dawn/common/Platform.h"
 #include "dawn/common/Preprocessor.h"
 #include "dawn/dawn_proc_table.h"
@@ -623,7 +625,7 @@ class DawnTestBase {
     // Maps all the buffers and fill ReadbackSlot::mappedData
     void MapSlotsSynchronously();
     static void SlotMapCallback(WGPUBufferMapAsyncStatus status, void* userdata);
-    size_t mNumPendingMapOperations = 0;
+    std::atomic<size_t> mNumPendingMapOperations = 0;
 
     // Reserve space where the data for an expectation can be copied
     struct ReadbackReservation {
@@ -656,6 +658,8 @@ class DawnTestBase {
     WGPUDevice mLastCreatedBackendDevice;
 
     std::unique_ptr<dawn::platform::Platform> mTestPlatform;
+
+    dawn::Mutex mMutex;
 };
 
 #define DAWN_SKIP_TEST_IF_BASE(condition, type, reason)   \
diff --git a/src/dawn/tests/end2end/D3D12ResourceWrappingTests.cpp b/src/dawn/tests/end2end/D3D12ResourceWrappingTests.cpp
index 3329126f91..702a0bf854 100644
--- a/src/dawn/tests/end2end/D3D12ResourceWrappingTests.cpp
+++ b/src/dawn/tests/end2end/D3D12ResourceWrappingTests.cpp
@@ -19,6 +19,7 @@
 #include <wrl/client.h>
 
 #include <memory>
+#include <mutex>
 #include <thread>
 #include <utility>
 #include <vector>
@@ -395,14 +396,11 @@ class D3D12SharedHandleUsageTests : public D3D12ResourceTestBase {
         queue.Submit(1, &commands);
     }
 
-    void WrapAndClearD3D11Texture(
-        const wgpu::TextureDescriptor& dawnDescriptor,
-        const D3D11_TEXTURE2D_DESC& d3dDescriptor,
-        const wgpu::Color& clearColor,
-        wgpu::Texture* dawnTextureOut,
-        ID3D11Texture2D** d3d11TextureOut,
-        std::unique_ptr<dawn::native::d3d12::ExternalImageDXGI>* externalImageOut,
-        bool isInitialized = true) const {
+    void CreateSharedD3D11Texture(const D3D11_TEXTURE2D_DESC& d3dDescriptor,
+                                  ID3D11Texture2D** d3d11TextureOut,
+                                  ID3D11Fence** d3d11FenceOut,
+                                  HANDLE* sharedHandleOut,
+                                  HANDLE* fenceSharedHandleOut) const {
         ComPtr<ID3D11Texture2D> d3d11Texture;
         HRESULT hr = mD3d11Device->CreateTexture2D(&d3dDescriptor, nullptr, &d3d11Texture);
         ASSERT_EQ(hr, S_OK);
@@ -417,20 +415,10 @@ class D3D12SharedHandleUsageTests : public D3D12ResourceTestBase {
             &sharedHandle);
         ASSERT_EQ(hr, S_OK);
 
-        ComPtr<IDXGIKeyedMutex> dxgiKeyedMutex;
-
         HANDLE fenceSharedHandle = nullptr;
         ComPtr<ID3D11Fence> d3d11Fence;
 
-        ComPtr<ID3D11DeviceContext4> d3d11DeviceContext4;
-
-        if (GetParam().mSyncMode == SyncMode::kKeyedMutex) {
-            hr = d3d11Texture.As(&dxgiKeyedMutex);
-            ASSERT_EQ(hr, S_OK);
-
-            hr = dxgiKeyedMutex->AcquireSync(kDXGIKeyedMutexAcquireReleaseKey, INFINITE);
-            ASSERT_EQ(hr, S_OK);
-        } else {
+        if (GetParam().mSyncMode == SyncMode::kFence) {
             ComPtr<ID3D11Device5> d3d11Device5;
             hr = mD3d11Device.As(&d3d11Device5);
             ASSERT_EQ(hr, S_OK);
@@ -442,6 +430,33 @@ class D3D12SharedHandleUsageTests : public D3D12ResourceTestBase {
             ASSERT_EQ(hr, S_OK);
         }
 
+        *d3d11TextureOut = d3d11Texture.Detach();
+        *d3d11FenceOut = d3d11Fence.Detach();
+        *sharedHandleOut = sharedHandle;
+        *fenceSharedHandleOut = fenceSharedHandle;
+    }
+
+    void ClearD3D11Texture(const wgpu::Color& clearColor,
+                           ID3D11Texture2D* d3d11TexturePtr,
+                           ID3D11Fence* d3d11Fence,
+                           uint64_t fenceSignalValue) const {
+        ComPtr<ID3D11Texture2D> d3d11Texture = d3d11TexturePtr;
+        ComPtr<IDXGIResource1> dxgiResource;
+        HRESULT hr = d3d11Texture.As(&dxgiResource);
+        ASSERT_EQ(hr, S_OK);
+
+        ComPtr<IDXGIKeyedMutex> dxgiKeyedMutex;
+
+        ComPtr<ID3D11DeviceContext4> d3d11DeviceContext4;
+
+        if (GetParam().mSyncMode == SyncMode::kKeyedMutex) {
+            hr = d3d11Texture.As(&dxgiKeyedMutex);
+            ASSERT_EQ(hr, S_OK);
+
+            hr = dxgiKeyedMutex->AcquireSync(kDXGIKeyedMutexAcquireReleaseKey, INFINITE);
+            ASSERT_EQ(hr, S_OK);
+        }
+
         ComPtr<ID3D11RenderTargetView> d3d11RTV;
         hr = mD3d11Device->CreateRenderTargetView(d3d11Texture.Get(), nullptr, &d3d11RTV);
         ASSERT_EQ(hr, S_OK);
@@ -451,7 +466,6 @@ class D3D12SharedHandleUsageTests : public D3D12ResourceTestBase {
             static_cast<float>(clearColor.b), static_cast<float>(clearColor.a)};
         mD3d11DeviceContext->ClearRenderTargetView(d3d11RTV.Get(), colorRGBA);
 
-        constexpr uint64_t kFenceSignalValue = 1;
         if (dxgiKeyedMutex) {
             hr = dxgiKeyedMutex->ReleaseSync(kDXGIKeyedMutexAcquireReleaseKey);
             ASSERT_EQ(hr, S_OK);
@@ -460,9 +474,18 @@ class D3D12SharedHandleUsageTests : public D3D12ResourceTestBase {
             ASSERT_EQ(hr, S_OK);
             // The fence starts with 0 signaled, but that won't capture the render target view clear
             // above, so signal explicitly with 1 and make the next Dawn access wait on 1.
-            d3d11DeviceContext4->Signal(d3d11Fence.Get(), kFenceSignalValue);
+            d3d11DeviceContext4->Signal(d3d11Fence, fenceSignalValue);
         }
+    }
 
+    void WaitAndWrapD3D11Texture(
+        const wgpu::TextureDescriptor& dawnDescriptor,
+        HANDLE sharedHandle,
+        HANDLE fenceSharedHandle,
+        uint64_t fenceWaitValue,
+        wgpu::Texture* dawnTextureOut,
+        std::unique_ptr<dawn::native::d3d12::ExternalImageDXGI>* externalImageOut,
+        bool isInitialized) const {
         dawn::native::d3d12::ExternalImageDescriptorDXGISharedHandle externalImageDesc = {};
         externalImageDesc.sharedHandle = sharedHandle;
         externalImageDesc.cTextureDescriptor =
@@ -476,12 +499,36 @@ class D3D12SharedHandleUsageTests : public D3D12ResourceTestBase {
         externalAccessDesc.isInitialized = isInitialized;
         externalAccessDesc.usage = static_cast<WGPUTextureUsageFlags>(dawnDescriptor.usage);
         if (fenceSharedHandle != nullptr) {
-            externalAccessDesc.waitFences.push_back({fenceSharedHandle, kFenceSignalValue});
+            externalAccessDesc.waitFences.push_back({fenceSharedHandle, fenceWaitValue});
         }
 
         *dawnTextureOut = wgpu::Texture::Acquire(externalImage->BeginAccess(&externalAccessDesc));
-        *d3d11TextureOut = d3d11Texture.Detach();
         *externalImageOut = std::move(externalImage);
+    }
+
+    void WrapAndClearD3D11Texture(
+        const wgpu::TextureDescriptor& dawnDescriptor,
+        const D3D11_TEXTURE2D_DESC& d3dDescriptor,
+        const wgpu::Color& clearColor,
+        wgpu::Texture* dawnTextureOut,
+        ID3D11Texture2D** d3d11TextureOut,
+        std::unique_ptr<dawn::native::d3d12::ExternalImageDXGI>* externalImageOut,
+        bool isInitialized = true) const {
+        ComPtr<ID3D11Texture2D> d3d11Texture;
+        ComPtr<ID3D11Fence> d3d11Fence;
+        HANDLE sharedHandle = nullptr;
+        HANDLE fenceSharedHandle = nullptr;
+        CreateSharedD3D11Texture(d3dDescriptor, &d3d11Texture, &d3d11Fence, &sharedHandle,
+                                 &fenceSharedHandle);
+
+        constexpr uint64_t kFenceSignalValue = 1;
+        ClearD3D11Texture(clearColor, d3d11Texture.Get(), d3d11Fence.Get(), kFenceSignalValue);
+
+        WaitAndWrapD3D11Texture(dawnDescriptor, sharedHandle, fenceSharedHandle,
+                                /*fenceWaitValue=*/kFenceSignalValue, dawnTextureOut,
+                                externalImageOut, isInitialized);
+
+        *d3d11TextureOut = d3d11Texture.Detach();
 
         if (fenceSharedHandle != nullptr) {
             ::CloseHandle(fenceSharedHandle);
@@ -1123,6 +1170,69 @@ TEST_P(D3D12SharedHandleMultithreadTests, DestroyDeviceAndUseImageInParallel) {
     thread2.join();
 }
 
+// 1. Create and clear a D3D11 texture
+// 2. On 2nd thread: Wrap it in a Dawn texture and clear it to a different color
+// 3. Readback the texture with D3D11 and ensure we receive the color we cleared with Dawn.
+TEST_P(D3D12SharedHandleMultithreadTests, ClearInD3D12ReadbackInD3D11_TwoThreads) {
+    // TODO(crbug.com/dawn/735): This test appears to hang for
+    // D3D12_Microsoft_Basic_Render_Driver_CPU when validation is enabled.
+    DAWN_SUPPRESS_TEST_IF(IsD3D12() && IsWARP() && IsBackendValidationEnabled());
+
+    // KeyedMutex doesn't guarantee the order of commands so skip it.
+    DAWN_TEST_UNSUPPORTED_IF(GetParam().mSyncMode != SyncMode::kFence);
+
+    const wgpu::Color d3d11ClearColor{1.0f, 1.0f, 0.0f, 1.0f};
+    const wgpu::Color d3d12ClearColor{0.0f, 0.0f, 1.0f, 1.0f};
+
+    constexpr uint64_t kD3D11FenceSignalValue = 1;
+
+    ComPtr<ID3D11Texture2D> d3d11Texture;
+    ComPtr<ID3D11Fence> d3d11Fence;
+    HANDLE sharedHandle = nullptr;
+    HANDLE fenceSharedHandle = nullptr;
+    CreateSharedD3D11Texture(baseD3dDescriptor, &d3d11Texture, &d3d11Fence, &sharedHandle,
+                             &fenceSharedHandle);
+
+    dawn::native::d3d12::ExternalImageDXGIFenceDescriptor d3d12SignalFence;
+
+    std::thread d3d12Thread([=, &d3d12SignalFence] {
+        wgpu::Texture dawnTexture;
+        std::unique_ptr<dawn::native::d3d12::ExternalImageDXGI> externalImage;
+        WaitAndWrapD3D11Texture(baseDawnDescriptor, sharedHandle, fenceSharedHandle,
+                                /*fenceWaitValue=*/kD3D11FenceSignalValue, &dawnTexture,
+                                &externalImage, /*isInitialized=*/true);
+
+        ASSERT_NE(dawnTexture.Get(), nullptr);
+
+        EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8(d3d11ClearColor.r * 255, d3d11ClearColor.g * 255,
+                                           d3d11ClearColor.b * 255, d3d11ClearColor.a * 255),
+                              dawnTexture, 0, 0);
+
+        ClearImage(dawnTexture, d3d12ClearColor, device);
+
+        externalImage->EndAccess(dawnTexture.Get(), &d3d12SignalFence);
+
+        dawnTexture.Destroy();
+    });
+
+    ClearD3D11Texture(d3d11ClearColor, d3d11Texture.Get(), d3d11Fence.Get(),
+                      /*fenceSignalValue=*/kD3D11FenceSignalValue);
+
+    d3d12Thread.join();
+    // Now that Dawn (via D3D12) has finished writing to the texture, we should be
+    // able to read it back by copying it to a staging texture and verifying the
+    // color matches the D3D12 clear color.
+    ExpectPixelRGBA8EQ(d3d11Texture.Get(), d3d12ClearColor, &d3d12SignalFence);
+
+    if (sharedHandle != nullptr) {
+        ::CloseHandle(sharedHandle);
+    }
+
+    if (fenceSharedHandle != nullptr) {
+        ::CloseHandle(fenceSharedHandle);
+    }
+}
+
 DAWN_INSTANTIATE_TEST_P(D3D12SharedHandleValidation,
                         {D3D12Backend()},
                         {SyncMode::kKeyedMutex, SyncMode::kFence});
diff --git a/src/dawn/tests/end2end/DeviceLifetimeTests.cpp b/src/dawn/tests/end2end/DeviceLifetimeTests.cpp
index cec6374bf1..83d13b0041 100644
--- a/src/dawn/tests/end2end/DeviceLifetimeTests.cpp
+++ b/src/dawn/tests/end2end/DeviceLifetimeTests.cpp
@@ -211,6 +211,47 @@ TEST_P(DeviceLifetimeTests, DroppedThenMapBuffer) {
     }
 }
 
+// Test that the device can be dropped before a buffer created from it, then mapping the buffer
+// twice (one inside callback) will both fail.
+TEST_P(DeviceLifetimeTests, Dropped_ThenMapBuffer_ThenMapBufferInCallback) {
+    wgpu::BufferDescriptor desc = {};
+    desc.size = 4;
+    desc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
+    wgpu::Buffer buffer = device.CreateBuffer(&desc);
+
+    device = nullptr;
+
+    struct UserData {
+        wgpu::Buffer buffer;
+        bool done = false;
+    };
+
+    UserData userData;
+    userData.buffer = buffer;
+
+    // First mapping.
+    buffer.MapAsync(
+        wgpu::MapMode::Read, 0, wgpu::kWholeMapSize,
+        [](WGPUBufferMapAsyncStatus status, void* userdataPtr) {
+            EXPECT_EQ(status, WGPUBufferMapAsyncStatus_DeviceLost);
+            auto userdata = static_cast<UserData*>(userdataPtr);
+
+            // Second mapping.
+            userdata->buffer.MapAsync(
+                wgpu::MapMode::Read, 0, wgpu::kWholeMapSize,
+                [](WGPUBufferMapAsyncStatus status, void* userdataPtr) {
+                    EXPECT_EQ(status, WGPUBufferMapAsyncStatus_DeviceLost);
+                    *static_cast<bool*>(userdataPtr) = true;
+                },
+                &userdata->done);
+        },
+        &userData);
+
+    while (!userData.done) {
+        WaitABit();
+    }
+}
+
 // Test that the device can be dropped inside a buffer map callback.
 TEST_P(DeviceLifetimeTests, DroppedInsideBufferMapCallback) {
     wgpu::BufferDescriptor desc = {};
diff --git a/src/dawn/tests/end2end/IOSurfaceWrappingTests.cpp b/src/dawn/tests/end2end/IOSurfaceWrappingTests.cpp
index 4f54f110e8..0c86cde098 100644
--- a/src/dawn/tests/end2end/IOSurfaceWrappingTests.cpp
+++ b/src/dawn/tests/end2end/IOSurfaceWrappingTests.cpp
@@ -16,6 +16,10 @@
 #include <CoreVideo/CVPixelBuffer.h>
 #include <IOSurface/IOSurface.h>
 
+#include <memory>
+#include <thread>
+#include <vector>
+
 #include "dawn/tests/DawnTest.h"
 
 #include "dawn/native/MetalBackend.h"
@@ -588,5 +592,66 @@ TEST_P(IOSurfaceUsageTests, WriteThenConcurrentReadThenWrite) {
     EXPECT_TRUE(endWriteAccessDesc.isInitialized);
 }
 
+class IOSurfaceMultithreadTests : public IOSurfaceUsageTests {
+  protected:
+    std::vector<wgpu::FeatureName> GetRequiredFeatures() override {
+        std::vector<wgpu::FeatureName> features;
+        // TODO(crbug.com/dawn/1678): DawnWire doesn't support thread safe API yet.
+        if (!UsesWire()) {
+            features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
+        }
+        return features;
+    }
+
+    void SetUp() override {
+        IOSurfaceUsageTests::SetUp();
+        // TODO(crbug.com/dawn/1678): DawnWire doesn't support thread safe API yet.
+        DAWN_TEST_UNSUPPORTED_IF(UsesWire());
+    }
+};
+
+// Test that texture with color is cleared when isInitialized = false. There shoudn't be any data
+// race if multiple of them are created on multiple threads.
+TEST_P(IOSurfaceMultithreadTests, UninitializedTexturesAreCleared_OnMultipleThreads) {
+    utils::RunInParallel(10, [this](uint32_t) {
+        ScopedIOSurfaceRef ioSurface =
+            CreateSinglePlaneIOSurface(1, 1, kCVPixelFormatType_32RGBA, 4);
+        uint32_t data = 0x04030201;
+
+        IOSurfaceLock(ioSurface.get(), 0, nullptr);
+        memcpy(IOSurfaceGetBaseAddress(ioSurface.get()), &data, sizeof(data));
+        IOSurfaceUnlock(ioSurface.get(), 0, nullptr);
+
+        wgpu::TextureDescriptor textureDescriptor;
+        textureDescriptor.dimension = wgpu::TextureDimension::e2D;
+        textureDescriptor.format = wgpu::TextureFormat::RGBA8Unorm;
+        textureDescriptor.size = {1, 1, 1};
+        textureDescriptor.sampleCount = 1;
+        textureDescriptor.mipLevelCount = 1;
+        textureDescriptor.usage =
+            wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc;
+
+        // wrap ioSurface and ensure color is not visible when isInitialized set to false
+        wgpu::Texture ioSurfaceTexture = WrapIOSurface(&textureDescriptor, ioSurface.get(), false);
+        EXPECT_PIXEL_RGBA8_EQ(utils::RGBA8(0, 0, 0, 0), ioSurfaceTexture, 0, 0);
+
+        dawn::native::metal::ExternalImageIOSurfaceEndAccessDescriptor endAccessDesc;
+        dawn::native::metal::IOSurfaceEndAccess(ioSurfaceTexture.Get(), &endAccessDesc);
+        EXPECT_TRUE(endAccessDesc.isInitialized);
+    });
+}
+
+// Test that wrapping multiple IOSurface and clear them on multiple threads work.
+TEST_P(IOSurfaceMultithreadTests, WrapAndClear_OnMultipleThreads) {
+    utils::RunInParallel(10, [this](uint32_t) {
+        ScopedIOSurfaceRef ioSurface =
+            CreateSinglePlaneIOSurface(1, 1, kCVPixelFormatType_32BGRA, 4);
+
+        uint32_t data = 0x04010203;
+        DoClearTest(ioSurface.get(), wgpu::TextureFormat::BGRA8Unorm, &data, sizeof(data));
+    });
+}
+
 DAWN_INSTANTIATE_TEST(IOSurfaceValidationTests, MetalBackend());
 DAWN_INSTANTIATE_TEST(IOSurfaceUsageTests, MetalBackend());
+DAWN_INSTANTIATE_TEST(IOSurfaceMultithreadTests, MetalBackend());
diff --git a/src/dawn/tests/end2end/MultithreadTests.cpp b/src/dawn/tests/end2end/MultithreadTests.cpp
index 94f5a35723..652866017d 100644
--- a/src/dawn/tests/end2end/MultithreadTests.cpp
+++ b/src/dawn/tests/end2end/MultithreadTests.cpp
@@ -12,28 +12,50 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <atomic>
+#include <condition_variable>
 #include <functional>
 #include <limits>
 #include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
 #include <thread>
+#include <utility>
 #include <vector>
 
 #include "dawn/common/Constants.h"
 #include "dawn/common/Math.h"
-#include "dawn/common/Mutex.h"
 #include "dawn/tests/DawnTest.h"
 #include "dawn/utils/ComboRenderPipelineDescriptor.h"
 #include "dawn/utils/TestUtils.h"
 #include "dawn/utils/TextureUtils.h"
 #include "dawn/utils/WGPUHelpers.h"
 
-#define LOCKED_CMD(CMD)                   \
-    do {                                  \
-        dawn::Mutex::AutoLock lk(&mutex); \
-        CMD;                              \
-    } while (0)
-
 namespace {
+template <typename Step>
+class LockStep {
+  public:
+    LockStep() = delete;
+    explicit LockStep(Step startStep) : mStep(startStep) {}
+
+    void Signal(Step step) {
+        std::lock_guard<std::mutex> lg(mMutex);
+        mStep = step;
+        mCv.notify_all();
+    }
+
+    void Wait(Step step) {
+        std::unique_lock<std::mutex> lg(mMutex);
+        mCv.wait(lg, [=] { return mStep == step; });
+    }
+
+  private:
+    Step mStep;
+    std::mutex mMutex;
+    std::condition_variable mCv;
+};
+
 class MultithreadTests : public DawnTest {
   protected:
     std::vector<wgpu::FeatureName> GetRequiredFeatures() override {
@@ -75,21 +97,309 @@ class MultithreadTests : public DawnTest {
         texDescriptor.sampleCount = sampleCount;
         return device.CreateTexture(&texDescriptor);
     }
+};
 
-    void RunInParallel(uint32_t numThreads, const std::function<void(uint32_t)>& workerFunc) {
-        std::vector<std::unique_ptr<std::thread>> threads(numThreads);
+// Test that dropping a device's last ref on another thread won't crash Instance::ProcessEvents.
+TEST_P(MultithreadTests, Device_DroppedOnAnotherThread) {
+    std::vector<wgpu::Device> devices(5);
 
-        for (uint32_t i = 0; i < threads.size(); ++i) {
-            threads[i] = std::make_unique<std::thread>([i, workerFunc] { workerFunc(i); });
-        }
-
-        for (auto& thread : threads) {
-            thread->join();
-        }
+    // Create devices.
+    for (size_t i = 0; i < devices.size(); ++i) {
+        devices[i] = CreateDevice();
     }
 
-    dawn::Mutex mutex;
-};
+    std::atomic<uint32_t> numAliveDevices = static_cast<uint32_t>(devices.size());
+
+    // Create threads
+    utils::RunInParallel(
+        numAliveDevices.load(),
+        [&devices, &numAliveDevices](uint32_t index) {
+            EXPECT_NE(devices[index].Get(), nullptr);
+
+            // Drop device.
+            devices[index] = nullptr;
+
+            numAliveDevices--;
+        },
+        [this, &numAliveDevices] {
+            while (numAliveDevices.load() > 0) {
+                // main thread process events from all devices
+                WaitABit();
+            }
+        });
+}
+
+// Test that dropping a device's last ref inside a callback on another thread won't crash
+// Instance::ProcessEvents.
+TEST_P(MultithreadTests, Device_DroppedInCallback_OnAnotherThread) {
+    std::vector<wgpu::Device> devices(10);
+
+    // Create devices.
+    for (auto& device : devices) {
+        device = CreateDevice();
+    }
+
+    // Create threads
+    utils::RunInParallel(static_cast<uint32_t>(devices.size()), [&devices, this](uint32_t index) {
+        auto additionalDevice = std::move(devices[index]);
+        struct UserData {
+            wgpu::Device device2ndRef;
+            std::atomic_bool isCompleted{false};
+        } userData;
+
+        userData.device2ndRef = additionalDevice;
+
+        // Drop the last ref inside a callback.
+        additionalDevice.PushErrorScope(wgpu::ErrorFilter::Validation);
+        additionalDevice.PopErrorScope(
+            [](WGPUErrorType type, const char*, void* userdataPtr) {
+                auto userdata = static_cast<UserData*>(userdataPtr);
+                userdata->device2ndRef = nullptr;
+                userdata->isCompleted = true;
+            },
+            &userData);
+        // main ref dropped.
+        additionalDevice = nullptr;
+
+        do {
+            WaitABit();
+        } while (!userData.isCompleted.load());
+
+        EXPECT_EQ(userData.device2ndRef, nullptr);
+    });
+}
+
+// Test that multiple buffers being created and mapped on multiple threads won't interfere with
+// each other.
+TEST_P(MultithreadTests, Buffers_MapInParallel) {
+    constexpr uint32_t kDataSize = 1000;
+    std::vector<uint32_t> myData;
+    for (uint32_t i = 0; i < kDataSize; ++i) {
+        myData.push_back(i);
+    }
+
+    constexpr uint32_t kSize = static_cast<uint32_t>(kDataSize * sizeof(uint32_t));
+
+    utils::RunInParallel(10, [=, &myData = std::as_const(myData)](uint32_t) {
+        wgpu::Buffer buffer;
+        std::atomic<bool> mapCompleted(false);
+
+        // Create buffer and request mapping.
+        buffer = CreateBuffer(kSize, wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc);
+
+        buffer.MapAsync(
+            wgpu::MapMode::Write, 0, kSize,
+            [](WGPUBufferMapAsyncStatus status, void* userdata) {
+                EXPECT_EQ(WGPUBufferMapAsyncStatus_Success, status);
+                (*static_cast<std::atomic<bool>*>(userdata)) = true;
+            },
+            &mapCompleted);
+
+        // Wait for the mapping to complete
+        while (!mapCompleted.load()) {
+            device.Tick();
+            FlushWire();
+        }
+
+        // Buffer is mapped, write into it and unmap .
+        memcpy(buffer.GetMappedRange(0, kSize), myData.data(), kSize);
+        buffer.Unmap();
+
+        // Check the content of the buffer.
+        EXPECT_BUFFER_U32_RANGE_EQ(myData.data(), buffer, 0, kDataSize);
+    });
+}
+
+// Test CreateComputePipelineAsync on multiple threads.
+TEST_P(MultithreadTests, CreateComputePipelineAsyncInParallel) {
+    // TODO(crbug.com/dawn/1766): TSAN reported race conditions in NVIDIA's vk driver.
+    DAWN_SUPPRESS_TEST_IF(IsVulkan() && IsNvidia() && IsTsan());
+
+    std::vector<wgpu::ComputePipeline> pipelines(10);
+    std::vector<std::string> shaderSources(pipelines.size());
+    std::vector<uint32_t> expectedValues(shaderSources.size());
+
+    for (uint32_t i = 0; i < pipelines.size(); ++i) {
+        expectedValues[i] = i + 1;
+
+        std::ostringstream ss;
+        ss << R"(
+        struct SSBO {
+            value : u32
+        }
+        @group(0) @binding(0) var<storage, read_write> ssbo : SSBO;
+
+        @compute @workgroup_size(1) fn main() {
+            ssbo.value =
+        )";
+        ss << expectedValues[i];
+        ss << ";}";
+
+        shaderSources[i] = ss.str();
+    }
+
+    // Create pipelines in parallel
+    utils::RunInParallel(static_cast<uint32_t>(pipelines.size()), [&](uint32_t index) {
+        wgpu::ComputePipelineDescriptor csDesc;
+        csDesc.compute.module = utils::CreateShaderModule(device, shaderSources[index].c_str());
+        csDesc.compute.entryPoint = "main";
+
+        struct Task {
+            wgpu::ComputePipeline computePipeline;
+            std::atomic<bool> isCompleted{false};
+        } task;
+        device.CreateComputePipelineAsync(
+            &csDesc,
+            [](WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline returnPipeline,
+               const char* message, void* userdata) {
+                EXPECT_EQ(WGPUCreatePipelineAsyncStatus::WGPUCreatePipelineAsyncStatus_Success,
+                          status);
+
+                auto task = static_cast<Task*>(userdata);
+                task->computePipeline = wgpu::ComputePipeline::Acquire(returnPipeline);
+                task->isCompleted = true;
+            },
+            &task);
+
+        while (!task.isCompleted.load()) {
+            WaitABit();
+        }
+
+        pipelines[index] = task.computePipeline;
+    });
+
+    // Verify pipelines' executions
+    for (uint32_t i = 0; i < pipelines.size(); ++i) {
+        wgpu::Buffer ssbo =
+            CreateBuffer(sizeof(uint32_t), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc);
+
+        wgpu::CommandBuffer commands;
+        {
+            wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+            wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+
+            ASSERT_NE(nullptr, pipelines[i].Get());
+            wgpu::BindGroup bindGroup =
+                utils::MakeBindGroup(device, pipelines[i].GetBindGroupLayout(0),
+                                     {
+                                         {0, ssbo, 0, sizeof(uint32_t)},
+                                     });
+            pass.SetBindGroup(0, bindGroup);
+            pass.SetPipeline(pipelines[i]);
+
+            pass.DispatchWorkgroups(1);
+            pass.End();
+
+            commands = encoder.Finish();
+        }
+
+        queue.Submit(1, &commands);
+
+        EXPECT_BUFFER_U32_EQ(expectedValues[i], ssbo, 0);
+    }
+}
+
+// Test CreateRenderPipelineAsync on multiple threads.
+TEST_P(MultithreadTests, CreateRenderPipelineAsyncInParallel) {
+    // TODO(crbug.com/dawn/1766): TSAN reported race conditions in NVIDIA's vk driver.
+    DAWN_SUPPRESS_TEST_IF(IsVulkan() && IsNvidia() && IsTsan());
+
+    constexpr uint32_t kNumThreads = 10;
+    constexpr wgpu::TextureFormat kRenderAttachmentFormat = wgpu::TextureFormat::RGBA8Unorm;
+    constexpr uint8_t kColorStep = 250 / kNumThreads;
+
+    std::vector<wgpu::RenderPipeline> pipelines(kNumThreads);
+    std::vector<std::string> fragmentShaderSources(kNumThreads);
+    std::vector<utils::RGBA8> minExpectedValues(kNumThreads);
+    std::vector<utils::RGBA8> maxExpectedValues(kNumThreads);
+
+    for (uint32_t i = 0; i < kNumThreads; ++i) {
+        // Due to floating point precision, we need to use min & max values to compare the
+        // expectations.
+        auto expectedGreen = kColorStep * i;
+        minExpectedValues[i] =
+            utils::RGBA8(0, expectedGreen == 0 ? 0 : (expectedGreen - 2), 0, 255);
+        maxExpectedValues[i] =
+            utils::RGBA8(0, expectedGreen == 255 ? 255 : (expectedGreen + 2), 0, 255);
+
+        std::ostringstream ss;
+        ss << R"(
+        @fragment fn main() -> @location(0) vec4f {
+            return vec4f(0.0,
+        )";
+        ss << expectedGreen / 255.0;
+        ss << ", 0.0, 1.0);}";
+
+        fragmentShaderSources[i] = ss.str();
+    }
+
+    // Create pipelines in parallel
+    utils::RunInParallel(kNumThreads, [&](uint32_t index) {
+        utils::ComboRenderPipelineDescriptor renderPipelineDescriptor;
+        wgpu::ShaderModule vsModule = utils::CreateShaderModule(device, R"(
+        @vertex fn main() -> @builtin(position) vec4f {
+            return vec4f(0.0, 0.0, 0.0, 1.0);
+        })");
+        wgpu::ShaderModule fsModule =
+            utils::CreateShaderModule(device, fragmentShaderSources[index].c_str());
+        renderPipelineDescriptor.vertex.module = vsModule;
+        renderPipelineDescriptor.cFragment.module = fsModule;
+        renderPipelineDescriptor.cTargets[0].format = kRenderAttachmentFormat;
+        renderPipelineDescriptor.primitive.topology = wgpu::PrimitiveTopology::PointList;
+
+        struct Task {
+            wgpu::RenderPipeline renderPipeline;
+            std::atomic<bool> isCompleted{false};
+        } task;
+        device.CreateRenderPipelineAsync(
+            &renderPipelineDescriptor,
+            [](WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline returnPipeline,
+               const char* message, void* userdata) {
+                EXPECT_EQ(WGPUCreatePipelineAsyncStatus::WGPUCreatePipelineAsyncStatus_Success,
+                          status);
+
+                auto* task = static_cast<Task*>(userdata);
+                task->renderPipeline = wgpu::RenderPipeline::Acquire(returnPipeline);
+                task->isCompleted = true;
+            },
+            &task);
+
+        while (!task.isCompleted) {
+            WaitABit();
+        }
+
+        pipelines[index] = task.renderPipeline;
+    });
+
+    // Verify pipelines' executions
+    for (uint32_t i = 0; i < pipelines.size(); ++i) {
+        wgpu::Texture outputTexture =
+            CreateTexture(1, 1, kRenderAttachmentFormat,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopySrc);
+
+        utils::ComboRenderPassDescriptor renderPassDescriptor({outputTexture.CreateView()});
+        renderPassDescriptor.cColorAttachments[0].loadOp = wgpu::LoadOp::Clear;
+        renderPassDescriptor.cColorAttachments[0].clearValue = {1.f, 0.f, 0.f, 1.f};
+
+        wgpu::CommandBuffer commands;
+        {
+            wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
+            wgpu::RenderPassEncoder renderPassEncoder =
+                encoder.BeginRenderPass(&renderPassDescriptor);
+
+            ASSERT_NE(nullptr, pipelines[i].Get());
+
+            renderPassEncoder.SetPipeline(pipelines[i]);
+            renderPassEncoder.Draw(1);
+            renderPassEncoder.End();
+            commands = encoder.Finish();
+        }
+
+        queue.Submit(1, &commands);
+
+        EXPECT_PIXEL_RGBA8_BETWEEN(minExpectedValues[i], maxExpectedValues[i], outputTexture, 0, 0);
+    }
+}
 
 class MultithreadCachingTests : public MultithreadTests {
   protected:
@@ -116,7 +426,7 @@ class MultithreadCachingTests : public MultithreadTests {
 // Test that creating a same shader module (which will return the cached shader module) and release
 // it on multiple threads won't race.
 TEST_P(MultithreadCachingTests, RefAndReleaseCachedShaderModulesInParallel) {
-    RunInParallel(100, [this](uint32_t) {
+    utils::RunInParallel(100, [this](uint32_t) {
         wgpu::ShaderModule csModule = CreateComputeShaderModule();
         EXPECT_NE(nullptr, csModule.Get());
     });
@@ -134,7 +444,7 @@ TEST_P(MultithreadCachingTests, RefAndReleaseCachedComputePipelinesInParallel) {
     csDesc.compute.entryPoint = "main";
     csDesc.layout = pipelineLayout;
 
-    RunInParallel(100, [&, this](uint32_t) {
+    utils::RunInParallel(100, [&, this](uint32_t) {
         wgpu::ComputePipeline pipeline = device.CreateComputePipeline(&csDesc);
         EXPECT_NE(nullptr, pipeline.Get());
     });
@@ -143,7 +453,7 @@ TEST_P(MultithreadCachingTests, RefAndReleaseCachedComputePipelinesInParallel) {
 // Test that creating a same bind group layout (which will return the cached layout) and
 // release it on multiple threads won't race.
 TEST_P(MultithreadCachingTests, RefAndReleaseCachedBindGroupLayoutsInParallel) {
-    RunInParallel(100, [&, this](uint32_t) {
+    utils::RunInParallel(100, [&, this](uint32_t) {
         wgpu::BindGroupLayout layout = CreateComputeBindGroupLayout();
         EXPECT_NE(nullptr, layout.Get());
     });
@@ -154,7 +464,7 @@ TEST_P(MultithreadCachingTests, RefAndReleaseCachedBindGroupLayoutsInParallel) {
 TEST_P(MultithreadCachingTests, RefAndReleaseCachedPipelineLayoutsInParallel) {
     wgpu::BindGroupLayout bglayout = CreateComputeBindGroupLayout();
 
-    RunInParallel(100, [&, this](uint32_t) {
+    utils::RunInParallel(100, [&, this](uint32_t) {
         wgpu::PipelineLayout pipelineLayout = utils::MakePipelineLayout(device, {bglayout});
         EXPECT_NE(nullptr, pipelineLayout.Get());
     });
@@ -177,7 +487,7 @@ TEST_P(MultithreadCachingTests, RefAndReleaseCachedRenderPipelinesInParallel) {
     renderPipelineDescriptor.cTargets[0].format = wgpu::TextureFormat::RGBA8Unorm;
     renderPipelineDescriptor.primitive.topology = wgpu::PrimitiveTopology::PointList;
 
-    RunInParallel(100, [&, this](uint32_t) {
+    utils::RunInParallel(100, [&, this](uint32_t) {
         wgpu::RenderPipeline pipeline = device.CreateRenderPipeline(&renderPipelineDescriptor);
         EXPECT_NE(nullptr, pipeline.Get());
     });
@@ -187,7 +497,7 @@ TEST_P(MultithreadCachingTests, RefAndReleaseCachedRenderPipelinesInParallel) {
 // on multiple threads won't race.
 TEST_P(MultithreadCachingTests, RefAndReleaseCachedSamplersInParallel) {
     wgpu::SamplerDescriptor desc = {};
-    RunInParallel(100, [&, this](uint32_t) {
+    utils::RunInParallel(100, [&, this](uint32_t) {
         wgpu::Sampler sampler = device.CreateSampler(&desc);
         EXPECT_NE(nullptr, sampler.Get());
     });
@@ -213,7 +523,7 @@ TEST_P(MultithreadEncodingTests, RenderPassEncodersInParallel) {
 
     std::vector<wgpu::CommandBuffer> commandBuffers(kNumThreads);
 
-    RunInParallel(kNumThreads, [=, &commandBuffers](uint32_t index) {
+    utils::RunInParallel(kNumThreads, [=, &commandBuffers](uint32_t index) {
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
 
         // Clear the renderTarget to red.
@@ -263,7 +573,7 @@ TEST_P(MultithreadEncodingTests, ComputePassEncodersInParallel) {
 
     std::vector<wgpu::CommandBuffer> commandBuffers(kNumThreads);
 
-    RunInParallel(kNumThreads, [=, &commandBuffers](uint32_t index) {
+    utils::RunInParallel(kNumThreads, [=, &commandBuffers](uint32_t index) {
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
         wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
         pass.SetPipeline(pipeline);
@@ -284,6 +594,494 @@ TEST_P(MultithreadEncodingTests, ComputePassEncodersInParallel) {
     }
 }
 
+class MultithreadTextureCopyTests : public MultithreadTests {
+  protected:
+    void SetUp() override {
+        MultithreadTests::SetUp();
+
+        // TODO(crbug.com/dawn/1291): These tests are failing on GLES (both native and ANGLE)
+        // when using Tint/GLSL.
+        DAWN_TEST_UNSUPPORTED_IF(IsOpenGLES());
+    }
+
+    wgpu::Texture CreateAndWriteTexture(uint32_t width,
+                                        uint32_t height,
+                                        wgpu::TextureFormat format,
+                                        wgpu::TextureUsage usage,
+                                        const void* data,
+                                        size_t dataSize) {
+        auto texture = CreateTexture(width, height, format, wgpu::TextureUsage::CopyDst | usage);
+
+        wgpu::Extent3D textureSize = {width, height, 1};
+
+        wgpu::ImageCopyTexture imageCopyTexture =
+            utils::CreateImageCopyTexture(texture, 0, {0, 0, 0}, wgpu::TextureAspect::All);
+        wgpu::TextureDataLayout textureDataLayout =
+            utils::CreateTextureDataLayout(0, dataSize / height);
+
+        queue.WriteTexture(&imageCopyTexture, data, dataSize, &textureDataLayout, &textureSize);
+
+        return texture;
+    }
+
+    uint32_t BufferSizeForTextureCopy(uint32_t width, uint32_t height, wgpu::TextureFormat format) {
+        uint32_t bytesPerRow = utils::GetMinimumBytesPerRow(format, width);
+        return utils::RequiredBytesInCopy(bytesPerRow, height, {width, height, 1}, format);
+    }
+
+    void CopyTextureToTextureHelper(
+        const wgpu::Texture& srcTexture,
+        const wgpu::ImageCopyTexture& dst,
+        const wgpu::Extent3D& dstSize,
+        const wgpu::CommandEncoder& encoder,
+        const wgpu::CopyTextureForBrowserOptions* copyForBrowerOptions = nullptr) {
+        wgpu::ImageCopyTexture srcView =
+            utils::CreateImageCopyTexture(srcTexture, 0, {0, 0, 0}, wgpu::TextureAspect::All);
+
+        if (copyForBrowerOptions == nullptr) {
+            encoder.CopyTextureToTexture(&srcView, &dst, &dstSize);
+
+            wgpu::CommandBuffer commands = encoder.Finish();
+            queue.Submit(1, &commands);
+        } else {
+            // Don't need encoder
+            ASSERT(encoder == nullptr);
+            queue.CopyTextureForBrowser(&srcView, &dst, &dstSize, copyForBrowerOptions);
+        }
+    }
+
+    void CopyBufferToTextureHelper(const wgpu::Buffer& srcBuffer,
+                                   uint32_t srcBytesPerRow,
+                                   const wgpu::ImageCopyTexture& dst,
+                                   const wgpu::Extent3D& dstSize,
+                                   const wgpu::CommandEncoder& encoder) {
+        wgpu::ImageCopyBuffer srcView =
+            utils::CreateImageCopyBuffer(srcBuffer, 0, srcBytesPerRow, dstSize.height);
+
+        encoder.CopyBufferToTexture(&srcView, &dst, &dstSize);
+
+        wgpu::CommandBuffer commands = encoder.Finish();
+        queue.Submit(1, &commands);
+    }
+};
+
+// Test that depth texture's CopyTextureToTexture() can work in parallel with other commands (such
+// resources creation and texture to buffer copy for texture expectations).
+// This test is needed since most of command encoder's commands are not synchronized, but
+// CopyTextureToTexture() command might internally allocate resources and we need to make sure that
+// it won't race with other threads' works.
+TEST_P(MultithreadTextureCopyTests, CopyDepthToDepthNoRace) {
+    enum class Step {
+        Begin,
+        WriteTexture,
+    };
+
+    constexpr uint32_t kWidth = 4;
+    constexpr uint32_t kHeight = 4;
+
+    const std::vector<float> kExpectedData32 = {
+        0,    0,    0,    0,  //
+        0,    0,    0.4f, 0,  //
+        1.0f, 1.0f, 0,    0,  //
+        1.0f, 1.0f, 0,    0,  //
+    };
+
+    std::vector<uint16_t> kExpectedData16(kExpectedData32.size());
+    for (size_t i = 0; i < kExpectedData32.size(); ++i) {
+        kExpectedData16[i] = kExpectedData32[i] * std::numeric_limits<uint16_t>::max();
+    }
+
+    const size_t kExpectedDataSize16 = kExpectedData16.size() * sizeof(kExpectedData16[0]);
+
+    LockStep<Step> lockStep(Step::Begin);
+
+    wgpu::Texture depthTexture;
+    std::thread writeThread([&] {
+        depthTexture = CreateAndWriteTexture(
+            kWidth, kHeight, wgpu::TextureFormat::Depth16Unorm,
+            wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::RenderAttachment,
+            kExpectedData16.data(), kExpectedDataSize16);
+
+        lockStep.Signal(Step::WriteTexture);
+
+        // Verify the initial data
+        ExpectAttachmentDepthTestData(depthTexture, wgpu::TextureFormat::Depth16Unorm, kWidth,
+                                      kHeight, 0, /*mipLevel=*/0, kExpectedData32);
+    });
+
+    std::thread copyThread([&] {
+        auto destTexture =
+            CreateTexture(kWidth * 2, kHeight * 2, wgpu::TextureFormat::Depth16Unorm,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopyDst |
+                              wgpu::TextureUsage::CopySrc,
+                          /*mipLevelCount=*/2);
+
+        // Copy from depthTexture to destTexture.
+        const wgpu::Extent3D dstSize = {kWidth, kHeight, 1};
+        wgpu::ImageCopyTexture dest = utils::CreateImageCopyTexture(
+            destTexture, /*dstMipLevel=*/1, {0, 0, 0}, wgpu::TextureAspect::All);
+        auto encoder = device.CreateCommandEncoder();
+        lockStep.Wait(Step::WriteTexture);
+        CopyTextureToTextureHelper(depthTexture, dest, dstSize, encoder);
+
+        // Verify the copied data
+        ExpectAttachmentDepthTestData(destTexture, wgpu::TextureFormat::Depth16Unorm, kWidth,
+                                      kHeight, 0, /*mipLevel=*/1, kExpectedData32);
+    });
+
+    writeThread.join();
+    copyThread.join();
+}
+
+// Test that depth texture's CopyBufferToTexture() can work in parallel with other commands (such
+// resources creation and texture to buffer copy for texture expectations).
+// This test is needed since most of command encoder's commands are not synchronized, but
+// CopyBufferToTexture() command might internally allocate resources and we need to make sure that
+// it won't race with other threads' works.
+TEST_P(MultithreadTextureCopyTests, CopyBufferToDepthNoRace) {
+    enum class Step {
+        Begin,
+        WriteBuffer,
+    };
+
+    constexpr uint32_t kWidth = 16;
+    constexpr uint32_t kHeight = 1;
+
+    const std::vector<float> kExpectedData32 = {
+        0,    0,    0,    0,  //
+        0,    0,    0.4f, 0,  //
+        1.0f, 1.0f, 0,    0,  //
+        1.0f, 1.0f, 0,    0,  //
+    };
+
+    std::vector<uint16_t> kExpectedData16(kExpectedData32.size());
+    for (size_t i = 0; i < kExpectedData32.size(); ++i) {
+        kExpectedData16[i] = kExpectedData32[i] * std::numeric_limits<uint16_t>::max();
+    }
+
+    const uint32_t kExpectedDataSize16 = kExpectedData16.size() * sizeof(kExpectedData16[0]);
+
+    const wgpu::Extent3D kSize = {kWidth, kHeight, 1};
+    LockStep<Step> lockStep(Step::Begin);
+
+    wgpu::Buffer buffer;
+    std::thread writeThread([&] {
+        buffer = CreateBuffer(
+            BufferSizeForTextureCopy(kWidth, kHeight, wgpu::TextureFormat::Depth16Unorm),
+            wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::CopySrc);
+
+        queue.WriteBuffer(buffer, 0, kExpectedData16.data(), kExpectedDataSize16);
+        device.Tick();
+
+        lockStep.Signal(Step::WriteBuffer);
+
+        EXPECT_BUFFER_U16_RANGE_EQ(kExpectedData16.data(), buffer, 0, kExpectedData16.size());
+    });
+
+    std::thread copyThread([&] {
+        auto destTexture =
+            CreateTexture(kWidth, kHeight, wgpu::TextureFormat::Depth16Unorm,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopyDst |
+                              wgpu::TextureUsage::CopySrc);
+
+        auto encoder = device.CreateCommandEncoder();
+
+        wgpu::ImageCopyTexture dest = utils::CreateImageCopyTexture(
+            destTexture, /*dstMipLevel=*/0, {0, 0, 0}, wgpu::TextureAspect::All);
+
+        // Wait until src buffer is written.
+        lockStep.Wait(Step::WriteBuffer);
+        CopyBufferToTextureHelper(buffer, kTextureBytesPerRowAlignment, dest, kSize, encoder);
+
+        // Verify the copied data
+        ExpectAttachmentDepthTestData(destTexture, wgpu::TextureFormat::Depth16Unorm, kWidth,
+                                      kHeight, 0, /*mipLevel=*/0, kExpectedData32);
+    });
+
+    writeThread.join();
+    copyThread.join();
+}
+
+// Test that stencil texture's CopyTextureToTexture() can work in parallel with other commands (such
+// resources creation and texture to buffer copy for texture expectations).
+// This test is needed since most of command encoder's commands are not synchronized, but
+// CopyTextureToTexture() command might internally allocate resources and we need to make sure that
+// it won't race with other threads' works.
+TEST_P(MultithreadTextureCopyTests, CopyStencilToStencilNoRace) {
+    // TODO(crbug.com/dawn/1497): glReadPixels: GL error: HIGH: Invalid format and type
+    // combination.
+    DAWN_SUPPRESS_TEST_IF(IsANGLE());
+
+    // TODO(crbug.com/dawn/667): Work around the fact that some platforms are unable to read
+    // stencil.
+    DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("disable_depth_stencil_read"));
+
+    enum class Step {
+        Begin,
+        WriteTexture,
+    };
+
+    constexpr uint32_t kWidth = 1;
+    constexpr uint32_t kHeight = 1;
+
+    constexpr uint8_t kExpectedData = 177;
+    constexpr size_t kExpectedDataSize = sizeof(kExpectedData);
+
+    LockStep<Step> lockStep(Step::Begin);
+
+    wgpu::Texture stencilTexture;
+    std::thread writeThread([&] {
+        stencilTexture = CreateAndWriteTexture(
+            kWidth, kHeight, wgpu::TextureFormat::Stencil8,
+            wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::RenderAttachment, &kExpectedData,
+            kExpectedDataSize);
+
+        lockStep.Signal(Step::WriteTexture);
+
+        // Verify the initial data
+        ExpectAttachmentStencilTestData(stencilTexture, wgpu::TextureFormat::Stencil8, kWidth,
+                                        kHeight, 0, /*mipLevel=*/0, kExpectedData);
+    });
+
+    std::thread copyThread([&] {
+        auto destTexture =
+            CreateTexture(kWidth * 2, kHeight * 2, wgpu::TextureFormat::Stencil8,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopyDst |
+                              wgpu::TextureUsage::CopySrc,
+                          /*mipLevelCount=*/2);
+
+        // Copy from stencilTexture to destTexture.
+        const wgpu::Extent3D dstSize = {kWidth, kHeight, 1};
+        wgpu::ImageCopyTexture dest = utils::CreateImageCopyTexture(
+            destTexture, /*dstMipLevel=*/1, {0, 0, 0}, wgpu::TextureAspect::All);
+        auto encoder = device.CreateCommandEncoder();
+        lockStep.Wait(Step::WriteTexture);
+
+        CopyTextureToTextureHelper(stencilTexture, dest, dstSize, encoder);
+
+        // Verify the copied data
+        ExpectAttachmentStencilTestData(destTexture, wgpu::TextureFormat::Stencil8, kWidth, kHeight,
+                                        0, /*mipLevel=*/1, kExpectedData);
+    });
+
+    writeThread.join();
+    copyThread.join();
+}
+
+// Test that stencil texture's CopyBufferToTexture() can work in parallel with other commands (such
+// resources creation and texture to buffer copy for texture expectations).
+// This test is needed since most of command encoder's commands are not synchronized, but
+// CopyBufferToTexture() command might internally allocate resources and we need to make sure that
+// it won't race with other threads' works.
+TEST_P(MultithreadTextureCopyTests, CopyBufferToStencilNoRace) {
+    enum class Step {
+        Begin,
+        WriteBuffer,
+    };
+
+    constexpr uint32_t kWidth = 1;
+    constexpr uint32_t kHeight = 1;
+
+    constexpr uint8_t kExpectedData = 177;
+
+    const wgpu::Extent3D kSize = {kWidth, kHeight, 1};
+    LockStep<Step> lockStep(Step::Begin);
+
+    wgpu::Buffer buffer;
+    std::thread writeThread([&] {
+        const auto kBufferSize = kTextureBytesPerRowAlignment;
+        buffer = CreateBuffer(kBufferSize, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::CopySrc);
+
+        std::vector<uint8_t> bufferData(kBufferSize);
+        bufferData[0] = kExpectedData;
+
+        queue.WriteBuffer(buffer.Get(), 0, bufferData.data(), kBufferSize);
+        device.Tick();
+
+        lockStep.Signal(Step::WriteBuffer);
+
+        EXPECT_BUFFER_U8_EQ(kExpectedData, buffer, 0);
+    });
+
+    std::thread copyThread([&] {
+        auto destTexture =
+            CreateTexture(kWidth, kHeight, wgpu::TextureFormat::Stencil8,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopyDst |
+                              wgpu::TextureUsage::CopySrc);
+
+        auto encoder = device.CreateCommandEncoder();
+
+        wgpu::ImageCopyTexture dest = utils::CreateImageCopyTexture(
+            destTexture, /*dstMipLevel=*/0, {0, 0, 0}, wgpu::TextureAspect::All);
+
+        // Wait until src buffer is written.
+        lockStep.Wait(Step::WriteBuffer);
+        CopyBufferToTextureHelper(buffer, kTextureBytesPerRowAlignment, dest, kSize, encoder);
+
+        // Verify the copied data
+        ExpectAttachmentStencilTestData(destTexture, wgpu::TextureFormat::Stencil8, kWidth, kHeight,
+                                        0, /*mipLevel=*/0, kExpectedData);
+    });
+
+    writeThread.join();
+    copyThread.join();
+}
+
+// Test that color texture's CopyTextureForBrowser() can work in parallel with other commands (such
+// resources creation and texture to buffer copy for texture expectations).
+// This test is needed since CopyTextureForBrowser() command might internally allocate resources and
+// we need to make sure that it won't race with other threads' works.
+TEST_P(MultithreadTextureCopyTests, CopyTextureForBrowserNoRace) {
+    // TODO(crbug.com/dawn/1232): Program link error on OpenGLES backend
+    DAWN_SUPPRESS_TEST_IF(IsOpenGLES());
+    DAWN_SUPPRESS_TEST_IF(IsOpenGL() && IsLinux());
+
+    enum class Step {
+        Begin,
+        WriteTexture,
+    };
+
+    constexpr uint32_t kWidth = 4;
+    constexpr uint32_t kHeight = 4;
+
+    const std::vector<utils::RGBA8> kExpectedData = {
+        utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kGreen, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kRed,   utils::RGBA8::kRed,   utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kRed,   utils::RGBA8::kBlue,  utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+    };
+
+    const std::vector<utils::RGBA8> kExpectedFlippedData = {
+        utils::RGBA8::kRed,   utils::RGBA8::kBlue,  utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kRed,   utils::RGBA8::kRed,   utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kGreen, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+    };
+
+    const size_t kExpectedDataSize = kExpectedData.size() * sizeof(kExpectedData[0]);
+
+    LockStep<Step> lockStep(Step::Begin);
+
+    wgpu::Texture srcTexture;
+    std::thread writeThread([&] {
+        srcTexture =
+            CreateAndWriteTexture(kWidth, kHeight, wgpu::TextureFormat::RGBA8Unorm,
+                                  wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::TextureBinding,
+                                  kExpectedData.data(), kExpectedDataSize);
+
+        lockStep.Signal(Step::WriteTexture);
+
+        // Verify the initial data
+        EXPECT_TEXTURE_EQ(kExpectedData.data(), srcTexture, {0, 0}, {kWidth, kHeight});
+    });
+
+    std::thread copyThread([&] {
+        auto destTexture =
+            CreateTexture(kWidth, kHeight, wgpu::TextureFormat::RGBA8Unorm,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopyDst |
+                              wgpu::TextureUsage::CopySrc);
+
+        // Copy from srcTexture to destTexture.
+        const wgpu::Extent3D dstSize = {kWidth, kHeight, 1};
+        wgpu::ImageCopyTexture dest = utils::CreateImageCopyTexture(
+            destTexture, /*dstMipLevel=*/0, {0, 0, 0}, wgpu::TextureAspect::All);
+        wgpu::CopyTextureForBrowserOptions options;
+        options.flipY = true;
+
+        lockStep.Wait(Step::WriteTexture);
+        CopyTextureToTextureHelper(srcTexture, dest, dstSize, nullptr, &options);
+
+        // Verify the copied data
+        EXPECT_TEXTURE_EQ(kExpectedFlippedData.data(), destTexture, {0, 0}, {kWidth, kHeight});
+    });
+
+    writeThread.join();
+    copyThread.join();
+}
+
+// Test that error from CopyTextureForBrowser() won't cause deadlock.
+TEST_P(MultithreadTextureCopyTests, CopyTextureForBrowserErrorNoDeadLock) {
+    // TODO(crbug.com/dawn/1232): Program link error on OpenGLES backend
+    DAWN_SUPPRESS_TEST_IF(IsOpenGLES());
+    DAWN_SUPPRESS_TEST_IF(IsOpenGL() && IsLinux());
+
+    DAWN_TEST_UNSUPPORTED_IF(HasToggleEnabled("skip_validation"));
+
+    enum class Step {
+        Begin,
+        WriteTexture,
+    };
+
+    constexpr uint32_t kWidth = 4;
+    constexpr uint32_t kHeight = 4;
+
+    const std::vector<utils::RGBA8> kExpectedData = {
+        utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kBlack, utils::RGBA8::kBlack, utils::RGBA8::kGreen, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kRed,   utils::RGBA8::kRed,   utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+        utils::RGBA8::kRed,   utils::RGBA8::kBlue,  utils::RGBA8::kBlack, utils::RGBA8::kBlack,  //
+    };
+
+    const size_t kExpectedDataSize = kExpectedData.size() * sizeof(kExpectedData[0]);
+
+    LockStep<Step> lockStep(Step::Begin);
+
+    wgpu::Texture srcTexture;
+    std::thread writeThread([&] {
+        srcTexture =
+            CreateAndWriteTexture(kWidth, kHeight, wgpu::TextureFormat::RGBA8Unorm,
+                                  wgpu::TextureUsage::CopySrc | wgpu::TextureUsage::TextureBinding,
+                                  kExpectedData.data(), kExpectedDataSize);
+
+        lockStep.Signal(Step::WriteTexture);
+
+        // Verify the initial data
+        EXPECT_TEXTURE_EQ(kExpectedData.data(), srcTexture, {0, 0}, {kWidth, kHeight});
+    });
+
+    std::thread copyThread([&] {
+        wgpu::Texture invalidSrcTexture;
+        invalidSrcTexture = CreateTexture(kWidth, kHeight, wgpu::TextureFormat::RGBA8Unorm,
+                                          wgpu::TextureUsage::CopySrc);
+        auto destTexture =
+            CreateTexture(kWidth, kHeight, wgpu::TextureFormat::RGBA8Unorm,
+                          wgpu::TextureUsage::RenderAttachment | wgpu::TextureUsage::CopyDst |
+                              wgpu::TextureUsage::CopySrc);
+
+        // Copy from srcTexture to destTexture.
+        const wgpu::Extent3D dstSize = {kWidth, kHeight, 1};
+        wgpu::ImageCopyTexture dest = utils::CreateImageCopyTexture(
+            destTexture, /*dstMipLevel=*/0, {0, 0, 0}, wgpu::TextureAspect::All);
+        wgpu::CopyTextureForBrowserOptions options = {};
+
+        device.PushErrorScope(wgpu::ErrorFilter::Validation);
+
+        // The first copy should be an error because of missing TextureBinding from src texture.
+        lockStep.Wait(Step::WriteTexture);
+        CopyTextureToTextureHelper(invalidSrcTexture, dest, dstSize, nullptr, &options);
+
+        std::atomic<bool> errorThrown(false);
+        device.PopErrorScope(
+            [](WGPUErrorType type, char const* message, void* userdata) {
+                EXPECT_EQ(type, WGPUErrorType_Validation);
+                auto error = static_cast<std::atomic<bool>*>(userdata);
+                *error = true;
+            },
+            &errorThrown);
+        device.Tick();
+        EXPECT_TRUE(errorThrown.load());
+
+        // Second copy is valid.
+        CopyTextureToTextureHelper(srcTexture, dest, dstSize, nullptr, &options);
+
+        // Verify the copied data
+        EXPECT_TEXTURE_EQ(kExpectedData.data(), destTexture, {0, 0}, {kWidth, kHeight});
+    });
+
+    writeThread.join();
+    copyThread.join();
+}
+
 class MultithreadDrawIndexedIndirectTests : public MultithreadTests {
   protected:
     void SetUp() override {
@@ -376,8 +1174,8 @@ class MultithreadDrawIndexedIndirectTests : public MultithreadTests {
                   wgpu::CommandBuffer commands) {
         queue.Submit(1, &commands);
 
-        LOCKED_CMD(EXPECT_PIXEL_RGBA8_EQ(bottomLeftExpected, renderPass.color, 1, 3));
-        LOCKED_CMD(EXPECT_PIXEL_RGBA8_EQ(topRightExpected, renderPass.color, 3, 1));
+        EXPECT_PIXEL_RGBA8_EQ(bottomLeftExpected, renderPass.color, 1, 3);
+        EXPECT_PIXEL_RGBA8_EQ(topRightExpected, renderPass.color, 3, 1);
     }
 
     wgpu::RenderPipeline pipeline;
@@ -397,7 +1195,7 @@ TEST_P(MultithreadDrawIndexedIndirectTests, IndirectOffsetInParallel) {
     utils::RGBA8 filled(0, 255, 0, 255);
     utils::RGBA8 notFilled(0, 0, 0, 0);
 
-    RunInParallel(10, [=](uint32_t) {
+    utils::RunInParallel(10, [=](uint32_t) {
         // Test an offset draw call, with indirect buffer containing 2 calls:
         // 1) first 3 indices of the second quad (top right triangle)
         // 2) last 3 indices of the second quad
@@ -473,7 +1271,7 @@ TEST_P(MultithreadTimestampQueryTests, ResolveQuerySets_InParallel) {
         destinations[i] = CreateResolveBuffer(kQueryCount * sizeof(uint64_t));
     }
 
-    RunInParallel(kNumThreads, [&](uint32_t index) {
+    utils::RunInParallel(kNumThreads, [&](uint32_t index) {
         const auto& querySet = querySets[index];
         const auto& destination = destinations[index];
         wgpu::CommandEncoder encoder = device.CreateCommandEncoder();
@@ -483,13 +1281,19 @@ TEST_P(MultithreadTimestampQueryTests, ResolveQuerySets_InParallel) {
         wgpu::CommandBuffer commands = encoder.Finish();
         queue.Submit(1, &commands);
 
-        LOCKED_CMD(EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t),
-                                 new TimestampExpectation));
+        EXPECT_BUFFER(destination, 0, kQueryCount * sizeof(uint64_t), new TimestampExpectation);
     });
 }
 
 }  // namespace
 
+DAWN_INSTANTIATE_TEST(MultithreadTests,
+                      D3D12Backend(),
+                      MetalBackend(),
+                      OpenGLBackend(),
+                      OpenGLESBackend(),
+                      VulkanBackend());
+
 DAWN_INSTANTIATE_TEST(MultithreadCachingTests,
                       D3D12Backend(),
                       MetalBackend(),
@@ -504,6 +1308,17 @@ DAWN_INSTANTIATE_TEST(MultithreadEncodingTests,
                       OpenGLESBackend(),
                       VulkanBackend());
 
+DAWN_INSTANTIATE_TEST(
+    MultithreadTextureCopyTests,
+    D3D12Backend(),
+    MetalBackend(),
+    MetalBackend({"use_blit_for_buffer_to_depth_texture_copy",
+                  "use_blit_for_depth_texture_to_texture_copy_to_nonzero_subresource"}),
+    MetalBackend({"use_blit_for_buffer_to_stencil_texture_copy"}),
+    OpenGLBackend(),
+    OpenGLESBackend(),
+    VulkanBackend());
+
 DAWN_INSTANTIATE_TEST(MultithreadDrawIndexedIndirectTests,
                       D3D12Backend(),
                       MetalBackend(),
diff --git a/src/dawn/utils/TestUtils.cpp b/src/dawn/utils/TestUtils.cpp
index 3216402abf..bf7b247802 100644
--- a/src/dawn/utils/TestUtils.cpp
+++ b/src/dawn/utils/TestUtils.cpp
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <memory>
 #include <ostream>
+#include <thread>
 #include <vector>
 
 #include "dawn/common/Assert.h"
@@ -190,4 +192,22 @@ uint32_t VertexFormatSize(wgpu::VertexFormat format) {
     UNREACHABLE();
 }
 
+void RunInParallel(uint32_t numThreads,
+                   const std::function<void(uint32_t)>& workerFunc,
+                   const std::function<void()>& mainThreadFunc) {
+    std::vector<std::unique_ptr<std::thread>> threads(numThreads);
+
+    for (uint32_t i = 0; i < threads.size(); ++i) {
+        threads[i] = std::make_unique<std::thread>([i, workerFunc] { workerFunc(i); });
+    }
+
+    if (mainThreadFunc != nullptr) {
+        mainThreadFunc();
+    }
+
+    for (auto& thread : threads) {
+        thread->join();
+    }
+}
+
 }  // namespace utils
diff --git a/src/dawn/utils/TestUtils.h b/src/dawn/utils/TestUtils.h
index 5c359b75b7..9654b72116 100644
--- a/src/dawn/utils/TestUtils.h
+++ b/src/dawn/utils/TestUtils.h
@@ -15,6 +15,7 @@
 #ifndef SRC_DAWN_UTILS_TESTUTILS_H_
 #define SRC_DAWN_UTILS_TESTUTILS_H_
 
+#include <functional>
 #include <ostream>
 
 #include "dawn/webgpu_cpp.h"
@@ -84,6 +85,10 @@ void UnalignDynamicUploader(wgpu::Device device);
 
 uint32_t VertexFormatSize(wgpu::VertexFormat format);
 
+void RunInParallel(uint32_t numThreads,
+                   const std::function<void(uint32_t)>& workerFunc,
+                   const std::function<void()>& mainThreadFunc = nullptr);
+
 }  // namespace utils
 
 #endif  // SRC_DAWN_UTILS_TESTUTILS_H_