D3D12: Add workaround for T2T copy issue on Intel GPUs

On Intel Gen9 (SKL) and Gen9.5 (KBL, CFL, CML) GPUs with the latest Intel D3D12 driver (27.20.100.9316), there is a bug in the CopyTextureRegion() command when doing a texture-to-texture copy with formats whose texel block size is less than 4 bytes and with a source mipmap level greater than the destination mipmap level. This patch adds a workaround for this driver bug by implementing the T2T copy as one T2B copy followed by one B2T copy. BUG=chromium:1161355 TEST=dawn_end2end_tests Change-Id: I688bb8bae277832aaba1be2680012040ee8e1160 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/43860 Reviewed-by: Corentin Wallez <cwallez@chromium.org> Commit-Queue: Austin Eng <enga@chromium.org>
2025-12-10 05:57:51 +00:00 · 2021-03-11 19:34:50 +00:00
parent 9f6bc4e3a9
commit f905e57be2
9 changed files with 213 additions and 47 deletions
--- a/src/common/GPUInfo.cpp
+++ b/src/common/GPUInfo.cpp
@@ -14,7 +14,31 @@

 #include "common/GPUInfo.h"

+#include <algorithm>
+
 namespace gpu_info {
+    namespace {
+        // PCI device IDs of Intel GPUs, grouped by architecture.
+        // Referenced from the following Mesa source code:
+        // https://github.com/mesa3d/mesa/blob/master/include/pci_ids/i965_pci_ids.h
+        // Gen9: Skylake
+        const uint32_t Skylake[] = {0x1902, 0x1906, 0x190A, 0x190B, 0x190E, 0x1912, 0x1913,
+                                    0x1915, 0x1916, 0x1917, 0x191A, 0x191B, 0x191D, 0x191E,
+                                    0x1921, 0x1923, 0x1926, 0x1927, 0x192A, 0x192B, 0x192D,
+                                    0x1932, 0x193A, 0x193B, 0x193D};
+        // Gen9.5: Kaby Lake, Coffee Lake, Whiskey Lake and Comet Lake
+        const uint32_t Kabylake[] = {0x5916, 0x5913, 0x5906, 0x5926, 0x5921, 0x5915, 0x590E,
+                                     0x591E, 0x5912, 0x5917, 0x5902, 0x591B, 0x593B, 0x590B,
+                                     0x591A, 0x590A, 0x591D, 0x5908, 0x5923, 0x5927};
+        const uint32_t Coffeelake[] = {0x87CA, 0x3E90, 0x3E93, 0x3E99, 0x3E9C, 0x3E91,
+                                       0x3E92, 0x3E96, 0x3E98, 0x3E9A, 0x3E9B, 0x3E94,
+                                       0x3EA9, 0x3EA5, 0x3EA6, 0x3EA7, 0x3EA8};
+        const uint32_t WhiskyLake[] = {0x3EA1, 0x3EA4, 0x3EA0, 0x3EA3, 0x3EA2};
+        const uint32_t CometLake[] = {0x9B21, 0x9BA0, 0x9BA2, 0x9BA4, 0x9BA5, 0x9BA8, 0x9BAA,
+                                      0x9BAB, 0x9BAC, 0x9B41, 0x9BC0, 0x9BC2, 0x9BC4, 0x9BC5,
+                                      0x9BC6, 0x9BC8, 0x9BCA, 0x9BCB, 0x9BCC, 0x9BE6, 0x9BF6};
+    }  // anonymous namespace
+
    bool IsAMD(PCIVendorID vendorId) {
        return vendorId == kVendorID_AMD;
    }
@@ -39,4 +63,20 @@ namespace gpu_info {
    bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId) {
        return vendorId == kVendorID_Microsoft && deviceId == kDeviceID_WARP;
    }
+
+    // Intel GPUs: each check looks |deviceId| up in the device ID list of the architecture.
+    bool IsSkylake(PCIDeviceID deviceId) {
+        return std::count(std::begin(Skylake), std::end(Skylake), deviceId) > 0;
+    }
+    bool IsKabylake(PCIDeviceID deviceId) {
+        return std::count(std::begin(Kabylake), std::end(Kabylake), deviceId) > 0;
+    }
+    // Besides the Coffee Lake IDs, this also accepts the Whiskey Lake and Comet Lake ones.
+    bool IsCoffeelake(PCIDeviceID deviceId) {
+        // Shared predicate for the three device ID lists.
+        const auto isDeviceId = [deviceId](uint32_t knownId) { return knownId == deviceId; };
+        return std::any_of(std::begin(Coffeelake), std::end(Coffeelake), isDeviceId) ||
+               std::any_of(std::begin(WhiskyLake), std::end(WhiskyLake), isDeviceId) ||
+               std::any_of(std::begin(CometLake), std::end(CometLake), isDeviceId);
+    }
 }  // namespace gpu_info
--- a/src/common/GPUInfo.h
+++ b/src/common/GPUInfo.h
@@ -43,5 +43,10 @@ namespace gpu_info {
    bool IsSwiftshader(PCIVendorID vendorId, PCIDeviceID deviceId);
    bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId);

+    // Intel architectures. IsCoffeelake() is also true for Whiskey Lake and Comet Lake IDs.
+    bool IsSkylake(PCIDeviceID deviceId);
+    bool IsKabylake(PCIDeviceID deviceId);
+    bool IsCoffeelake(PCIDeviceID deviceId);
+
 }  // namespace gpu_info
 #endif  // COMMON_GPUINFO_H
--- a/src/dawn_native/Toggles.cpp
+++ b/src/dawn_native/Toggles.cpp
@@ -172,7 +172,17 @@ namespace dawn_native {
              "If needed, use a compute shader to transform timestamp queries from ticks to "
              "nanoseconds. This is temporarily needed to avoid requiring Tint to use timestamp "
              "queries",
-              "https://crbug.com/dawn/686"}}
+              "https://crbug.com/dawn/686"}},
+            {Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
+             {"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_"
+              "level",
+              "Split texture-to-texture copy into two copies: copy from source texture into a "
+              "temporary buffer, and copy from the temporary buffer into the destination texture "
+              "under specific situations. This workaround is by default enabled on some Intel "
+              "GPUs which have a driver bug in the execution of CopyTextureRegion() when we copy "
+              "with the formats whose texel block sizes are less than 4 bytes from a greater mip "
+              "level to a smaller mip level on D3D12 backends.",
+              "https://crbug.com/1161355"}}
            // Dummy comment to separate the }} so it is clearer what to copy-paste to add a toggle.
        }};

--- a/src/dawn_native/Toggles.h
+++ b/src/dawn_native/Toggles.h
@@ -51,6 +51,7 @@ namespace dawn_native {
        UseTintGenerator,
        FlushBeforeClientWaitSync,
        ConvertTimestampsToNanoseconds,
+        UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,

        EnumCount,
        InvalidEnum = EnumCount,
--- a/src/dawn_native/d3d12/CommandBufferD3D12.cpp
+++ b/src/dawn_native/d3d12/CommandBufferD3D12.cpp
@@ -118,6 +118,47 @@ namespace dawn_native { namespace d3d12 {
            }
        }

+        // Records the copy of |copySize| from |texture| (see |textureCopy|) into |buffer| (see
+        // |bufferCopy|), handling one slice of the copy per loop iteration.
+        void CopyTextureToBufferWithCopySplit(ID3D12GraphicsCommandList* commandList,
+                                              const TextureCopy& textureCopy,
+                                              const BufferCopy& bufferCopy,
+                                              Texture* texture,
+                                              Buffer* buffer,
+                                              const Extent3D& copySize) {
+            const TexelBlockInfo& blockInfo =
+                texture->GetFormat().GetAspectInfo(textureCopy.aspect).block;
+
+            // See comments around ComputeTextureCopySplits() for more details.
+            const TextureCopySplits copySplits =
+                ComputeTextureCopySplits(textureCopy.origin, copySize, blockInfo, bufferCopy.offset,
+                                         bufferCopy.bytesPerRow, bufferCopy.rowsPerImage);
+
+            // Multiply in 64 bits so that the product cannot wrap around if the operands are 32-bit.
+            const uint64_t bytesPerSlice =
+                static_cast<uint64_t>(bufferCopy.bytesPerRow) * bufferCopy.rowsPerImage;
+
+            // copySplits.copies2D[1] is always calculated for the second copy slice with an
+            // extra "bytesPerSlice" copy offset compared with the first copy slice, so the
+            // slices take turns using the splits. bufferOffsetsForNextSlice[i] records the
+            // extra offset for the next copy slice that uses copySplits.copies2D[i].
+            std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
+                bufferOffsetsForNextSlice = {{0u, 0u}};
+            for (uint32_t copySlice = 0; copySlice < copySize.depth; ++copySlice) {
+                const uint32_t splitIndex = copySlice % copySplits.copies2D.size();
+                const Texture2DCopySplit& copySplitPerLayerBase = copySplits.copies2D[splitIndex];
+                const uint64_t bufferOffsetForNextSlice = bufferOffsetsForNextSlice[splitIndex];
+                const uint32_t copyTextureLayer = copySlice + textureCopy.origin.z;
+
+                RecordCopyTextureToBufferFromTextureCopySplit(
+                    commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
+                    bufferCopy.bytesPerRow, texture, textureCopy.mipLevel, copyTextureLayer,
+                    textureCopy.aspect);
+
+                bufferOffsetsForNextSlice[splitIndex] += bytesPerSlice * copySplits.copies2D.size();
+            }
+        }
+
        void RecordWriteTimestampCmd(ID3D12GraphicsCommandList* commandList,
                                     WriteTimestampCmd* cmd) {
            QuerySet* querySet = ToBackend(cmd->querySet.Get());
@@ -148,6 +189,77 @@ namespace dawn_native { namespace d3d12 {
            commandList->SetGraphicsRoot32BitConstants(layout->GetFirstIndexOffsetParameterIndex(),
                                                       count, offsets.data(), 0);
        }
+
+        bool ShouldCopyUsingTemporaryBuffer(DeviceBase* device,
+                                            const TextureCopy& srcCopy,
+                                            const TextureCopy& dstCopy) {
+            // The temporary buffer is currently only needed as a workaround for an Intel D3D12
+            // driver issue, which is controlled by the toggle below.
+            if (!device->IsToggleEnabled(
+                    Toggle::
+                        UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel)) {
+                return false;
+            }
+
+            const auto& sourceFormat = srcCopy.texture->GetFormat();
+            const bool sourceLevelIsGreater = srcCopy.mipLevel > dstCopy.mipLevel;
+            ASSERT(sourceFormat.format == dstCopy.texture->GetFormat().format);
+
+            // GetAspectInfo(aspect) requires HasOneBit(aspect) == true. Other aspects can be
+            // ignored as depth stencil texel blocks are always no less than 4 bytes.
+            const bool hasSmallTexelBlock =
+                HasOneBit(srcCopy.aspect) &&
+                sourceFormat.GetAspectInfo(srcCopy.aspect).block.byteSize < 4u;
+            return sourceLevelIsGreater && hasSmallTexelBlock;
+        }
+
+        // Copies |srcCopy| into |dstCopy| through a temporary buffer (one T2B then one B2T copy).
+        void RecordCopyTextureWithTemporaryBuffer(CommandRecordingContext* recordingContext,
+                                                  const TextureCopy& srcCopy,
+                                                  const TextureCopy& dstCopy,
+                                                  const Extent3D& copySize) {
+            ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);
+            ASSERT(srcCopy.aspect == dstCopy.aspect);
+            dawn_native::Format format = srcCopy.texture->GetFormat();
+            const TexelBlockInfo& blockInfo = format.GetAspectInfo(srcCopy.aspect).block;
+            ASSERT(copySize.width % blockInfo.width == 0);
+            uint32_t widthInBlocks = copySize.width / blockInfo.width;
+            ASSERT(copySize.height % blockInfo.height == 0);
+            uint32_t rowsPerImage = copySize.height / blockInfo.height;
+            // Create tempBuffer: copySize.depth slices of rowsPerImage rows, each row taking
+            // bytesPerRow bytes except the last one. The size is computed in 64 bits.
+            uint32_t bytesPerRow =
+                Align(blockInfo.byteSize * widthInBlocks, kTextureBytesPerRowAlignment);
+            uint64_t tempBufferSize =
+                bytesPerRow * (static_cast<uint64_t>(rowsPerImage) * copySize.depth - 1) +
+                Align(blockInfo.byteSize * widthInBlocks, 4);
+            BufferDescriptor tempBufferDescriptor;
+            tempBufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
+            tempBufferDescriptor.size = tempBufferSize;
+            Device* device = ToBackend(srcCopy.texture->GetDevice());
+            Ref<Buffer> tempBuffer =
+                AcquireRef(ToBackend(device->CreateBuffer(&tempBufferDescriptor)));
+
+            // Copy from source texture into tempBuffer
+            Texture* srcTexture = ToBackend(srcCopy.texture).Get();
+            tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopyDst);
+            BufferCopy bufferCopy;
+            bufferCopy.buffer = tempBuffer;
+            bufferCopy.offset = 0;
+            bufferCopy.bytesPerRow = bytesPerRow;
+            bufferCopy.rowsPerImage = rowsPerImage;
+            CopyTextureToBufferWithCopySplit(recordingContext->GetCommandList(), srcCopy,
+                                             bufferCopy, srcTexture, tempBuffer.Get(), copySize);
+
+            // Copy from tempBuffer into destination texture
+            tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopySrc);
+            Texture* dstTexture = ToBackend(dstCopy.texture).Get();
+            CopyBufferToTextureWithCopySplit(recordingContext, dstCopy,
+                                             tempBuffer->GetD3D12Resource(), 0, bytesPerRow,
+                                             rowsPerImage, copySize, dstTexture, dstCopy.aspect);
+            // Save tempBuffer into recordingContext
+            recordingContext->AddToTempBuffers(std::move(tempBuffer));
+        }
    }  // anonymous namespace

    class BindGroupStateTracker : public BindGroupTrackerBase<false, uint64_t> {
@@ -733,43 +845,8 @@ namespace dawn_native { namespace d3d12 {
                                                        subresources);
                    buffer->TrackUsageAndTransitionNow(commandContext, wgpu::BufferUsage::CopyDst);

-                    const TexelBlockInfo& blockInfo =
-                        texture->GetFormat().GetAspectInfo(copy->source.aspect).block;
-
-                    // See comments around ComputeTextureCopySplits() for more details.
-                    const TextureCopySplits copySplits = ComputeTextureCopySplits(
-                        copy->source.origin, copy->copySize, blockInfo, copy->destination.offset,
-                        copy->destination.bytesPerRow, copy->destination.rowsPerImage);
-
-                    const uint64_t bytesPerSlice =
-                        copy->destination.bytesPerRow * copy->destination.rowsPerImage;
-
-                    // copySplits.copies2D[1] is always calculated for the second copy slice with
-                    // extra "bytesPerSlice" copy offset compared with the first copy slice. So
-                    // here we use an array bufferOffsetsForNextSlice to record the extra offsets
-                    // for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
-                    // the next copy slice that uses copySplits.copies2D[0], and
-                    // bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
-                    // that uses copySplits.copies2D[1].
-                    std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
-                        bufferOffsetsForNextSlice = {{0u, 0u}};
-                    for (uint32_t copySlice = 0; copySlice < copy->copySize.depth; ++copySlice) {
-                        const uint32_t splitIndex = copySlice % copySplits.copies2D.size();
-
-                        const Texture2DCopySplit& copySplitPerLayerBase =
-                            copySplits.copies2D[splitIndex];
-                        const uint64_t bufferOffsetForNextSlice =
-                            bufferOffsetsForNextSlice[splitIndex];
-                        const uint32_t copyTextureLayer = copySlice + copy->source.origin.z;
-
-                        RecordCopyTextureToBufferFromTextureCopySplit(
-                            commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
-                            copy->destination.bytesPerRow, texture, copy->source.mipLevel,
-                            copyTextureLayer, subresources.aspects);
-
-                        bufferOffsetsForNextSlice[splitIndex] +=
-                            bytesPerSlice * copySplits.copies2D.size();
-                    }
+                    CopyTextureToBufferWithCopySplit(commandList, copy->source, copy->destination,
+                                                     texture, buffer, copy->copySize);

                    break;
                }
@@ -809,6 +886,13 @@ namespace dawn_native { namespace d3d12 {
                                                            wgpu::TextureUsage::CopyDst, dstRange);

                    ASSERT(srcRange.aspects == dstRange.aspects);
+                    if (ShouldCopyUsingTemporaryBuffer(GetDevice(), copy->source,
+                                                       copy->destination)) {
+                        RecordCopyTextureWithTemporaryBuffer(commandContext, copy->source,
+                                                             copy->destination, copy->copySize);
+                        break;
+                    }
+
                    if (CanUseCopyResource(copy->source, copy->destination, copy->copySize)) {
                        commandList->CopyResource(destination->GetD3D12Resource(),
                                                  source->GetD3D12Resource());
--- a/src/dawn_native/d3d12/CommandRecordingContext.cpp
+++ b/src/dawn_native/d3d12/CommandRecordingContext.cpp
@@ -112,10 +112,15 @@ namespace dawn_native { namespace d3d12 {
        mIsOpen = false;
        mSharedTextures.clear();
        mHeapsPendingUsage.clear();
+        mTempBuffers.clear();
    }

    bool CommandRecordingContext::IsOpen() const {
        return mIsOpen;
    }

+    // |tempBuffer| is a sink parameter, so move it instead of copying the reference again.
+    void CommandRecordingContext::AddToTempBuffers(Ref<Buffer> tempBuffer) {
+        mTempBuffers.emplace_back(std::move(tempBuffer));
+    }
 }}  // namespace dawn_native::d3d12
--- a/src/dawn_native/d3d12/CommandRecordingContext.h
+++ b/src/dawn_native/d3d12/CommandRecordingContext.h
@@ -16,6 +16,7 @@

 #include "dawn_native/Error.h"
 #include "dawn_native/IntegerTypes.h"
+#include "dawn_native/d3d12/BufferD3D12.h"
 #include "dawn_native/d3d12/d3d12_platform.h"

 #include <set>
@@ -41,12 +42,16 @@ namespace dawn_native { namespace d3d12 {

        void TrackHeapUsage(Heap* heap, ExecutionSerial serial);

+        void AddToTempBuffers(Ref<Buffer> tempBuffer);
+
      private:
        ComPtr<ID3D12GraphicsCommandList> mD3d12CommandList;
        ComPtr<ID3D12GraphicsCommandList4> mD3d12CommandList4;
        bool mIsOpen = false;
        std::set<Texture*> mSharedTextures;
        std::vector<Heap*> mHeapsPendingUsage;
+        // References to the buffers passed to AddToTempBuffers(), kept until the vector is cleared.
+        std::vector<Ref<Buffer>> mTempBuffers;
    };
 }}  // namespace dawn_native::d3d12

--- a/src/dawn_native/d3d12/DeviceD3D12.cpp
+++ b/src/dawn_native/d3d12/DeviceD3D12.cpp
@@ -14,6 +14,7 @@

 #include "dawn_native/d3d12/DeviceD3D12.h"

+#include "common/GPUInfo.h"
 #include "dawn_native/Instance.h"
 #include "dawn_native/d3d12/AdapterD3D12.h"
 #include "dawn_native/d3d12/BackendD3D12.h"
@@ -535,6 +536,20 @@ namespace dawn_native { namespace d3d12 {

        // By default use the maximum shader-visible heap size allowed.
        SetToggle(Toggle::UseD3D12SmallShaderVisibleHeapForTesting, false);
+
+        PCIInfo pciInfo = GetAdapter()->GetPCIInfo();
+
+        // Currently this workaround is only needed on Intel Gen9 and Gen9.5 GPUs.
+        // See http://crbug.com/1161355 for more information.
+        // TODO(jiawei.shao@intel.com): disable this workaround on the newer drivers when the driver
+        // bug is fixed.
+        if (gpu_info::IsIntel(pciInfo.vendorId) &&
+            (gpu_info::IsSkylake(pciInfo.deviceId) || gpu_info::IsKabylake(pciInfo.deviceId) ||
+             gpu_info::IsCoffeelake(pciInfo.deviceId))) {
+            SetToggle(
+                Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
+                true);
+        }
    }

    MaybeError Device::WaitForIdleForDestruction() {
--- a/src/tests/end2end/CopyTests.cpp
+++ b/src/tests/end2end/CopyTests.cpp
@@ -1617,9 +1617,7 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
    // This test can pass on the Windows Intel Vulkan driver version 27.20.100.9168.
    // TODO(jiawei.shao@intel.com): enable this test on Intel Vulkan drivers after the upgrade of
    // try bots.
-    // TODO(jiawei.shao@intel.com): enable this test on Intel D3D12 drivers when the workaround is
-    // implemented.
-    DAWN_SKIP_TEST_IF((IsD3D12() || (IsVulkan() && IsWindows())) && IsIntel());
+    DAWN_SKIP_TEST_IF(IsVulkan() && IsWindows() && IsIntel());

    constexpr std::array<wgpu::TextureFormat, 11> kFormats = {
        {wgpu::TextureFormat::RG8Sint, wgpu::TextureFormat::RG8Uint, wgpu::TextureFormat::RG8Snorm,
@@ -1663,8 +1661,11 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
    }
 }

-DAWN_INSTANTIATE_TEST(CopyTests_T2T,
+DAWN_INSTANTIATE_TEST(
+    CopyTests_T2T,
    D3D12Backend(),
+    D3D12Backend(
+        {"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_level"}),
    MetalBackend(),
    OpenGLBackend(),
    OpenGLESBackend(),