D3D12: Add workaround for T2T copy issue on Intel GPUs

On Intel Gen9 (SKL) and Gen9.5 (KBL, CFL, CML) GPUs with the latest Intel D3D12 driver (27.20.100.9316), there is a bug in CopyTextureRegion() when performing a texture-to-texture (T2T) copy with formats whose texel block size is less than 4 bytes and whose source mipmap level is greater than the destination mipmap level. This patch adds a workaround for this driver bug by implementing the T2T copy as one texture-to-buffer (T2B) copy followed by one buffer-to-texture (B2T) copy. BUG=chromium:1161355 TEST=dawn_end2end_tests Change-Id: I688bb8bae277832aaba1be2680012040ee8e1160 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/43860 Reviewed-by: Corentin Wallez <cwallez@chromium.org> Commit-Queue: Austin Eng <enga@chromium.org>
2025-12-10 05:57:51 +00:00 · 2021-03-11 19:34:50 +00:00
parent 9f6bc4e3a9
commit f905e57be2
9 changed files with 213 additions and 47 deletions
--- a/src/common/GPUInfo.cpp
+++ b/src/common/GPUInfo.cpp
@@ -14,7 +14,31 @@
 #include "common/GPUInfo.h"
 #include <algorithm>
 namespace gpu_info {
    namespace {
        // Device ID tables for Intel GPUs, grouped by architecture. The Is*() predicates
        // below search these tables for a given PCIDeviceID.
        // Referenced from the following Mesa source code:
        // https://github.com/mesa3d/mesa/blob/master/include/pci_ids/i965_pci_ids.h
        // gen9
        const uint32_t Skylake[] = {0x1902, 0x1906, 0x190A, 0x190B, 0x190E, 0x1912, 0x1913,
                                    0x1915, 0x1916, 0x1917, 0x191A, 0x191B, 0x191D, 0x191E,
                                    0x1921, 0x1923, 0x1926, 0x1927, 0x192A, 0x192B, 0x192D,
                                    0x1932, 0x193A, 0x193B, 0x193D};
        // gen9p5
        const uint32_t Kabylake[] = {0x5916, 0x5913, 0x5906, 0x5926, 0x5921, 0x5915, 0x590E,
                                     0x591E, 0x5912, 0x5917, 0x5902, 0x591B, 0x593B, 0x590B,
                                     0x591A, 0x590A, 0x591D, 0x5908, 0x5923, 0x5927};
        const uint32_t Coffeelake[] = {0x87CA, 0x3E90, 0x3E93, 0x3E99, 0x3E9C, 0x3E91,
                                       0x3E92, 0x3E96, 0x3E98, 0x3E9A, 0x3E9B, 0x3E94,
                                       0x3EA9, 0x3EA5, 0x3EA6, 0x3EA7, 0x3EA8};
        // The WhiskyLake and CometLake tables have no predicate of their own; they are
        // searched by IsCoffeelake() in addition to the Coffeelake table.
        const uint32_t WhiskyLake[] = {0x3EA1, 0x3EA4, 0x3EA0, 0x3EA3, 0x3EA2};
        const uint32_t CometLake[] = {0x9B21, 0x9BA0, 0x9BA2, 0x9BA4, 0x9BA5, 0x9BA8, 0x9BAA,
                                      0x9BAB, 0x9BAC, 0x9B41, 0x9BC0, 0x9BC2, 0x9BC4, 0x9BC5,
                                      0x9BC6, 0x9BC8, 0x9BCA, 0x9BCB, 0x9BCC, 0x9BE6, 0x9BF6};
    }  // anonymous namespace
    // Returns true when |vendorId| equals kVendorID_AMD.
    bool IsAMD(PCIVendorID vendorId) {
        const bool matchesAmd = (kVendorID_AMD == vendorId);
        return matchesAmd;
    }
@@ -39,4 +63,20 @@ namespace gpu_info {
    // Returns true only when both the vendor ID is kVendorID_Microsoft and the device ID is
    // kDeviceID_WARP.
    bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId) {
        if (vendorId != kVendorID_Microsoft) {
            return false;
        }
        return deviceId == kDeviceID_WARP;
    }
    // Intel GPUs
    // Returns true when |deviceId| is listed in the Skylake (gen9) device ID table.
    // Only the device ID is inspected; the vendor ID is not checked here.
    bool IsSkylake(PCIDeviceID deviceId) {
        return std::any_of(std::begin(Skylake), std::end(Skylake),
                           [deviceId](uint32_t knownId) { return knownId == deviceId; });
    }
    // Returns true when |deviceId| is listed in the Kabylake device ID table.
    // Only the device ID is inspected; the vendor ID is not checked here.
    bool IsKabylake(PCIDeviceID deviceId) {
        return std::any_of(std::begin(Kabylake), std::end(Kabylake),
                           [deviceId](uint32_t knownId) { return knownId == deviceId; });
    }
    // Returns true when |deviceId| is listed in any of the Coffeelake, WhiskyLake or CometLake
    // device ID tables. Only the device ID is inspected; the vendor ID is not checked here.
    bool IsCoffeelake(PCIDeviceID deviceId) {
        const auto matchesDevice = [deviceId](uint32_t knownId) { return knownId == deviceId; };

        if (std::any_of(std::begin(Coffeelake), std::end(Coffeelake), matchesDevice)) {
            return true;
        }
        if (std::any_of(std::begin(WhiskyLake), std::end(WhiskyLake), matchesDevice)) {
            return true;
        }
        return std::any_of(std::begin(CometLake), std::end(CometLake), matchesDevice);
    }
 }  // namespace gpu_info
--- a/src/common/GPUInfo.h
+++ b/src/common/GPUInfo.h
@@ -43,5 +43,10 @@ namespace gpu_info {
    bool IsSwiftshader(PCIVendorID vendorId, PCIDeviceID deviceId);
    bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId);
    // Intel architectures
    // These predicates take only a device ID and compare it against tables of known device
    // IDs; they do not look at the vendor ID.
    bool IsSkylake(PCIDeviceID deviceId);
    bool IsKabylake(PCIDeviceID deviceId);
    // Also returns true for device IDs in the WhiskyLake and CometLake tables.
    bool IsCoffeelake(PCIDeviceID deviceId);
 }  // namespace gpu_info
 #endif  // COMMON_GPUINFO_H
--- a/src/dawn_native/Toggles.cpp
+++ b/src/dawn_native/Toggles.cpp
@@ -172,7 +172,17 @@ namespace dawn_native {
              "If needed, use a compute shader to transform timestamp queries from ticks to "
              "nanoseconds. This is temporarily needed to avoid requiring Tint to use timestamp "
              "queries",
-              "https://crbug.com/dawn/686"}}
+              "https://crbug.com/dawn/686"}},
            {Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
             {"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_"
              "level",
              "Split texture-to-texture copy into two copies: copy from source texture into a "
              "temporary buffer, and copy from the temporary buffer into the destination texture "
              "under specific situations. This workaround is by default enabled on some Intel "
              "GPUs which have a driver bug in the execution of CopyTextureRegion() when we copy "
              "with the formats whose texel block sizes are less than 4 bytes from a greater mip "
              "level to a smaller mip level on D3D12 backends.",
              "https://crbug.com/1161355"}}
            // Dummy comment to separate the }} so it is clearer what to copy-paste to add a toggle.
        }};
--- a/src/dawn_native/Toggles.h
+++ b/src/dawn_native/Toggles.h
@@ -51,6 +51,7 @@ namespace dawn_native {
        UseTintGenerator,
        FlushBeforeClientWaitSync,
        ConvertTimestampsToNanoseconds,
        UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
        EnumCount,
        InvalidEnum = EnumCount,
--- a/src/dawn_native/d3d12/CommandBufferD3D12.cpp
+++ b/src/dawn_native/d3d12/CommandBufferD3D12.cpp
@@ -118,6 +118,47 @@ namespace dawn_native { namespace d3d12 {
            }
        }
        // Records a texture-to-buffer copy of |copySize| from |texture| into |buffer|.
        //
        // The copy is split with ComputeTextureCopySplits() and one
        // RecordCopyTextureToBufferFromTextureCopySplit() call is issued per depth slice
        // of |copySize|.
        //
        //  - commandList: command list the copy commands are recorded into.
        //  - textureCopy: source origin, mip level and aspect.
        //  - bufferCopy:  destination offset, bytesPerRow and rowsPerImage.
        //  - texture / buffer: backend objects for the source and the destination.
        void CopyTextureToBufferWithCopySplit(ID3D12GraphicsCommandList* commandList,
                                              const TextureCopy& textureCopy,
                                              const BufferCopy& bufferCopy,
                                              Texture* texture,
                                              Buffer* buffer,
                                              const Extent3D& copySize) {
            const TexelBlockInfo& blockInfo =
                texture->GetFormat().GetAspectInfo(textureCopy.aspect).block;

            // See comments around ComputeTextureCopySplits() for more details.
            const TextureCopySplits copySplits =
                ComputeTextureCopySplits(textureCopy.origin, copySize, blockInfo, bufferCopy.offset,
                                         bufferCopy.bytesPerRow, bufferCopy.rowsPerImage);

            // Widen before multiplying so that the product is computed in 64 bits. Elsewhere in
            // this file bytesPerRow and rowsPerImage are assigned from 32-bit values, in which
            // case the unwidened product could wrap before being stored.
            const uint64_t bytesPerSlice =
                static_cast<uint64_t>(bufferCopy.bytesPerRow) * bufferCopy.rowsPerImage;

            // copySplits.copies2D[1] is always calculated for the second copy slice with
            // extra "bytesPerSlice" copy offset compared with the first copy slice. So
            // here we use an array bufferOffsetsForNextSlice to record the extra offsets
            // for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
            // the next copy slice that uses copySplits.copies2D[0], and
            // bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
            // that uses copySplits.copies2D[1].
            std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
                bufferOffsetsForNextSlice = {{0u, 0u}};

            for (uint32_t copySlice = 0; copySlice < copySize.depth; ++copySlice) {
                // Slices alternate between the precomputed splits.
                const uint32_t splitIndex = copySlice % copySplits.copies2D.size();

                const Texture2DCopySplit& copySplitPerLayerBase = copySplits.copies2D[splitIndex];
                const uint64_t bufferOffsetForNextSlice = bufferOffsetsForNextSlice[splitIndex];
                const uint32_t copyTextureLayer = copySlice + textureCopy.origin.z;

                RecordCopyTextureToBufferFromTextureCopySplit(
                    commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
                    bufferCopy.bytesPerRow, texture, textureCopy.mipLevel, copyTextureLayer,
                    textureCopy.aspect);

                // The same split is reused copies2D.size() slices later, so advance its extra
                // offset by that many slices.
                bufferOffsetsForNextSlice[splitIndex] += bytesPerSlice * copySplits.copies2D.size();
            }
        }
        void RecordWriteTimestampCmd(ID3D12GraphicsCommandList* commandList,
                                     WriteTimestampCmd* cmd) {
            QuerySet* querySet = ToBackend(cmd->querySet.Get());
@@ -148,6 +189,77 @@ namespace dawn_native { namespace d3d12 {
            commandList->SetGraphicsRoot32BitConstants(layout->GetFirstIndexOffsetParameterIndex(),
                                                       count, offsets.data(), 0);
        }
        // Decides whether a texture-to-texture copy has to be performed through a temporary
        // buffer. Returns true only when the
        // UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel toggle is
        // enabled, the source mip level is greater than the destination mip level, and the
        // copied aspect has a texel block size of less than 4 bytes.
        bool ShouldCopyUsingTemporaryBuffer(DeviceBase* device,
                                            const TextureCopy& srcCopy,
                                            const TextureCopy& dstCopy) {
            // Currently we only need the workaround for an Intel D3D12 driver issue.
            const bool workaroundEnabled = device->IsToggleEnabled(
                Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel);
            if (!workaroundEnabled) {
                return false;
            }

            const bool sourceMipIsGreater = srcCopy.mipLevel > dstCopy.mipLevel;
            ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);

            // GetAspectInfo(aspect) requires HasOneBit(aspect) == true, plus the texel block
            // sizes of depth stencil formats are always no less than 4 bytes.
            const bool texelBlockIsSmall =
                HasOneBit(srcCopy.aspect) &&
                srcCopy.texture->GetFormat().GetAspectInfo(srcCopy.aspect).block.byteSize < 4u;

            return sourceMipIsGreater && texelBlockIsSmall;
        }
        // Performs a texture-to-texture copy as two copies: source texture -> temporary buffer,
        // then temporary buffer -> destination texture.
        //
        //  - recordingContext: context the commands are recorded into; it also takes a
        //    reference on the temporary buffer via AddToTempBuffers().
        //  - srcCopy / dstCopy: source and destination texture copy descriptions. Both must
        //    use the same format and aspect.
        //  - copySize: extent of the copy; depth is iterated slice by slice by the helpers.
        void RecordCopyTextureWithTemporaryBuffer(CommandRecordingContext* recordingContext,
                                                  const TextureCopy& srcCopy,
                                                  const TextureCopy& dstCopy,
                                                  const Extent3D& copySize) {
            ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);
            ASSERT(srcCopy.aspect == dstCopy.aspect);

            // An empty copy has nothing to record. Returning early also keeps the "- 1" terms
            // in the size computation below from underflowing.
            if (copySize.width == 0 || copySize.height == 0 || copySize.depth == 0) {
                return;
            }

            dawn_native::Format format = srcCopy.texture->GetFormat();
            const TexelBlockInfo& blockInfo = format.GetAspectInfo(srcCopy.aspect).block;
            ASSERT(copySize.width % blockInfo.width == 0);
            uint32_t widthInBlocks = copySize.width / blockInfo.width;
            ASSERT(copySize.height % blockInfo.height == 0);
            uint32_t heightInBlocks = copySize.height / blockInfo.height;

            // Create tempBuffer
            uint32_t bytesPerRow =
                Align(blockInfo.byteSize * widthInBlocks, kTextureBytesPerRowAlignment);
            uint32_t rowsPerImage = heightInBlocks;

            // The buffer must hold every depth slice written by the copy helpers:
            // (depth - 1) whole images, then (heightInBlocks - 1) whole rows of the last image,
            // then the last row itself. All products are computed in 64 bits to avoid
            // wrapping in 32-bit arithmetic.
            const uint64_t bytesPerImage = static_cast<uint64_t>(bytesPerRow) * rowsPerImage;
            const uint64_t fullRowsInLastImage =
                static_cast<uint64_t>(bytesPerRow) * (heightInBlocks - 1);
            const uint64_t lastRowBytes = Align(blockInfo.byteSize * widthInBlocks, 4);
            const uint64_t tempBufferSize =
                bytesPerImage * (copySize.depth - 1) + fullRowsInLastImage + lastRowBytes;

            BufferDescriptor tempBufferDescriptor;
            tempBufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
            tempBufferDescriptor.size = tempBufferSize;
            Device* device = ToBackend(srcCopy.texture->GetDevice());
            Ref<Buffer> tempBuffer =
                AcquireRef(ToBackend(device->CreateBuffer(&tempBufferDescriptor)));

            // Copy from source texture into tempBuffer
            Texture* srcTexture = ToBackend(srcCopy.texture).Get();
            tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopyDst);
            BufferCopy bufferCopy;
            bufferCopy.buffer = tempBuffer;
            bufferCopy.offset = 0;
            bufferCopy.bytesPerRow = bytesPerRow;
            bufferCopy.rowsPerImage = rowsPerImage;
            CopyTextureToBufferWithCopySplit(recordingContext->GetCommandList(), srcCopy,
                                             bufferCopy, srcTexture, tempBuffer.Get(), copySize);

            // Copy from tempBuffer into destination texture
            tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopySrc);
            Texture* dstTexture = ToBackend(dstCopy.texture).Get();
            CopyBufferToTextureWithCopySplit(recordingContext, dstCopy,
                                             tempBuffer->GetD3D12Resource(), 0, bytesPerRow,
                                             rowsPerImage, copySize, dstTexture, dstCopy.aspect);

            // Save tempBuffer into recordingContext so that the buffer stays referenced after
            // this function returns.
            recordingContext->AddToTempBuffers(std::move(tempBuffer));
        }
    }  // anonymous namespace
    class BindGroupStateTracker : public BindGroupTrackerBase<false, uint64_t> {
@@ -733,43 +845,8 @@ namespace dawn_native { namespace d3d12 {
                                                        subresources);
                    buffer->TrackUsageAndTransitionNow(commandContext, wgpu::BufferUsage::CopyDst);
-                    const TexelBlockInfo& blockInfo =
+                    CopyTextureToBufferWithCopySplit(commandList, copy->source, copy->destination,
-                        texture->GetFormat().GetAspectInfo(copy->source.aspect).block;
+                                                     texture, buffer, copy->copySize);
                    // See comments around ComputeTextureCopySplits() for more details.
                    const TextureCopySplits copySplits = ComputeTextureCopySplits(
                        copy->source.origin, copy->copySize, blockInfo, copy->destination.offset,
                        copy->destination.bytesPerRow, copy->destination.rowsPerImage);
                    const uint64_t bytesPerSlice =
                        copy->destination.bytesPerRow * copy->destination.rowsPerImage;
                    // copySplits.copies2D[1] is always calculated for the second copy slice with
                    // extra "bytesPerSlice" copy offset compared with the first copy slice. So
                    // here we use an array bufferOffsetsForNextSlice to record the extra offsets
                    // for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
                    // the next copy slice that uses copySplits.copies2D[0], and
                    // bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
                    // that uses copySplits.copies2D[1].
                    std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
                        bufferOffsetsForNextSlice = {{0u, 0u}};
                    for (uint32_t copySlice = 0; copySlice < copy->copySize.depth; ++copySlice) {
                        const uint32_t splitIndex = copySlice % copySplits.copies2D.size();
                        const Texture2DCopySplit& copySplitPerLayerBase =
                            copySplits.copies2D[splitIndex];
                        const uint64_t bufferOffsetForNextSlice =
                            bufferOffsetsForNextSlice[splitIndex];
                        const uint32_t copyTextureLayer = copySlice + copy->source.origin.z;
                        RecordCopyTextureToBufferFromTextureCopySplit(
                            commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
                            copy->destination.bytesPerRow, texture, copy->source.mipLevel,
                            copyTextureLayer, subresources.aspects);
                        bufferOffsetsForNextSlice[splitIndex] +=
                            bytesPerSlice * copySplits.copies2D.size();
                    }
                    break;
                }
@@ -809,6 +886,13 @@ namespace dawn_native { namespace d3d12 {
                                                            wgpu::TextureUsage::CopyDst, dstRange);
                    ASSERT(srcRange.aspects == dstRange.aspects);
                    if (ShouldCopyUsingTemporaryBuffer(GetDevice(), copy->source,
                                                       copy->destination)) {
                        RecordCopyTextureWithTemporaryBuffer(commandContext, copy->source,
                                                             copy->destination, copy->copySize);
                        break;
                    }
                    if (CanUseCopyResource(copy->source, copy->destination, copy->copySize)) {
                        commandList->CopyResource(destination->GetD3D12Resource(),
                                                  source->GetD3D12Resource());
--- a/src/dawn_native/d3d12/CommandRecordingContext.cpp
+++ b/src/dawn_native/d3d12/CommandRecordingContext.cpp
@@ -112,10 +112,15 @@ namespace dawn_native { namespace d3d12 {
        mIsOpen = false;
        mSharedTextures.clear();
        mHeapsPendingUsage.clear();
        mTempBuffers.clear();
    }
    // Reports the current value of the mIsOpen flag.
    bool CommandRecordingContext::IsOpen() const {
        const bool currentlyOpen = mIsOpen;
        return currentlyOpen;
    }
    // Stores |tempBuffer| in mTempBuffers so the buffer stays referenced until that list is
    // cleared. The parameter is taken by value, so it is moved into the list rather than
    // copied, which avoids an extra reference-count increment and decrement.
    void CommandRecordingContext::AddToTempBuffers(Ref<Buffer> tempBuffer) {
        mTempBuffers.emplace_back(std::move(tempBuffer));
    }
 }}  // namespace dawn_native::d3d12
--- a/src/dawn_native/d3d12/CommandRecordingContext.h
+++ b/src/dawn_native/d3d12/CommandRecordingContext.h
@@ -16,6 +16,7 @@
 #include "dawn_native/Error.h"
 #include "dawn_native/IntegerTypes.h"
 #include "dawn_native/d3d12/BufferD3D12.h"
 #include "dawn_native/d3d12/d3d12_platform.h"
 #include <set>
@@ -41,12 +42,16 @@ namespace dawn_native { namespace d3d12 {
        void TrackHeapUsage(Heap* heap, ExecutionSerial serial);
        // Appends |tempBuffer| to mTempBuffers, which holds a reference to each temporary
        // buffer until the list is cleared.
        void AddToTempBuffers(Ref<Buffer> tempBuffer);
      private:
        ComPtr<ID3D12GraphicsCommandList> mD3d12CommandList;
        ComPtr<ID3D12GraphicsCommandList4> mD3d12CommandList4;
        bool mIsOpen = false;
        std::set<Texture*> mSharedTextures;
        std::vector<Heap*> mHeapsPendingUsage;
        std::vector<Ref<Buffer>> mTempBuffers;
    };
 }}  // namespace dawn_native::d3d12
--- a/src/dawn_native/d3d12/DeviceD3D12.cpp
+++ b/src/dawn_native/d3d12/DeviceD3D12.cpp
@@ -14,6 +14,7 @@
 #include "dawn_native/d3d12/DeviceD3D12.h"
 #include "common/GPUInfo.h"
 #include "dawn_native/Instance.h"
 #include "dawn_native/d3d12/AdapterD3D12.h"
 #include "dawn_native/d3d12/BackendD3D12.h"
@@ -535,6 +536,20 @@ namespace dawn_native { namespace d3d12 {
        // By default use the maximum shader-visible heap size allowed.
        SetToggle(Toggle::UseD3D12SmallShaderVisibleHeapForTesting, false);
        PCIInfo pciInfo = GetAdapter()->GetPCIInfo();
        // Currently this workaround is only needed on Intel Gen9 and Gen9.5 GPUs.
        // See http://crbug.com/1161355 for more information.
        // TODO(jiawei.shao@intel.com): disable this workaround on the newer drivers when the driver
        // bug is fixed.
        if (gpu_info::IsIntel(pciInfo.vendorId) &&
            (gpu_info::IsSkylake(pciInfo.deviceId) || gpu_info::IsKabylake(pciInfo.deviceId) ||
             gpu_info::IsCoffeelake(pciInfo.deviceId))) {
            SetToggle(
                Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
                true);
        }
    }
    MaybeError Device::WaitForIdleForDestruction() {
--- a/src/tests/end2end/CopyTests.cpp
+++ b/src/tests/end2end/CopyTests.cpp
@@ -1617,9 +1617,7 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
    // This test can pass on the Windows Intel Vulkan driver version 27.20.100.9168.
    // TODO(jiawei.shao@intel.com): enable this test on Intel Vulkan drivers after the upgrade of
    // try bots.
-    // TODO(jiawei.shao@intel.com): enable this test on Intel D3D12 drivers when the workaround is
+    DAWN_SKIP_TEST_IF(IsVulkan() && IsWindows() && IsIntel());
    // implemented.
    DAWN_SKIP_TEST_IF((IsD3D12() || (IsVulkan() && IsWindows())) && IsIntel());
    constexpr std::array<wgpu::TextureFormat, 11> kFormats = {
        {wgpu::TextureFormat::RG8Sint, wgpu::TextureFormat::RG8Uint, wgpu::TextureFormat::RG8Snorm,
@@ -1663,8 +1661,11 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
    }
 }
-DAWN_INSTANTIATE_TEST(CopyTests_T2T,
+DAWN_INSTANTIATE_TEST(
    CopyTests_T2T,
    D3D12Backend(),
    D3D12Backend(
        {"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_level"}),
    MetalBackend(),
    OpenGLBackend(),
    OpenGLESBackend(),