D3D12: Add workaround for T2T copy issue on Intel GPUs

On Intel Gen9 (SKL) and Gen9.5 (KBL, CFL, CML) GPUs with the latest
Intel D3D12 driver (27.20.100.9316), CopyTextureRegion() has a bug
when it performs a texture-to-texture (T2T) copy with a format whose
texel block size is less than 4 bytes and whose source mipmap level is
greater than the destination mipmap level.

This patch works around the driver bug by implementing the T2T copy
as one texture-to-buffer (T2B) copy into a temporary buffer followed
by one buffer-to-texture (B2T) copy out of it.

BUG=chromium:1161355
TEST=dawn_end2end_tests

Change-Id: I688bb8bae277832aaba1be2680012040ee8e1160
Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/43860
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Commit-Queue: Austin Eng <enga@chromium.org>
This commit is contained in:
Jiawei Shao 2021-03-11 19:34:50 +00:00 committed by Commit Bot service account
parent 9f6bc4e3a9
commit f905e57be2
9 changed files with 213 additions and 47 deletions

View File

@ -14,7 +14,31 @@
#include "common/GPUInfo.h"
#include <algorithm>
namespace gpu_info {
namespace {
    // PCI device IDs of the Intel GPU families recognized by the predicates in this file.
    // The lists are taken from the Mesa i965 PCI id table:
    // https://github.com/mesa3d/mesa/blob/master/include/pci_ids/i965_pci_ids.h

    // Gen9
    constexpr uint32_t Skylake[] = {
        0x1902, 0x1906, 0x190A, 0x190B, 0x190E,  //
        0x1912, 0x1913, 0x1915, 0x1916, 0x1917,  //
        0x191A, 0x191B, 0x191D, 0x191E, 0x1921,  //
        0x1923, 0x1926, 0x1927, 0x192A, 0x192B,  //
        0x192D, 0x1932, 0x193A, 0x193B, 0x193D,
    };

    // Gen9.5
    constexpr uint32_t Kabylake[] = {
        0x5916, 0x5913, 0x5906, 0x5926, 0x5921,  //
        0x5915, 0x590E, 0x591E, 0x5912, 0x5917,  //
        0x5902, 0x591B, 0x593B, 0x590B, 0x591A,  //
        0x590A, 0x591D, 0x5908, 0x5923, 0x5927,
    };

    constexpr uint32_t Coffeelake[] = {
        0x87CA, 0x3E90, 0x3E93, 0x3E99, 0x3E9C,  //
        0x3E91, 0x3E92, 0x3E96, 0x3E98, 0x3E9A,  //
        0x3E9B, 0x3E94, 0x3EA9, 0x3EA5, 0x3EA6,  //
        0x3EA7, 0x3EA8,
    };

    constexpr uint32_t WhiskyLake[] = {
        0x3EA1, 0x3EA4, 0x3EA0, 0x3EA3, 0x3EA2,
    };

    constexpr uint32_t CometLake[] = {
        0x9B21, 0x9BA0, 0x9BA2, 0x9BA4, 0x9BA5,  //
        0x9BA8, 0x9BAA, 0x9BAB, 0x9BAC, 0x9B41,  //
        0x9BC0, 0x9BC2, 0x9BC4, 0x9BC5, 0x9BC6,  //
        0x9BC8, 0x9BCA, 0x9BCB, 0x9BCC, 0x9BE6,  //
        0x9BF6,
    };
}  // anonymous namespace
// Returns true when the PCI vendor ID is AMD's.
bool IsAMD(PCIVendorID vendorId) {
    const bool isAmdVendor = (kVendorID_AMD == vendorId);
    return isAmdVendor;
}
@ -39,4 +63,20 @@ namespace gpu_info {
// Returns true when the vendor ID is Microsoft's and the device ID is the WARP device ID.
bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId) {
    if (vendorId != kVendorID_Microsoft) {
        return false;
    }
    return deviceId == kDeviceID_WARP;
}
// Intel GPUs
// Returns true when deviceId appears in the Skylake (Gen9) device-id table.
// Only the device ID is examined; the vendor is not checked here.
bool IsSkylake(PCIDeviceID deviceId) {
    const auto idsEnd = std::end(Skylake);
    const auto match = std::find(std::begin(Skylake), idsEnd, deviceId);
    return match != idsEnd;
}
// Returns true when deviceId appears in the Kabylake (Gen9.5) device-id table.
// Only the device ID is examined; the vendor is not checked here.
bool IsKabylake(PCIDeviceID deviceId) {
    const auto idsEnd = std::end(Kabylake);
    const auto match = std::find(std::begin(Kabylake), idsEnd, deviceId);
    return match != idsEnd;
}
// Returns true when deviceId appears in any of the Coffeelake, WhiskyLake or CometLake
// device-id tables. Only the device ID is examined; the vendor is not checked here.
bool IsCoffeelake(PCIDeviceID deviceId) {
    if (std::find(std::begin(Coffeelake), std::end(Coffeelake), deviceId) !=
        std::end(Coffeelake)) {
        return true;
    }
    if (std::find(std::begin(WhiskyLake), std::end(WhiskyLake), deviceId) !=
        std::end(WhiskyLake)) {
        return true;
    }
    return std::find(std::begin(CometLake), std::end(CometLake), deviceId) !=
           std::end(CometLake);
}
} // namespace gpu_info

View File

@ -43,5 +43,10 @@ namespace gpu_info {
bool IsSwiftshader(PCIVendorID vendorId, PCIDeviceID deviceId);
bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId);
// Intel architectures
// Each predicate takes only a PCI device ID and reports whether it is found in the
// corresponding list of known device IDs. The vendor ID is not part of the check, so callers
// that care about the vendor must test it separately.
// Skylake (Gen9) device IDs.
bool IsSkylake(PCIDeviceID deviceId);
// Kabylake (Gen9.5) device IDs.
bool IsKabylake(PCIDeviceID deviceId);
// Coffeelake device IDs; the implementation also accepts the WhiskyLake and CometLake lists.
bool IsCoffeelake(PCIDeviceID deviceId);
} // namespace gpu_info
#endif // COMMON_GPUINFO_H

View File

@ -172,7 +172,17 @@ namespace dawn_native {
"If needed, use a compute shader to transform timestamp queries from ticks to "
"nanoseconds. This is temporarily needed to avoid requiring Tint to use timestamp "
"queries",
"https://crbug.com/dawn/686"}}
"https://crbug.com/dawn/686"}},
{Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
{"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_"
"level",
"Split texture-to-texture copy into two copies: copy from source texture into a "
"temporary buffer, and copy from the temporary buffer into the destination texture "
"under specific situations. This workaround is by default enabled on some Intel "
"GPUs which have a driver bug in the execution of CopyTextureRegion() when we copy "
"with the formats whose texel block sizes are less than 4 bytes from a greater mip "
"level to a smaller mip level on D3D12 backends.",
"https://crbug.com/1161355"}}
// Dummy comment to separate the }} so it is clearer what to copy-paste to add a toggle.
}};

View File

@ -51,6 +51,7 @@ namespace dawn_native {
UseTintGenerator,
FlushBeforeClientWaitSync,
ConvertTimestampsToNanoseconds,
UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
EnumCount,
InvalidEnum = EnumCount,

View File

@ -118,6 +118,47 @@ namespace dawn_native { namespace d3d12 {
}
}
// Records the D3D12 commands that copy |copySize| texels from |texture| into |buffer|, one
// array layer / depth slice at a time.
//
// - commandList: command list the copies are recorded into.
// - textureCopy: source origin, mip level and aspect.
// - bufferCopy:  destination offset, bytesPerRow and rowsPerImage.
// - texture / buffer: backend objects forwarded to the per-slice recording helper.
// - copySize:    extent of the copy; copySize.depth is the number of slices recorded.
void CopyTextureToBufferWithCopySplit(ID3D12GraphicsCommandList* commandList,
                                      const TextureCopy& textureCopy,
                                      const BufferCopy& bufferCopy,
                                      Texture* texture,
                                      Buffer* buffer,
                                      const Extent3D& copySize) {
    const TexelBlockInfo& blockInfo =
        texture->GetFormat().GetAspectInfo(textureCopy.aspect).block;

    // See comments around ComputeTextureCopySplits() for more details.
    const TextureCopySplits copySplits =
        ComputeTextureCopySplits(textureCopy.origin, copySize, blockInfo, bufferCopy.offset,
                                 bufferCopy.bytesPerRow, bufferCopy.rowsPerImage);

    // Widen before multiplying so that the product is formed in 64 bits even when both
    // operands are narrower; otherwise a large slice pitch could wrap before being stored.
    const uint64_t bytesPerSlice =
        static_cast<uint64_t>(bufferCopy.bytesPerRow) * bufferCopy.rowsPerImage;

    // copySplits.copies2D[1] is always calculated for the second copy slice with
    // extra "bytesPerSlice" copy offset compared with the first copy slice. So
    // here we use an array bufferOffsetsForNextSlice to record the extra offsets
    // for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
    // the next copy slice that uses copySplits.copies2D[0], and
    // bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
    // that uses copySplits.copies2D[1].
    std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
        bufferOffsetsForNextSlice = {{0u, 0u}};

    for (uint32_t copySlice = 0; copySlice < copySize.depth; ++copySlice) {
        // Slices alternate between the precomputed 2D splits.
        const uint32_t splitIndex = copySlice % copySplits.copies2D.size();

        const Texture2DCopySplit& copySplitPerLayerBase = copySplits.copies2D[splitIndex];
        const uint64_t bufferOffsetForNextSlice = bufferOffsetsForNextSlice[splitIndex];
        const uint32_t copyTextureLayer = copySlice + textureCopy.origin.z;

        RecordCopyTextureToBufferFromTextureCopySplit(
            commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
            bufferCopy.bytesPerRow, texture, textureCopy.mipLevel, copyTextureLayer,
            textureCopy.aspect);

        // The same split is reused copies2D.size() slices later, so advance its extra
        // offset by that many slice pitches.
        bufferOffsetsForNextSlice[splitIndex] += bytesPerSlice * copySplits.copies2D.size();
    }
}
void RecordWriteTimestampCmd(ID3D12GraphicsCommandList* commandList,
WriteTimestampCmd* cmd) {
QuerySet* querySet = ToBackend(cmd->querySet.Get());
@ -148,6 +189,77 @@ namespace dawn_native { namespace d3d12 {
commandList->SetGraphicsRoot32BitConstants(layout->GetFirstIndexOffsetParameterIndex(),
count, offsets.data(), 0);
}
// Decides whether a texture-to-texture copy has to be routed through a temporary buffer.
// Returns true only when the workaround toggle is enabled on |device|, the source mip level
// is greater than the destination mip level, and the copied aspect has a texel block smaller
// than 4 bytes.
bool ShouldCopyUsingTemporaryBuffer(DeviceBase* device,
                                    const TextureCopy& srcCopy,
                                    const TextureCopy& dstCopy) {
    // Currently the only user of this path is the workaround for an Intel D3D12 driver issue.
    const bool workaroundEnabled = device->IsToggleEnabled(
        Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel);
    if (!workaroundEnabled) {
        return false;
    }

    const bool sourceLevelIsGreater = srcCopy.mipLevel > dstCopy.mipLevel;
    ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);

    // GetAspectInfo(aspect) may only be called when HasOneBit(aspect) is true. Skipping the
    // multi-aspect case loses nothing because the texel block sizes of depth stencil formats
    // are always no less than 4 bytes.
    const bool texelBlockIsSmall =
        HasOneBit(srcCopy.aspect) &&
        srcCopy.texture->GetFormat().GetAspectInfo(srcCopy.aspect).block.byteSize < 4u;

    return sourceLevelIsGreater && texelBlockIsSmall;
}
// Records a texture-to-texture copy as two copies through a temporary buffer: first from the
// source texture into the buffer, then from the buffer into the destination texture.
//
// - recordingContext: context the commands are recorded into; it also takes a reference to
//                     the temporary buffer at the end of this function.
// - srcCopy / dstCopy: source and destination of the copy; both must use the same format and
//                      the same aspect.
// - copySize:          extent of the copy; width and height must be multiples of the block
//                      dimensions of the format.
void RecordCopyTextureWithTemporaryBuffer(CommandRecordingContext* recordingContext,
                                          const TextureCopy& srcCopy,
                                          const TextureCopy& dstCopy,
                                          const Extent3D& copySize) {
    ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);
    ASSERT(srcCopy.aspect == dstCopy.aspect);

    // An empty copy needs no commands. Returning here also keeps the "- 1" terms in the
    // size computation below from wrapping around.
    if (copySize.width == 0 || copySize.height == 0 || copySize.depth == 0) {
        return;
    }

    // Bind by reference: only the block info is needed, so there is no reason to copy Format.
    const dawn_native::Format& format = srcCopy.texture->GetFormat();
    const TexelBlockInfo& blockInfo = format.GetAspectInfo(srcCopy.aspect).block;
    ASSERT(copySize.width % blockInfo.width == 0);
    const uint32_t widthInBlocks = copySize.width / blockInfo.width;
    ASSERT(copySize.height % blockInfo.height == 0);
    const uint32_t heightInBlocks = copySize.height / blockInfo.height;

    // Create tempBuffer
    const uint32_t bytesPerRow =
        Align(blockInfo.byteSize * widthInBlocks, kTextureBytesPerRowAlignment);
    const uint32_t rowsPerImage = heightInBlocks;

    // Buffer layout used by both copies below: each slice starts bytesPerRow * rowsPerImage
    // bytes after the previous one and each row starts bytesPerRow bytes after the previous
    // one. The buffer therefore has to hold (depth - 1) full slices, then (heightInBlocks - 1)
    // full rows of the last slice, then the bytes of the final row. The arithmetic is done in
    // 64 bits so that large copies cannot wrap.
    const uint64_t bytesPerImage = static_cast<uint64_t>(bytesPerRow) * rowsPerImage;
    const uint64_t bytesInLastRow = Align(blockInfo.byteSize * widthInBlocks, 4);
    const uint64_t tempBufferSize = bytesPerImage * (copySize.depth - 1) +
                                    static_cast<uint64_t>(bytesPerRow) * (heightInBlocks - 1) +
                                    bytesInLastRow;

    BufferDescriptor tempBufferDescriptor;
    tempBufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
    tempBufferDescriptor.size = tempBufferSize;
    Device* device = ToBackend(srcCopy.texture->GetDevice());
    Ref<Buffer> tempBuffer =
        AcquireRef(ToBackend(device->CreateBuffer(&tempBufferDescriptor)));

    // Copy from source texture into tempBuffer
    Texture* srcTexture = ToBackend(srcCopy.texture).Get();
    tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopyDst);
    BufferCopy bufferCopy;
    bufferCopy.buffer = tempBuffer;
    bufferCopy.offset = 0;
    bufferCopy.bytesPerRow = bytesPerRow;
    bufferCopy.rowsPerImage = rowsPerImage;
    CopyTextureToBufferWithCopySplit(recordingContext->GetCommandList(), srcCopy,
                                     bufferCopy, srcTexture, tempBuffer.Get(), copySize);

    // Copy from tempBuffer into destination texture
    tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopySrc);
    Texture* dstTexture = ToBackend(dstCopy.texture).Get();
    CopyBufferToTextureWithCopySplit(recordingContext, dstCopy,
                                     tempBuffer->GetD3D12Resource(), 0, bytesPerRow,
                                     rowsPerImage, copySize, dstTexture, dstCopy.aspect);

    // Save tempBuffer into recordingContext so that it is kept referenced after this
    // function returns.
    recordingContext->AddToTempBuffers(std::move(tempBuffer));
}
} // anonymous namespace
class BindGroupStateTracker : public BindGroupTrackerBase<false, uint64_t> {
@ -733,43 +845,8 @@ namespace dawn_native { namespace d3d12 {
subresources);
buffer->TrackUsageAndTransitionNow(commandContext, wgpu::BufferUsage::CopyDst);
const TexelBlockInfo& blockInfo =
texture->GetFormat().GetAspectInfo(copy->source.aspect).block;
// See comments around ComputeTextureCopySplits() for more details.
const TextureCopySplits copySplits = ComputeTextureCopySplits(
copy->source.origin, copy->copySize, blockInfo, copy->destination.offset,
copy->destination.bytesPerRow, copy->destination.rowsPerImage);
const uint64_t bytesPerSlice =
copy->destination.bytesPerRow * copy->destination.rowsPerImage;
// copySplits.copies2D[1] is always calculated for the second copy slice with
// extra "bytesPerSlice" copy offset compared with the first copy slice. So
// here we use an array bufferOffsetsForNextSlice to record the extra offsets
// for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
// the next copy slice that uses copySplits.copies2D[0], and
// bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
// that uses copySplits.copies2D[1].
std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
bufferOffsetsForNextSlice = {{0u, 0u}};
for (uint32_t copySlice = 0; copySlice < copy->copySize.depth; ++copySlice) {
const uint32_t splitIndex = copySlice % copySplits.copies2D.size();
const Texture2DCopySplit& copySplitPerLayerBase =
copySplits.copies2D[splitIndex];
const uint64_t bufferOffsetForNextSlice =
bufferOffsetsForNextSlice[splitIndex];
const uint32_t copyTextureLayer = copySlice + copy->source.origin.z;
RecordCopyTextureToBufferFromTextureCopySplit(
commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
copy->destination.bytesPerRow, texture, copy->source.mipLevel,
copyTextureLayer, subresources.aspects);
bufferOffsetsForNextSlice[splitIndex] +=
bytesPerSlice * copySplits.copies2D.size();
}
CopyTextureToBufferWithCopySplit(commandList, copy->source, copy->destination,
texture, buffer, copy->copySize);
break;
}
@ -809,6 +886,13 @@ namespace dawn_native { namespace d3d12 {
wgpu::TextureUsage::CopyDst, dstRange);
ASSERT(srcRange.aspects == dstRange.aspects);
if (ShouldCopyUsingTemporaryBuffer(GetDevice(), copy->source,
copy->destination)) {
RecordCopyTextureWithTemporaryBuffer(commandContext, copy->source,
copy->destination, copy->copySize);
break;
}
if (CanUseCopyResource(copy->source, copy->destination, copy->copySize)) {
commandList->CopyResource(destination->GetD3D12Resource(),
source->GetD3D12Resource());

View File

@ -112,10 +112,15 @@ namespace dawn_native { namespace d3d12 {
mIsOpen = false;
mSharedTextures.clear();
mHeapsPendingUsage.clear();
mTempBuffers.clear();
}
// Reports the current value of the context's open flag.
bool CommandRecordingContext::IsOpen() const {
    const bool currentlyOpen = mIsOpen;
    return currentlyOpen;
}
// Takes a reference to |tempBuffer| and stores it in mTempBuffers, keeping the buffer
// referenced until that list is cleared.
void CommandRecordingContext::AddToTempBuffers(Ref<Buffer> tempBuffer) {
    // |tempBuffer| is a by-value sink parameter: move it into the vector instead of making
    // another copy of the reference.
    mTempBuffers.emplace_back(std::move(tempBuffer));
}
}} // namespace dawn_native::d3d12

View File

@ -16,6 +16,7 @@
#include "dawn_native/Error.h"
#include "dawn_native/IntegerTypes.h"
#include "dawn_native/d3d12/BufferD3D12.h"
#include "dawn_native/d3d12/d3d12_platform.h"
#include <set>
#include <utility>
@ -41,12 +42,16 @@ namespace dawn_native { namespace d3d12 {
void TrackHeapUsage(Heap* heap, ExecutionSerial serial);
void AddToTempBuffers(Ref<Buffer> tempBuffer);
private:
ComPtr<ID3D12GraphicsCommandList> mD3d12CommandList;
ComPtr<ID3D12GraphicsCommandList4> mD3d12CommandList4;
bool mIsOpen = false;
std::set<Texture*> mSharedTextures;
std::vector<Heap*> mHeapsPendingUsage;
std::vector<Ref<Buffer>> mTempBuffers;
};
}} // namespace dawn_native::d3d12

View File

@ -14,6 +14,7 @@
#include "dawn_native/d3d12/DeviceD3D12.h"
#include "common/GPUInfo.h"
#include "dawn_native/Instance.h"
#include "dawn_native/d3d12/AdapterD3D12.h"
#include "dawn_native/d3d12/BackendD3D12.h"
@ -535,6 +536,20 @@ namespace dawn_native { namespace d3d12 {
// By default use the maximum shader-visible heap size allowed.
SetToggle(Toggle::UseD3D12SmallShaderVisibleHeapForTesting, false);
PCIInfo pciInfo = GetAdapter()->GetPCIInfo();
// Currently this workaround is only needed on Intel Gen9 and Gen9.5 GPUs.
// See http://crbug.com/1161355 for more information.
// TODO(jiawei.shao@intel.com): disable this workaround on the newer drivers when the driver
// bug is fixed.
if (gpu_info::IsIntel(pciInfo.vendorId) &&
(gpu_info::IsSkylake(pciInfo.deviceId) || gpu_info::IsKabylake(pciInfo.deviceId) ||
gpu_info::IsCoffeelake(pciInfo.deviceId))) {
SetToggle(
Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
true);
}
}
MaybeError Device::WaitForIdleForDestruction() {

View File

@ -1617,9 +1617,7 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
// This test can pass on the Windows Intel Vulkan driver version 27.20.100.9168.
// TODO(jiawei.shao@intel.com): enable this test on Intel Vulkan drivers after the upgrade of
// try bots.
// TODO(jiawei.shao@intel.com): enable this test on Intel D3D12 drivers when the workaround is
// implemented.
DAWN_SKIP_TEST_IF((IsD3D12() || (IsVulkan() && IsWindows())) && IsIntel());
DAWN_SKIP_TEST_IF(IsVulkan() && IsWindows() && IsIntel());
constexpr std::array<wgpu::TextureFormat, 11> kFormats = {
{wgpu::TextureFormat::RG8Sint, wgpu::TextureFormat::RG8Uint, wgpu::TextureFormat::RG8Snorm,
@ -1663,12 +1661,15 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
}
}
DAWN_INSTANTIATE_TEST(CopyTests_T2T,
D3D12Backend(),
MetalBackend(),
OpenGLBackend(),
OpenGLESBackend(),
VulkanBackend());
DAWN_INSTANTIATE_TEST(
CopyTests_T2T,
D3D12Backend(),
D3D12Backend(
{"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_level"}),
MetalBackend(),
OpenGLBackend(),
OpenGLESBackend(),
VulkanBackend());
static constexpr uint64_t kSmallBufferSize = 4;
static constexpr uint64_t kLargeBufferSize = 1 << 16;