D3D12: Add workaround for T2T copy issue on Intel GPUs
On Intel Gen9 (SKL) and Gen9.5 (KBL, CFL, CML) GPUs with the latest Intel D3D12 driver (27.20.100.9316), there is a bug in CopyTextureRegion() when performing a texture-to-texture (T2T) copy with formats whose texel block size is less than 4 bytes and whose source mipmap level is greater than the destination mipmap level. This patch works around the driver bug by implementing the T2T copy as one texture-to-buffer (T2B) copy followed by one buffer-to-texture (B2T) copy. BUG=chromium:1161355 TEST=dawn_end2end_tests Change-Id: I688bb8bae277832aaba1be2680012040ee8e1160 Reviewed-on: https://dawn-review.googlesource.com/c/dawn/+/43860 Reviewed-by: Corentin Wallez <cwallez@chromium.org> Commit-Queue: Austin Eng <enga@chromium.org>
This commit is contained in:
parent
9f6bc4e3a9
commit
f905e57be2
|
@ -14,7 +14,31 @@
|
|||
|
||||
#include "common/GPUInfo.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace gpu_info {
|
||||
namespace {
    // PCI device IDs of Intel GPUs, grouped by the architecture names used in this file.
    // The lists are referenced from the following Mesa source code:
    // https://github.com/mesa3d/mesa/blob/master/include/pci_ids/i965_pci_ids.h

    // gen9
    constexpr uint32_t Skylake[] = {
        0x1902, 0x1906, 0x190A, 0x190B, 0x190E, 0x1912, 0x1913, 0x1915, 0x1916,
        0x1917, 0x191A, 0x191B, 0x191D, 0x191E, 0x1921, 0x1923, 0x1926, 0x1927,
        0x192A, 0x192B, 0x192D, 0x1932, 0x193A, 0x193B, 0x193D,
    };

    // gen9p5
    constexpr uint32_t Kabylake[] = {
        0x5916, 0x5913, 0x5906, 0x5926, 0x5921, 0x5915, 0x590E, 0x591E, 0x5912, 0x5917,
        0x5902, 0x591B, 0x593B, 0x590B, 0x591A, 0x590A, 0x591D, 0x5908, 0x5923, 0x5927,
    };

    constexpr uint32_t Coffeelake[] = {
        0x87CA, 0x3E90, 0x3E93, 0x3E99, 0x3E9C, 0x3E91, 0x3E92, 0x3E96, 0x3E98,
        0x3E9A, 0x3E9B, 0x3E94, 0x3EA9, 0x3EA5, 0x3EA6, 0x3EA7, 0x3EA8,
    };

    constexpr uint32_t WhiskyLake[] = {
        0x3EA1, 0x3EA4, 0x3EA0, 0x3EA3, 0x3EA2,
    };

    constexpr uint32_t CometLake[] = {
        0x9B21, 0x9BA0, 0x9BA2, 0x9BA4, 0x9BA5, 0x9BA8, 0x9BAA, 0x9BAB, 0x9BAC,
        0x9B41, 0x9BC0, 0x9BC2, 0x9BC4, 0x9BC5, 0x9BC6, 0x9BC8, 0x9BCA, 0x9BCB,
        0x9BCC, 0x9BE6, 0x9BF6,
    };
}  // anonymous namespace
|
||||
|
||||
// Returns true when |vendorId| equals kVendorID_AMD.
bool IsAMD(PCIVendorID vendorId) {
    const bool isAmdVendor = (vendorId == kVendorID_AMD);
    return isAmdVendor;
}
|
||||
|
@ -39,4 +63,20 @@ namespace gpu_info {
|
|||
// Returns true only when both the vendor ID is kVendorID_Microsoft and the device ID is
// kDeviceID_WARP.
bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId) {
    if (vendorId != kVendorID_Microsoft) {
        return false;
    }
    return deviceId == kDeviceID_WARP;
}
|
||||
|
||||
// Intel GPUs
|
||||
// Returns true when |deviceId| appears in the Skylake device ID list. Only the device ID is
// examined; the vendor ID is not checked here.
bool IsSkylake(PCIDeviceID deviceId) {
    const uint32_t* const first = std::begin(Skylake);
    const uint32_t* const last = std::end(Skylake);
    return std::find(first, last, deviceId) != last;
}
|
||||
// Returns true when |deviceId| appears in the Kabylake device ID list. Only the device ID is
// examined; the vendor ID is not checked here.
bool IsKabylake(PCIDeviceID deviceId) {
    const uint32_t* const first = std::begin(Kabylake);
    const uint32_t* const last = std::end(Kabylake);
    return std::find(first, last, deviceId) != last;
}
|
||||
// Returns true when |deviceId| appears in any of the Coffeelake, WhiskyLake or CometLake
// device ID lists. Only the device ID is examined; the vendor ID is not checked here.
bool IsCoffeelake(PCIDeviceID deviceId) {
    // Membership test shared by the three lists below.
    const auto listContainsDevice = [deviceId](const auto& deviceIds) {
        return std::find(std::begin(deviceIds), std::end(deviceIds), deviceId) !=
               std::end(deviceIds);
    };

    // Lists are consulted in the same order as before, stopping at the first match.
    return listContainsDevice(Coffeelake) || listContainsDevice(WhiskyLake) ||
           listContainsDevice(CometLake);
}
|
||||
} // namespace gpu_info
|
||||
|
|
|
@ -43,5 +43,10 @@ namespace gpu_info {
|
|||
bool IsSwiftshader(PCIVendorID vendorId, PCIDeviceID deviceId);
|
||||
bool IsWARP(PCIVendorID vendorId, PCIDeviceID deviceId);
|
||||
|
||||
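// Intel GPU architectures. These predicates look only at the PCI device ID, so callers
// check the vendor separately (e.g. with IsIntel()). IsCoffeelake() also matches the
// Whiskey Lake and Comet Lake device IDs.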
|
||||
bool IsSkylake(PCIDeviceID deviceId);
|
||||
bool IsKabylake(PCIDeviceID deviceId);
|
||||
bool IsCoffeelake(PCIDeviceID deviceId);
|
||||
|
||||
} // namespace gpu_info
|
||||
#endif // COMMON_GPUINFO_H
|
||||
|
|
|
@ -172,7 +172,17 @@ namespace dawn_native {
|
|||
"If needed, use a compute shader to transform timestamp queries from ticks to "
|
||||
"nanoseconds. This is temporarily needed to avoid requiring Tint to use timestamp "
|
||||
"queries",
|
||||
"https://crbug.com/dawn/686"}}
|
||||
"https://crbug.com/dawn/686"}},
|
||||
{Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
|
||||
{"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_"
|
||||
"level",
|
||||
"Split texture-to-texture copy into two copies: copy from source texture into a "
|
||||
"temporary buffer, and copy from the temporary buffer into the destination texture "
|
||||
"under specific situations. This workaround is by default enabled on some Intel "
|
||||
"GPUs which have a driver bug in the execution of CopyTextureRegion() when we copy "
|
||||
"with the formats whose texel block sizes are less than 4 bytes from a greater mip "
|
||||
"level to a smaller mip level on D3D12 backends.",
|
||||
"https://crbug.com/1161355"}}
|
||||
// Dummy comment to separate the }} so it is clearer what to copy-paste to add a toggle.
|
||||
}};
|
||||
|
||||
|
|
|
@ -51,6 +51,7 @@ namespace dawn_native {
|
|||
UseTintGenerator,
|
||||
FlushBeforeClientWaitSync,
|
||||
ConvertTimestampsToNanoseconds,
|
||||
UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
|
||||
|
||||
EnumCount,
|
||||
InvalidEnum = EnumCount,
|
||||
|
|
|
@ -118,6 +118,47 @@ namespace dawn_native { namespace d3d12 {
|
|||
}
|
||||
}
|
||||
|
||||
// Records the commands that copy a region of |texture| into |buffer|, one copy slice at a
// time, using the splits produced by ComputeTextureCopySplits().
//
// |textureCopy| supplies the texture origin, mip level and aspect. |bufferCopy| supplies the
// buffer offset, bytesPerRow and rowsPerImage. |copySize| is the extent of the copy;
// copySize.depth is the number of slices that get recorded.
void CopyTextureToBufferWithCopySplit(ID3D12GraphicsCommandList* commandList,
                                      const TextureCopy& textureCopy,
                                      const BufferCopy& bufferCopy,
                                      Texture* texture,
                                      Buffer* buffer,
                                      const Extent3D& copySize) {
    const TexelBlockInfo& blockInfo =
        texture->GetFormat().GetAspectInfo(textureCopy.aspect).block;

    // See comments around ComputeTextureCopySplits() for more details.
    const TextureCopySplits copySplits =
        ComputeTextureCopySplits(textureCopy.origin, copySize, blockInfo, bufferCopy.offset,
                                 bufferCopy.bytesPerRow, bufferCopy.rowsPerImage);

    // Widen one operand before multiplying so the product is computed in 64 bits and cannot
    // wrap around when bytesPerRow and rowsPerImage are narrower integers.
    const uint64_t bytesPerSlice =
        static_cast<uint64_t>(bufferCopy.bytesPerRow) * bufferCopy.rowsPerImage;

    // copySplits.copies2D[1] is always calculated for the second copy slice with
    // extra "bytesPerSlice" copy offset compared with the first copy slice. So
    // here we use an array bufferOffsetsForNextSlice to record the extra offsets
    // for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
    // the next copy slice that uses copySplits.copies2D[0], and
    // bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
    // that uses copySplits.copies2D[1].
    std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
        bufferOffsetsForNextSlice = {{0u, 0u}};
    for (uint32_t copySlice = 0; copySlice < copySize.depth; ++copySlice) {
        // Slices alternate between the precomputed splits.
        const uint32_t splitIndex = copySlice % copySplits.copies2D.size();

        const Texture2DCopySplit& copySplitPerLayerBase = copySplits.copies2D[splitIndex];
        const uint64_t bufferOffsetForNextSlice = bufferOffsetsForNextSlice[splitIndex];
        const uint32_t copyTextureLayer = copySlice + textureCopy.origin.z;

        RecordCopyTextureToBufferFromTextureCopySplit(
            commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
            bufferCopy.bytesPerRow, texture, textureCopy.mipLevel, copyTextureLayer,
            textureCopy.aspect);

        // The same split is reused copies2D.size() slices later, so advance its extra
        // offset by that many slices' worth of bytes.
        bufferOffsetsForNextSlice[splitIndex] += bytesPerSlice * copySplits.copies2D.size();
    }
}
|
||||
|
||||
void RecordWriteTimestampCmd(ID3D12GraphicsCommandList* commandList,
|
||||
WriteTimestampCmd* cmd) {
|
||||
QuerySet* querySet = ToBackend(cmd->querySet.Get());
|
||||
|
@ -148,6 +189,77 @@ namespace dawn_native { namespace d3d12 {
|
|||
commandList->SetGraphicsRoot32BitConstants(layout->GetFirstIndexOffsetParameterIndex(),
|
||||
count, offsets.data(), 0);
|
||||
}
|
||||
|
||||
// Decides whether a texture-to-texture copy must be routed through a temporary buffer.
// Returns true only when the workaround toggle is enabled on |device|, the source mip level
// is greater than the destination mip level, and the copied aspect's texel block is smaller
// than 4 bytes.
bool ShouldCopyUsingTemporaryBuffer(DeviceBase* device,
                                    const TextureCopy& srcCopy,
                                    const TextureCopy& dstCopy) {
    // Currently we only need the workaround for an Intel D3D12 driver issue.
    const bool workaroundEnabled = device->IsToggleEnabled(
        Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel);
    if (!workaroundEnabled) {
        return false;
    }

    const bool sourceMipIsGreater = srcCopy.mipLevel > dstCopy.mipLevel;
    ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);

    // GetAspectInfo(aspect) requires HasOneBit(aspect) == true, plus the texel block
    // sizes of depth stencil formats are always no less than 4 bytes.
    bool texelBlockIsSmall = false;
    if (HasOneBit(srcCopy.aspect)) {
        texelBlockIsSmall =
            srcCopy.texture->GetFormat().GetAspectInfo(srcCopy.aspect).block.byteSize < 4u;
    }

    return sourceMipIsGreater && texelBlockIsSmall;
}
|
||||
|
||||
// Implements a texture-to-texture copy as two copies: source texture -> temporary buffer,
// then temporary buffer -> destination texture.
//
// |srcCopy| and |dstCopy| must use the same texture format and the same aspect. |copySize|
// is the extent of the copy, with copySize.depth slices. The temporary buffer is handed to
// |recordingContext| at the end, which keeps a reference to it.
void RecordCopyTextureWithTemporaryBuffer(CommandRecordingContext* recordingContext,
                                          const TextureCopy& srcCopy,
                                          const TextureCopy& dstCopy,
                                          const Extent3D& copySize) {
    ASSERT(srcCopy.texture->GetFormat().format == dstCopy.texture->GetFormat().format);
    ASSERT(srcCopy.aspect == dstCopy.aspect);

    // An empty copy moves no texels. Returning here also keeps the size computation below
    // from underflowing on "rowCount - 1".
    if (copySize.width == 0 || copySize.height == 0 || copySize.depth == 0) {
        return;
    }

    const dawn_native::Format& format = srcCopy.texture->GetFormat();
    const TexelBlockInfo& blockInfo = format.GetAspectInfo(srcCopy.aspect).block;
    ASSERT(copySize.width % blockInfo.width == 0);
    const uint32_t widthInBlocks = copySize.width / blockInfo.width;
    ASSERT(copySize.height % blockInfo.height == 0);
    const uint32_t heightInBlocks = copySize.height / blockInfo.height;

    // Create tempBuffer
    const uint32_t bytesPerRow =
        Align(blockInfo.byteSize * widthInBlocks, kTextureBytesPerRowAlignment);
    const uint32_t rowsPerImage = heightInBlocks;

    // The buffer holds copySize.depth slices of rowsPerImage (== heightInBlocks) rows each.
    // Every row except the very last one occupies a full bytesPerRow; the last row only
    // needs its own texel data, rounded up to 4 bytes. The arithmetic is done in 64 bits so
    // that large copies cannot wrap around.
    const uint64_t rowCount = static_cast<uint64_t>(heightInBlocks) * copySize.depth;
    const uint64_t lastRowBytes = Align(blockInfo.byteSize * widthInBlocks, 4);
    const uint64_t tempBufferSize =
        static_cast<uint64_t>(bytesPerRow) * (rowCount - 1) + lastRowBytes;

    BufferDescriptor tempBufferDescriptor;
    tempBufferDescriptor.usage = wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
    tempBufferDescriptor.size = tempBufferSize;
    Device* device = ToBackend(srcCopy.texture->GetDevice());
    Ref<Buffer> tempBuffer =
        AcquireRef(ToBackend(device->CreateBuffer(&tempBufferDescriptor)));

    // Copy from source texture into tempBuffer
    Texture* srcTexture = ToBackend(srcCopy.texture).Get();
    tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopyDst);
    BufferCopy bufferCopy;
    bufferCopy.buffer = tempBuffer;
    bufferCopy.offset = 0;
    bufferCopy.bytesPerRow = bytesPerRow;
    bufferCopy.rowsPerImage = rowsPerImage;
    CopyTextureToBufferWithCopySplit(recordingContext->GetCommandList(), srcCopy,
                                     bufferCopy, srcTexture, tempBuffer.Get(), copySize);

    // Copy from tempBuffer into destination texture
    tempBuffer->TrackUsageAndTransitionNow(recordingContext, wgpu::BufferUsage::CopySrc);
    Texture* dstTexture = ToBackend(dstCopy.texture).Get();
    CopyBufferToTextureWithCopySplit(recordingContext, dstCopy,
                                     tempBuffer->GetD3D12Resource(), 0, bytesPerRow,
                                     rowsPerImage, copySize, dstTexture, dstCopy.aspect);

    // Save tempBuffer into recordingContext
    // NOTE(review): presumably this keeps the buffer alive until the recorded commands have
    // been submitted; confirm against where the context releases its temp buffers.
    recordingContext->AddToTempBuffers(std::move(tempBuffer));
}
|
||||
} // anonymous namespace
|
||||
|
||||
class BindGroupStateTracker : public BindGroupTrackerBase<false, uint64_t> {
|
||||
|
@ -733,43 +845,8 @@ namespace dawn_native { namespace d3d12 {
|
|||
subresources);
|
||||
buffer->TrackUsageAndTransitionNow(commandContext, wgpu::BufferUsage::CopyDst);
|
||||
|
||||
const TexelBlockInfo& blockInfo =
|
||||
texture->GetFormat().GetAspectInfo(copy->source.aspect).block;
|
||||
|
||||
// See comments around ComputeTextureCopySplits() for more details.
|
||||
const TextureCopySplits copySplits = ComputeTextureCopySplits(
|
||||
copy->source.origin, copy->copySize, blockInfo, copy->destination.offset,
|
||||
copy->destination.bytesPerRow, copy->destination.rowsPerImage);
|
||||
|
||||
const uint64_t bytesPerSlice =
|
||||
copy->destination.bytesPerRow * copy->destination.rowsPerImage;
|
||||
|
||||
// copySplits.copies2D[1] is always calculated for the second copy slice with
|
||||
// extra "bytesPerSlice" copy offset compared with the first copy slice. So
|
||||
// here we use an array bufferOffsetsForNextSlice to record the extra offsets
|
||||
// for each copy slice: bufferOffsetsForNextSlice[0] is the extra offset for
|
||||
// the next copy slice that uses copySplits.copies2D[0], and
|
||||
// bufferOffsetsForNextSlice[1] is the extra offset for the next copy slice
|
||||
// that uses copySplits.copies2D[1].
|
||||
std::array<uint64_t, TextureCopySplits::kMaxTextureCopySplits>
|
||||
bufferOffsetsForNextSlice = {{0u, 0u}};
|
||||
for (uint32_t copySlice = 0; copySlice < copy->copySize.depth; ++copySlice) {
|
||||
const uint32_t splitIndex = copySlice % copySplits.copies2D.size();
|
||||
|
||||
const Texture2DCopySplit& copySplitPerLayerBase =
|
||||
copySplits.copies2D[splitIndex];
|
||||
const uint64_t bufferOffsetForNextSlice =
|
||||
bufferOffsetsForNextSlice[splitIndex];
|
||||
const uint32_t copyTextureLayer = copySlice + copy->source.origin.z;
|
||||
|
||||
RecordCopyTextureToBufferFromTextureCopySplit(
|
||||
commandList, copySplitPerLayerBase, buffer, bufferOffsetForNextSlice,
|
||||
copy->destination.bytesPerRow, texture, copy->source.mipLevel,
|
||||
copyTextureLayer, subresources.aspects);
|
||||
|
||||
bufferOffsetsForNextSlice[splitIndex] +=
|
||||
bytesPerSlice * copySplits.copies2D.size();
|
||||
}
|
||||
CopyTextureToBufferWithCopySplit(commandList, copy->source, copy->destination,
|
||||
texture, buffer, copy->copySize);
|
||||
|
||||
break;
|
||||
}
|
||||
|
@ -809,6 +886,13 @@ namespace dawn_native { namespace d3d12 {
|
|||
wgpu::TextureUsage::CopyDst, dstRange);
|
||||
|
||||
ASSERT(srcRange.aspects == dstRange.aspects);
|
||||
if (ShouldCopyUsingTemporaryBuffer(GetDevice(), copy->source,
|
||||
copy->destination)) {
|
||||
RecordCopyTextureWithTemporaryBuffer(commandContext, copy->source,
|
||||
copy->destination, copy->copySize);
|
||||
break;
|
||||
}
|
||||
|
||||
if (CanUseCopyResource(copy->source, copy->destination, copy->copySize)) {
|
||||
commandList->CopyResource(destination->GetD3D12Resource(),
|
||||
source->GetD3D12Resource());
|
||||
|
|
|
@ -112,10 +112,15 @@ namespace dawn_native { namespace d3d12 {
|
|||
mIsOpen = false;
|
||||
mSharedTextures.clear();
|
||||
mHeapsPendingUsage.clear();
|
||||
mTempBuffers.clear();
|
||||
}
|
||||
|
||||
bool CommandRecordingContext::IsOpen() const {
|
||||
return mIsOpen;
|
||||
}
|
||||
|
||||
// Appends |tempBuffer| to mTempBuffers so this context holds a reference to it.
void CommandRecordingContext::AddToTempBuffers(Ref<Buffer> tempBuffer) {
    // |tempBuffer| is already our own by-value copy, so move it into the vector instead of
    // copying it again (which would cost an extra reference count increment/decrement).
    mTempBuffers.emplace_back(std::move(tempBuffer));
}
|
||||
|
||||
}} // namespace dawn_native::d3d12
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "dawn_native/Error.h"
|
||||
#include "dawn_native/IntegerTypes.h"
|
||||
#include "dawn_native/d3d12/BufferD3D12.h"
|
||||
#include "dawn_native/d3d12/d3d12_platform.h"
|
||||
|
||||
#include <set>
#include <utility>
|
||||
|
@ -41,12 +42,16 @@ namespace dawn_native { namespace d3d12 {
|
|||
|
||||
void TrackHeapUsage(Heap* heap, ExecutionSerial serial);
|
||||
|
||||
void AddToTempBuffers(Ref<Buffer> tempBuffer);
|
||||
|
||||
private:
|
||||
ComPtr<ID3D12GraphicsCommandList> mD3d12CommandList;
|
||||
ComPtr<ID3D12GraphicsCommandList4> mD3d12CommandList4;
|
||||
bool mIsOpen = false;
|
||||
std::set<Texture*> mSharedTextures;
|
||||
std::vector<Heap*> mHeapsPendingUsage;
|
||||
|
||||
std::vector<Ref<Buffer>> mTempBuffers;
|
||||
};
|
||||
}} // namespace dawn_native::d3d12
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "dawn_native/d3d12/DeviceD3D12.h"
|
||||
|
||||
#include "common/GPUInfo.h"
|
||||
#include "dawn_native/Instance.h"
|
||||
#include "dawn_native/d3d12/AdapterD3D12.h"
|
||||
#include "dawn_native/d3d12/BackendD3D12.h"
|
||||
|
@ -535,6 +536,20 @@ namespace dawn_native { namespace d3d12 {
|
|||
|
||||
// By default use the maximum shader-visible heap size allowed.
|
||||
SetToggle(Toggle::UseD3D12SmallShaderVisibleHeapForTesting, false);
|
||||
|
||||
PCIInfo pciInfo = GetAdapter()->GetPCIInfo();
|
||||
|
||||
// Currently this workaround is only needed on Intel Gen9 and Gen9.5 GPUs.
|
||||
// See http://crbug.com/1161355 for more information.
|
||||
// TODO(jiawei.shao@intel.com): disable this workaround on the newer drivers when the driver
|
||||
// bug is fixed.
|
||||
if (gpu_info::IsIntel(pciInfo.vendorId) &&
|
||||
(gpu_info::IsSkylake(pciInfo.deviceId) || gpu_info::IsKabylake(pciInfo.deviceId) ||
|
||||
gpu_info::IsCoffeelake(pciInfo.deviceId))) {
|
||||
SetToggle(
|
||||
Toggle::UseTempBufferInSmallFormatTextureToTextureCopyFromGreaterToLessMipLevel,
|
||||
true);
|
||||
}
|
||||
}
|
||||
|
||||
MaybeError Device::WaitForIdleForDestruction() {
|
||||
|
|
|
@ -1617,9 +1617,7 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
|
|||
// This test can pass on the Windows Intel Vulkan driver version 27.20.100.9168.
|
||||
// TODO(jiawei.shao@intel.com): enable this test on Intel Vulkan drivers after the upgrade of
|
||||
// try bots.
|
||||
// TODO(jiawei.shao@intel.com): enable this test on Intel D3D12 drivers when the workaround is
|
||||
// implemented.
|
||||
DAWN_SKIP_TEST_IF((IsD3D12() || (IsVulkan() && IsWindows())) && IsIntel());
|
||||
DAWN_SKIP_TEST_IF(IsVulkan() && IsWindows() && IsIntel());
|
||||
|
||||
constexpr std::array<wgpu::TextureFormat, 11> kFormats = {
|
||||
{wgpu::TextureFormat::RG8Sint, wgpu::TextureFormat::RG8Uint, wgpu::TextureFormat::RG8Snorm,
|
||||
|
@ -1663,12 +1661,15 @@ TEST_P(CopyTests_T2T, CopyFromNonZeroMipLevelWithTexelBlockSizeLessThan4Bytes) {
|
|||
}
|
||||
}
|
||||
|
||||
DAWN_INSTANTIATE_TEST(CopyTests_T2T,
|
||||
D3D12Backend(),
|
||||
MetalBackend(),
|
||||
OpenGLBackend(),
|
||||
OpenGLESBackend(),
|
||||
VulkanBackend());
|
||||
DAWN_INSTANTIATE_TEST(
|
||||
CopyTests_T2T,
|
||||
D3D12Backend(),
|
||||
D3D12Backend(
|
||||
{"use_temp_buffer_in_small_format_texture_to_texture_copy_from_greater_to_less_mip_level"}),
|
||||
MetalBackend(),
|
||||
OpenGLBackend(),
|
||||
OpenGLESBackend(),
|
||||
VulkanBackend());
|
||||
|
||||
static constexpr uint64_t kSmallBufferSize = 4;
|
||||
static constexpr uint64_t kLargeBufferSize = 1 << 16;
|
||||
|
|
Loading…
Reference in New Issue