dawn-cmake/src/dawn/native/vulkan/BufferVk.cpp

// Copyright 2017 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dawn/native/vulkan/BufferVk.h"

#include "dawn/native/CommandBuffer.h"
#include "dawn/native/vulkan/DeviceVk.h"
#include "dawn/native/vulkan/FencedDeleter.h"
#include "dawn/native/vulkan/ResourceHeapVk.h"
#include "dawn/native/vulkan/ResourceMemoryAllocatorVk.h"
#include "dawn/native/vulkan/UtilsVulkan.h"
#include "dawn/native/vulkan/VulkanError.h"

#include <algorithm>
#include <cstring>
#include <limits>
#include <utility>

namespace dawn::native::vulkan {

    namespace {

        // Maps a combination of WebGPU buffer usages to the VkBufferUsageFlags bits that
        // correspond to them. Usages without a matching bit below contribute nothing.
        VkBufferUsageFlags VulkanBufferUsage(wgpu::BufferUsage usage) {
            // True when |usage| has at least one bit in common with |mask|.
            const auto usedAs = [usage](wgpu::BufferUsage mask) {
                return static_cast<bool>(usage & mask);
            };

            VkBufferUsageFlags vkUsage = 0;

            if (usedAs(wgpu::BufferUsage::CopySrc)) {
                vkUsage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
            }
            // CopyDst and QueryResolve both map to the transfer-destination bit.
            if (usedAs(wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::QueryResolve)) {
                vkUsage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Index)) {
                vkUsage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Vertex)) {
                vkUsage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Uniform)) {
                vkUsage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
            }
            // Public, internal and read-only storage usages all need the storage buffer bit.
            if (usedAs(wgpu::BufferUsage::Storage | kInternalStorageBuffer |
                       kReadOnlyStorageBuffer)) {
                vkUsage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Indirect)) {
                vkUsage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
            }

            return vkUsage;
        }

        // Computes the set of pipeline stages associated with the given buffer usages. The
        // result is used as a stage mask when building buffer barriers.
        VkPipelineStageFlags VulkanPipelineStage(wgpu::BufferUsage usage) {
            // True when |usage| has at least one bit in common with |mask|.
            const auto usedAs = [usage](wgpu::BufferUsage mask) {
                return static_cast<bool>(usage & mask);
            };

            VkPipelineStageFlags stages = 0;

            if (usedAs(kMappableBufferUsages)) {
                stages |= VK_PIPELINE_STAGE_HOST_BIT;
            }
            // Copies in either direction and query resolves all map to the transfer stage.
            if (usedAs(wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst |
                       wgpu::BufferUsage::QueryResolve)) {
                stages |= VK_PIPELINE_STAGE_TRANSFER_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Index | wgpu::BufferUsage::Vertex)) {
                stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
            }
            // Uniform and storage usages map to the vertex, fragment and compute shader stages.
            if (usedAs(wgpu::BufferUsage::Uniform | wgpu::BufferUsage::Storage |
                       kInternalStorageBuffer | kReadOnlyStorageBuffer)) {
                stages |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT;
                stages |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
                stages |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Indirect)) {
                stages |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
            }

            return stages;
        }

        // Computes the Vulkan access mask matching the given buffer usages. The result is used
        // as the source/destination access mask of buffer memory barriers.
        VkAccessFlags VulkanAccessFlags(wgpu::BufferUsage usage) {
            // True when |usage| has at least one bit in common with |mask|.
            const auto usedAs = [usage](wgpu::BufferUsage mask) {
                return static_cast<bool>(usage & mask);
            };

            VkAccessFlags access = 0;

            // Host accesses through mapping.
            if (usedAs(wgpu::BufferUsage::MapRead)) {
                access |= VK_ACCESS_HOST_READ_BIT;
            }
            if (usedAs(wgpu::BufferUsage::MapWrite)) {
                access |= VK_ACCESS_HOST_WRITE_BIT;
            }

            // Transfer accesses. QueryResolve maps to a transfer write, just like CopyDst.
            if (usedAs(wgpu::BufferUsage::CopySrc)) {
                access |= VK_ACCESS_TRANSFER_READ_BIT;
            }
            if (usedAs(wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::QueryResolve)) {
                access |= VK_ACCESS_TRANSFER_WRITE_BIT;
            }

            // Fixed-function reads.
            if (usedAs(wgpu::BufferUsage::Index)) {
                access |= VK_ACCESS_INDEX_READ_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Vertex)) {
                access |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Indirect)) {
                access |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
            }

            // Shader accesses: every storage flavour is readable, but only the writable ones
            // (public and internal storage) also get the shader write bit.
            if (usedAs(wgpu::BufferUsage::Uniform)) {
                access |= VK_ACCESS_UNIFORM_READ_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Storage | kInternalStorageBuffer |
                       kReadOnlyStorageBuffer)) {
                access |= VK_ACCESS_SHADER_READ_BIT;
            }
            if (usedAs(wgpu::BufferUsage::Storage | kInternalStorageBuffer)) {
                access |= VK_ACCESS_SHADER_WRITE_BIT;
            }

            return access;
        }

    }  // namespace

    // static
    // Constructs a Buffer for |device| from |descriptor| and runs Initialize() on it. Returns
    // the new buffer on success, or the error produced by Initialize().
    ResultOrError<Ref<Buffer>> Buffer::Create(Device* device, const BufferDescriptor* descriptor) {
        Ref<Buffer> newBuffer = AcquireRef(new Buffer(device, descriptor));

        const bool mappedAtCreation = descriptor->mappedAtCreation;
        DAWN_TRY(newBuffer->Initialize(mappedAtCreation));

        return std::move(newBuffer);
    }

    // Creates the VkBuffer, allocates and binds memory for it, and records the creation-time
    // clears that are enabled by toggles.
    //
    // The allocated size is the requested size, plus one extra byte for vertex/index buffers,
    // raised to at least 4 bytes and aligned up to a multiple of 4.
    //
    // |mappedAtCreation|: when true, the creation-time clears at the end of this function are
    // skipped.
    //
    // Returns an out-of-memory error if the padded size would overflow or has one of its two top
    // bits set, and propagates errors from vkCreateBuffer, the memory allocator and
    // vkBindBufferMemory.
    MaybeError Buffer::Initialize(bool mappedAtCreation) {
        // vkCmdFillBuffer requires the size to be a multiple of 4.
        constexpr size_t kAlignment = 4u;

        uint32_t extraBytes = 0u;
        if (GetUsage() & (wgpu::BufferUsage::Vertex | wgpu::BufferUsage::Index)) {
            // vkCmdSetIndexBuffer and vkCmdSetVertexBuffer are invalid if the offset
            // is equal to the whole buffer size. Allocate at least one more byte so it
            // is valid to setVertex/IndexBuffer with a zero-sized range at the end
            // of the buffer with (offset=buffer.size, size=0).
            extraBytes = 1u;
        }

        uint64_t size = GetSize();
        if (size > std::numeric_limits<uint64_t>::max() - extraBytes) {
            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
        }

        size += extraBytes;

        // Allocate at least 4 bytes so clamped accesses are always in bounds.
        // Also, Vulkan requires the size to be non-zero.
        size = std::max(size, uint64_t(4u));

        if (size > std::numeric_limits<uint64_t>::max() - kAlignment) {
            // Alignment would overflow.
            return DAWN_OUT_OF_MEMORY_ERROR("Buffer allocation is too large");
        }
        mAllocatedSize = Align(size, kAlignment);

        // Avoid passing ludicrously large sizes to drivers because it causes issues: drivers add
        // some constants to the size passed and align it, but for values close to the maximum
        // VkDeviceSize this can cause overflows and makes drivers crash or return bad sizes in the
        // VkMemoryRequirements. See https://gitlab.khronos.org/vulkan/vulkan/issues/1904
        // Any size with one of two top bits of VkDeviceSize set is a HUGE allocation and we can
        // safely return an OOM error.
        if (mAllocatedSize & (uint64_t(3) << uint64_t(62))) {
            return DAWN_OUT_OF_MEMORY_ERROR("Buffer size is HUGE and could cause overflows");
        }

        VkBufferCreateInfo createInfo;
        createInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
        createInfo.pNext = nullptr;
        createInfo.flags = 0;
        createInfo.size = mAllocatedSize;
        // Add CopyDst for non-mappable buffer initialization with mappedAtCreation
        // and robust resource initialization.
        createInfo.usage = VulkanBufferUsage(GetUsage() | wgpu::BufferUsage::CopyDst);
        createInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
        createInfo.queueFamilyIndexCount = 0;
        createInfo.pQueueFamilyIndices = nullptr;

        Device* device = ToBackend(GetDevice());
        DAWN_TRY(CheckVkOOMThenSuccess(
            device->fn.CreateBuffer(device->GetVkDevice(), &createInfo, nullptr, &*mHandle),
            "vkCreateBuffer"));

        // Gather requirements for the buffer's memory and allocate it.
        VkMemoryRequirements requirements;
        device->fn.GetBufferMemoryRequirements(device->GetVkDevice(), mHandle, &requirements);

        // Mappable buffers request the mappable kind of linear memory.
        MemoryKind requestKind = MemoryKind::Linear;
        if (GetUsage() & kMappableBufferUsages) {
            requestKind = MemoryKind::LinearMappable;
        }
        DAWN_TRY_ASSIGN(mMemoryAllocation,
                        device->GetResourceMemoryAllocator()->Allocate(requirements, requestKind));

        // Finally associate it with the buffer.
        DAWN_TRY(CheckVkSuccess(
            device->fn.BindBufferMemory(device->GetVkDevice(), mHandle,
                                        ToBackend(mMemoryAllocation.GetResourceHeap())->GetMemory(),
                                        mMemoryAllocation.GetOffset()),
            "vkBindBufferMemory"));

        // The buffers with mappedAtCreation == true will be initialized in
        // BufferBase::MapAtCreation().
        if (device->IsToggleEnabled(Toggle::NonzeroClearResourcesOnCreationForTesting) &&
            !mappedAtCreation) {
            ClearBuffer(device->GetPendingRecordingContext(), 0x01010101);
        }

        // Initialize the padding bytes to zero.
        if (device->IsToggleEnabled(Toggle::LazyClearResourceOnFirstUse) && !mappedAtCreation) {
            // Both sizes are 64-bit, so keep the difference in 64 bits rather than narrowing it.
            const uint64_t paddingBytes = GetAllocatedSize() - GetSize();
            if (paddingBytes > 0) {
                // The fill size must be a multiple of 4, so round the padding up and clear the
                // tail of the allocation that covers it.
                const uint64_t clearSize = Align(paddingBytes, 4);
                const uint64_t clearOffset = GetAllocatedSize() - clearSize;

                CommandRecordingContext* recordingContext = device->GetPendingRecordingContext();
                ClearBuffer(recordingContext, 0, clearOffset, clearSize);
            }
        }

        SetLabelImpl();

        return {};
    }

    // Defaulted destructor: it performs no explicit teardown itself. The Vulkan handle and the
    // memory allocation are released in DestroyImpl().
    Buffer::~Buffer() = default;

    // Returns the Vulkan buffer handle held by this object.
    VkBuffer Buffer::GetHandle() const {
        const VkBuffer handle = mHandle;
        return handle;
    }

    // Moves the buffer's tracked usage to |usage| and, when a barrier is required for that
    // transition, records it immediately into |recordingContext|'s command buffer.
    void Buffer::TransitionUsageNow(CommandRecordingContext* recordingContext,
                                    wgpu::BufferUsage usage) {
        VkPipelineStageFlags srcStageMask = 0;
        VkPipelineStageFlags dstStageMask = 0;
        VkBufferMemoryBarrier bufferBarrier;

        const bool needsBarrier = TransitionUsageAndGetResourceBarrier(
            usage, &bufferBarrier, &srcStageMask, &dstStageMask);
        if (!needsBarrier) {
            // The barrier struct was not filled in; there is nothing to record.
            return;
        }

        ASSERT(srcStageMask != 0 && dstStageMask != 0);

        Device* device = ToBackend(GetDevice());
        device->fn.CmdPipelineBarrier(recordingContext->commandBuffer, srcStageMask, dstStageMask,
                                      0, 0, nullptr, 1u, &bufferBarrier, 0, nullptr);
    }

    // Updates the tracked usage of the buffer to |usage| and, if a barrier is needed, fills
    // |barrier| and ORs the relevant stages into |srcStages| / |dstStages|.
    //
    // Returns true when |barrier| was filled in and must be recorded by the caller, false when
    // no barrier is needed (in which case |barrier| and the stage masks are left untouched).
    bool Buffer::TransitionUsageAndGetResourceBarrier(wgpu::BufferUsage usage,
                                                      VkBufferMemoryBarrier* barrier,
                                                      VkPipelineStageFlags* srcStages,
                                                      VkPipelineStageFlags* dstStages) {
        // We can skip transitions to already current read-only usages. The tracked usage is
        // intentionally left as-is in that case.
        if (IsSubset(usage, mLastUsage) && IsSubset(mLastUsage, kReadOnlyBufferUsages)) {
            return false;
        }

        // From here on the tracked usage always becomes |usage|.
        const wgpu::BufferUsage previousUsage = mLastUsage;
        mLastUsage = usage;

        // Special-case for the initial transition: Vulkan doesn't allow access flags to be 0.
        if (previousUsage == wgpu::BufferUsage::None) {
            return false;
        }

        *srcStages |= VulkanPipelineStage(previousUsage);
        *dstStages |= VulkanPipelineStage(usage);

        barrier->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barrier->pNext = nullptr;
        barrier->srcAccessMask = VulkanAccessFlags(previousUsage);
        barrier->dstAccessMask = VulkanAccessFlags(usage);
        barrier->srcQueueFamilyIndex = 0;
        barrier->dstQueueFamilyIndex = 0;
        barrier->buffer = mHandle;
        barrier->offset = 0;
        // VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
        barrier->size = GetAllocatedSize();

        return true;
    }

    // Reports whether the buffer's memory allocation exposes a mapped pointer.
    bool Buffer::IsCPUWritableAtCreation() const {
        // TODO(enga): Handle CPU-visible memory on UMA
        const bool hasMappedPointer = mMemoryAllocation.GetMappedPointer() != nullptr;
        return hasMappedPointer;
    }

    // Backend hook for mapping at creation. No work is done here; it always returns success.
    MaybeError Buffer::MapAtCreationImpl() {
        return {};
    }

    // Prepares the buffer for mapping: makes sure its data is initialized, then transitions it
    // to the MapRead or MapWrite usage depending on |mode|. |offset| and |size| are not used by
    // this implementation. Always returns success.
    MaybeError Buffer::MapAsyncImpl(wgpu::MapMode mode, size_t offset, size_t size) {
        CommandRecordingContext* recordingContext =
            ToBackend(GetDevice())->GetPendingRecordingContext();

        // TODO(crbug.com/dawn/852): initialize mapped buffer in CPU side.
        EnsureDataInitialized(recordingContext);

        // Read takes precedence; anything else is expected to be a write mapping.
        wgpu::BufferUsage mapUsage = wgpu::BufferUsage::MapWrite;
        if (mode & wgpu::MapMode::Read) {
            mapUsage = wgpu::BufferUsage::MapRead;
        } else {
            ASSERT(mode & wgpu::MapMode::Write);
        }

        TransitionUsageNow(recordingContext, mapUsage);
        return {};
    }

    // Backend hook for unmapping. Intentionally empty.
    void Buffer::UnmapImpl() {
        // No need to do anything, we keep CPU-visible memory mapped at all time.
    }

    // Returns the mapped pointer of the buffer's memory allocation. The allocation is expected
    // to have one (asserted).
    void* Buffer::GetMappedPointerImpl() {
        uint8_t* mappedPointer = mMemoryAllocation.GetMappedPointer();
        ASSERT(mappedPointer != nullptr);
        return mappedPointer;
    }

    void Buffer::DestroyImpl() {
        BufferBase::DestroyImpl();

        ToBackend(GetDevice())->GetResourceMemoryAllocator()->Deallocate(&mMemoryAllocation);

        if (mHandle != VK_NULL_HANDLE) {
            ToBackend(GetDevice())->GetFencedDeleter()->DeleteWhenUnused(mHandle);
            mHandle = VK_NULL_HANDLE;
        }
    }

    // Zero-initializes the buffer if it still needs initialization. Returns true when a clear
    // was recorded into |recordingContext|, false when nothing had to be done.
    bool Buffer::EnsureDataInitialized(CommandRecordingContext* recordingContext) {
        const bool mustClear = NeedsInitialization();
        if (mustClear) {
            InitializeToZero(recordingContext);
        }
        return mustClear;
    }

    // Makes sure the buffer is initialized before the range [offset, offset + size) is written.
    // If the write covers the full buffer range, the buffer is simply marked initialized;
    // otherwise it is cleared to zero first. Returns true only when a clear was recorded.
    bool Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* recordingContext,
                                                    uint64_t offset,
                                                    uint64_t size) {
        if (!NeedsInitialization()) {
            return false;
        }

        const bool overwritesWholeBuffer = IsFullBufferRange(offset, size);
        if (overwritesWholeBuffer) {
            // The upcoming write replaces everything, so no clear is needed.
            SetIsDataInitialized();
        } else {
            InitializeToZero(recordingContext);
        }
        return !overwritesWholeBuffer;
    }

    // Makes sure the buffer is initialized before it is used as the destination of the
    // texture-to-buffer copy |copy|. If the copy overwrites the full buffer, the buffer is simply
    // marked initialized; otherwise it is cleared to zero first. Returns true only when a clear
    // was recorded.
    bool Buffer::EnsureDataInitializedAsDestination(CommandRecordingContext* recordingContext,
                                                    const CopyTextureToBufferCmd* copy) {
        if (!NeedsInitialization()) {
            return false;
        }

        const bool overwritesWholeBuffer = IsFullBufferOverwrittenInTextureToBufferCopy(copy);
        if (overwritesWholeBuffer) {
            // The copy replaces everything, so no clear is needed.
            SetIsDataInitialized();
        } else {
            InitializeToZero(recordingContext);
        }
        return !overwritesWholeBuffer;
    }

    void Buffer::SetLabelImpl() {
        SetDebugName(ToBackend(GetDevice()), mHandle, "Dawn_Buffer", GetLabel());
    }

    // Clears the buffer to zero, bumps the device's lazy-clear counter used for testing, and
    // marks the buffer's data as initialized. Must only be called while the buffer still needs
    // initialization (asserted).
    void Buffer::InitializeToZero(CommandRecordingContext* recordingContext) {
        ASSERT(NeedsInitialization());

        constexpr uint32_t kZeroClearValue = 0u;
        ClearBuffer(recordingContext, kZeroClearValue);

        GetDevice()->IncrementLazyClearCountForTesting();
        SetIsDataInitialized();
    }

    // Records a fill of |size| bytes starting at |offset| with the 32-bit pattern |clearValue|,
    // after transitioning the buffer to the CopyDst usage. A |size| of 0 is replaced by the
    // buffer's allocated size. The resulting fill size must be a non-zero multiple of 4
    // (asserted).
    //
    // NOTE(review): when |size| is 0 and |offset| is non-zero, the fill covers the full allocated
    // size starting at |offset|, which would run past the allocation - confirm no caller relies
    // on that combination.
    void Buffer::ClearBuffer(CommandRecordingContext* recordingContext,
                             uint32_t clearValue,
                             uint64_t offset,
                             uint64_t size) {
        ASSERT(recordingContext != nullptr);

        const uint64_t fillSize = (size == 0) ? GetAllocatedSize() : size;
        ASSERT(fillSize > 0);

        TransitionUsageNow(recordingContext, wgpu::BufferUsage::CopyDst);

        // VK_WHOLE_SIZE doesn't work on old Windows Intel Vulkan drivers, so we don't use it.
        // Note: Allocated size must be a multiple of 4.
        ASSERT(fillSize % 4 == 0);

        Device* device = ToBackend(GetDevice());
        device->fn.CmdFillBuffer(recordingContext->commandBuffer, mHandle, offset, fillSize,
                                 clearValue);
    }
}  // namespace dawn::native::vulkan