// Copyright 2018 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dawn_native/metal/DeviceMTL.h"

#include "common/GPUInfo.h"
#include "common/Platform.h"
#include "dawn_native/BackendConnection.h"
#include "dawn_native/BindGroupLayout.h"
#include "dawn_native/Commands.h"
#include "dawn_native/ErrorData.h"
#include "dawn_native/metal/BindGroupLayoutMTL.h"
#include "dawn_native/metal/BindGroupMTL.h"
#include "dawn_native/metal/BufferMTL.h"
#include "dawn_native/metal/CommandBufferMTL.h"
#include "dawn_native/metal/ComputePipelineMTL.h"
#include "dawn_native/metal/PipelineLayoutMTL.h"
#include "dawn_native/metal/QuerySetMTL.h"
#include "dawn_native/metal/QueueMTL.h"
#include "dawn_native/metal/RenderPipelineMTL.h"
#include "dawn_native/metal/SamplerMTL.h"
#include "dawn_native/metal/ShaderModuleMTL.h"
#include "dawn_native/metal/StagingBufferMTL.h"
#include "dawn_native/metal/SwapChainMTL.h"
#include "dawn_native/metal/TextureMTL.h"
#include "dawn_native/metal/UtilsMetal.h"
#include "dawn_platform/DawnPlatform.h"
#include "dawn_platform/tracing/TraceEvent.h"

#include <type_traits>

namespace dawn::native::metal {

namespace {

// The time interval between two rounds of the Kalman filter (100ms, expressed in nanoseconds).
static constexpr uint64_t kFilterIntervalInMs = static_cast<uint64_t>(NSEC_PER_SEC / 10);

struct KalmanInfo {
    float filterValue;  // The estimated value
    float kalmanGain;   // The Kalman gain
    float R;            // The covariance of the observation noise
    float P;            // The a posteriori estimate covariance
};

// A simplified Kalman filter for estimating the timestamp period from measured values.
float KalmanFilter(KalmanInfo* info, float measuredValue) {
    // Compute the Kalman gain.
    info->kalmanGain = info->P / (info->P + info->R);

    // Correct the filter value.
    info->filterValue =
        info->kalmanGain * measuredValue + (1.0f - info->kalmanGain) * info->filterValue;

    // Update the estimate covariance.
    info->P = (1.0f - info->kalmanGain) * info->P;
    return info->filterValue;
}

void API_AVAILABLE(macos(10.15), ios(14))
    UpdateTimestampPeriod(id<MTLDevice> device,
                          KalmanInfo* info,
                          MTLTimestamp* cpuTimestampStart,
                          MTLTimestamp* gpuTimestampStart,
                          float* timestampPeriod) {
    // The filter value has converged to an optimal value when the Kalman gain is less than
    // 0.01. At that point the weight of the measured value is too small to change the next
    // filter value, so sampling and recalculation no longer need to continue.
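    // Note: per Apple's documentation for -[MTLDevice sampleTimestamps:gpuTimestamp:], the CPU
    // timestamps are reported in nanoseconds and the GPU timestamps in GPU ticks, so the
    // measurement below approximates nanoseconds per GPU tick.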
    if (info->kalmanGain < 0.01f) {
        return;
    }

    MTLTimestamp cpuTimestampEnd = 0, gpuTimestampEnd = 0;
    [device sampleTimestamps:&cpuTimestampEnd gpuTimestamp:&gpuTimestampEnd];

    // Update the timestamp start values when a timestamp reset happens.
    if (cpuTimestampEnd < *cpuTimestampStart || gpuTimestampEnd < *gpuTimestampStart) {
        *cpuTimestampStart = cpuTimestampEnd;
        *gpuTimestampStart = gpuTimestampEnd;
        return;
    }

    if (cpuTimestampEnd - *cpuTimestampStart >= kFilterIntervalInMs) {
        // The measured timestamp period.
        float measurement = (cpuTimestampEnd - *cpuTimestampStart) /
                            static_cast<float>(gpuTimestampEnd - *gpuTimestampStart);

        // Measurement update.
        *timestampPeriod = KalmanFilter(info, measurement);

        *cpuTimestampStart = cpuTimestampEnd;
        *gpuTimestampStart = gpuTimestampEnd;
    }
}

}  // namespace

// static
ResultOrError<Ref<Device>> Device::Create(AdapterBase* adapter,
                                          NSPRef<id<MTLDevice>> mtlDevice,
                                          const DeviceDescriptor* descriptor) {
    Ref<Device> device = AcquireRef(new Device(adapter, std::move(mtlDevice), descriptor));
    DAWN_TRY(device->Initialize());
    return device;
}

Device::Device(AdapterBase* adapter,
               NSPRef<id<MTLDevice>> mtlDevice,
               const DeviceDescriptor* descriptor)
    : DeviceBase(adapter, descriptor), mMtlDevice(std::move(mtlDevice)), mCompletedSerial(0) {
}

Device::~Device() {
    Destroy();
}

MaybeError Device::Initialize() {
    InitTogglesFromDriver();

    mCommandQueue.Acquire([*mMtlDevice newCommandQueue]);
    if (mCommandQueue == nil) {
        return DAWN_INTERNAL_ERROR("Failed to allocate MTLCommandQueue.");
    }

    DAWN_TRY(mCommandContext.PrepareNextCommandBuffer(*mCommandQueue));

    if (IsFeatureEnabled(Feature::TimestampQuery)) {
        // Make a best guess of the timestamp period based on the device vendor, and let the
        // Kalman filter in UpdateTimestampPeriod converge it to an accurate value.
        mTimestampPeriod = gpu_info::IsIntel(GetAdapter()->GetVendorId()) ? 83.333f : 1.0f;

        // Initialize the Kalman filter parameters.
        mKalmanInfo = std::make_unique<KalmanInfo>();
        mKalmanInfo->filterValue = 0.0f;
        mKalmanInfo->kalmanGain = 0.5f;
        mKalmanInfo->R = 0.0001f;  // The smaller this value is, the smaller the error of the
                                   // measured value is, and the more we can trust it.
        mKalmanInfo->P = 1.0f;

        if (@available(macOS 10.15, iOS 14.0, *)) {
            // Sample the CPU and GPU timestamps for the first time at device creation.
            [*mMtlDevice sampleTimestamps:&mCpuTimestamp gpuTimestamp:&mGpuTimestamp];
        }
    }

    return DeviceBase::Initialize(new Queue(this));
}

void Device::InitTogglesFromDriver() {
    {
        bool haveStoreAndMSAAResolve = false;
#if defined(DAWN_PLATFORM_MACOS)
        if (@available(macOS 10.12, *)) {
            haveStoreAndMSAAResolve =
                [*mMtlDevice supportsFeatureSet:MTLFeatureSet_macOS_GPUFamily1_v2];
        }
#elif defined(DAWN_PLATFORM_IOS)
        haveStoreAndMSAAResolve =
            [*mMtlDevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v2];
#endif
        // On tvOS, we would need MTLFeatureSet_tvOS_GPUFamily2_v1.
        SetToggle(Toggle::EmulateStoreAndMSAAResolve, !haveStoreAndMSAAResolve);

        bool haveSamplerCompare = true;
#if defined(DAWN_PLATFORM_IOS)
        haveSamplerCompare = [*mMtlDevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v1];
#endif
        // TODO(crbug.com/dawn/342): Investigate emulation -- possibly expensive.
        SetToggle(Toggle::MetalDisableSamplerCompare, !haveSamplerCompare);

        bool haveBaseVertexBaseInstance = true;
#if defined(DAWN_PLATFORM_IOS)
        haveBaseVertexBaseInstance =
            [*mMtlDevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v1];
#endif
        // TODO(crbug.com/dawn/343): Investigate emulation.
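        // The same feature-set query covers both base vertex and base instance support, so a
        // single capability check drives both toggles below.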
        SetToggle(Toggle::DisableBaseVertex, !haveBaseVertexBaseInstance);
        SetToggle(Toggle::DisableBaseInstance, !haveBaseVertexBaseInstance);
    }

    // Vertex buffer robustness is implemented by using programmable vertex pulling. Enable
    // that code path if it isn't explicitly disabled.
    if (IsRobustnessEnabled()) {
        SetToggle(Toggle::MetalEnableVertexPulling, true);
    }

    // TODO(crbug.com/dawn/846): tighten this workaround when the driver bug is fixed.
    SetToggle(Toggle::AlwaysResolveIntoZeroLevelAndLayer, true);

    uint32_t deviceId = GetAdapter()->GetDeviceId();
    uint32_t vendorId = GetAdapter()->GetVendorId();

    // TODO(crbug.com/dawn/847): Use MTLStorageModeShared instead of MTLStorageModePrivate when
    // creating MTLCounterSampleBuffer in QuerySet on Intel platforms, otherwise creating the
    // buffer fails. Change back to MTLStorageModePrivate when the bug is fixed.
    if (@available(macOS 10.15, iOS 14.0, *)) {
        bool useSharedMode = gpu_info::IsIntel(vendorId);
        SetToggle(Toggle::MetalUseSharedModeForCounterSampleBuffer, useSharedMode);
    }

    // TODO(crbug.com/dawn/1071): r8unorm and rg8unorm textures with multiple mip levels don't
    // clear properly on Intel Macs.
    if (gpu_info::IsIntel(vendorId)) {
        SetToggle(Toggle::DisableR8RG8Mipmaps, true);
    }

    // On some Intel GPUs, vertex-only render pipelines produce wrong depth results when no
    // fragment shader is provided. Create a dummy fragment shader module to work around this
    // issue.
    if (gpu_info::IsIntel(vendorId)) {
        bool useDummyFragmentShader = true;
        if (gpu_info::IsSkylake(deviceId)) {
            useDummyFragmentShader = false;
        }
        SetToggle(Toggle::UseDummyFragmentInVertexOnlyPipeline, useDummyFragmentShader);
    }
}

ResultOrError<Ref<BindGroupBase>> Device::CreateBindGroupImpl(
    const BindGroupDescriptor* descriptor) {
    return BindGroup::Create(this, descriptor);
}

ResultOrError<Ref<BindGroupLayoutBase>> Device::CreateBindGroupLayoutImpl(
    const BindGroupLayoutDescriptor* descriptor,
    PipelineCompatibilityToken pipelineCompatibilityToken) {
    return BindGroupLayout::Create(this, descriptor, pipelineCompatibilityToken);
}

ResultOrError<Ref<BufferBase>> Device::CreateBufferImpl(const BufferDescriptor* descriptor) {
    return Buffer::Create(this, descriptor);
}

ResultOrError<Ref<CommandBufferBase>> Device::CreateCommandBuffer(
    CommandEncoder* encoder,
    const CommandBufferDescriptor* descriptor) {
    return CommandBuffer::Create(encoder, descriptor);
}

Ref<ComputePipelineBase> Device::CreateUninitializedComputePipelineImpl(
    const ComputePipelineDescriptor* descriptor) {
    return ComputePipeline::CreateUninitialized(this, descriptor);
}

ResultOrError<Ref<PipelineLayoutBase>> Device::CreatePipelineLayoutImpl(
    const PipelineLayoutDescriptor* descriptor) {
    return PipelineLayout::Create(this, descriptor);
}

ResultOrError<Ref<QuerySetBase>> Device::CreateQuerySetImpl(
    const QuerySetDescriptor* descriptor) {
    return QuerySet::Create(this, descriptor);
}

Ref<RenderPipelineBase> Device::CreateUninitializedRenderPipelineImpl(
    const RenderPipelineDescriptor* descriptor) {
    return RenderPipeline::CreateUninitialized(this, descriptor);
}

ResultOrError<Ref<SamplerBase>> Device::CreateSamplerImpl(const SamplerDescriptor* descriptor) {
    return Sampler::Create(this, descriptor);
}

ResultOrError<Ref<ShaderModuleBase>> Device::CreateShaderModuleImpl(
    const ShaderModuleDescriptor* descriptor,
    ShaderModuleParseResult* parseResult) {
    return ShaderModule::Create(this, descriptor, parseResult);
}

ResultOrError<Ref<SwapChainBase>> Device::CreateSwapChainImpl(
    const SwapChainDescriptor* descriptor) {
    return OldSwapChain::Create(this, descriptor);
}

ResultOrError<Ref<NewSwapChainBase>> Device::CreateSwapChainImpl(
    Surface* surface,
    NewSwapChainBase* previousSwapChain,
    const SwapChainDescriptor* descriptor) {
    return SwapChain::Create(this, surface, previousSwapChain, descriptor);
}
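// Like the factories above, the texture creation methods below are the backend implementations
// of the frontend's virtual creation hooks; they simply forward to the Metal-backend types.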
ResultOrError<Ref<TextureBase>> Device::CreateTextureImpl(const TextureDescriptor* descriptor) {
    return Texture::Create(this, descriptor);
}

ResultOrError<Ref<TextureViewBase>> Device::CreateTextureViewImpl(
    TextureBase* texture,
    const TextureViewDescriptor* descriptor) {
    return TextureView::Create(texture, descriptor);
}

void Device::InitializeComputePipelineAsyncImpl(Ref<ComputePipelineBase> computePipeline,
                                                WGPUCreateComputePipelineAsyncCallback callback,
                                                void* userdata) {
    ComputePipeline::InitializeAsync(std::move(computePipeline), callback, userdata);
}

void Device::InitializeRenderPipelineAsyncImpl(Ref<RenderPipelineBase> renderPipeline,
                                               WGPUCreateRenderPipelineAsyncCallback callback,
                                               void* userdata) {
    RenderPipeline::InitializeAsync(std::move(renderPipeline), callback, userdata);
}

ResultOrError<ExecutionSerial> Device::CheckAndUpdateCompletedSerials() {
    uint64_t frontendCompletedSerial{GetCompletedCommandSerial()};
    if (frontendCompletedSerial > mCompletedSerial) {
        // Sometimes the serials are artificially increased, in which case the completed serial
        // in the device base surpasses the completed serial tracked in the Metal backend, so
        // ours must be updated when the one from the device base has increased.
        mCompletedSerial = frontendCompletedSerial;
    }
    return ExecutionSerial(mCompletedSerial.load());
}

MaybeError Device::TickImpl() {
    DAWN_TRY(SubmitPendingCommandBuffer());

    // Only run the timestamp period calculation when the timestamp feature is enabled.
    if (IsFeatureEnabled(Feature::TimestampQuery)) {
        if (@available(macOS 10.15, iOS 14.0, *)) {
            UpdateTimestampPeriod(GetMTLDevice(), mKalmanInfo.get(), &mCpuTimestamp,
                                  &mGpuTimestamp, &mTimestampPeriod);
        }
    }

    return {};
}

id<MTLDevice> Device::GetMTLDevice() {
    return mMtlDevice.Get();
}

id<MTLCommandQueue> Device::GetMTLQueue() {
    return mCommandQueue.Get();
}

CommandRecordingContext* Device::GetPendingCommandContext() {
    mCommandContext.MarkUsed();
    return &mCommandContext;
}

MaybeError Device::SubmitPendingCommandBuffer() {
    if (!mCommandContext.WasUsed()) {
        return {};
    }

    IncrementLastSubmittedCommandSerial();

    // Acquire the pending command buffer, which is retained. It must be released later.
    NSPRef<id<MTLCommandBuffer>> pendingCommands = mCommandContext.AcquireCommands();

    // Replace mLastSubmittedCommands with the mutex held so we avoid races between the
    // schedule handler and this code.
    {
        std::lock_guard<std::mutex> lock(mLastSubmittedCommandsMutex);
        mLastSubmittedCommands = pendingCommands;
    }

    // Make a local copy of the pointer to the commands because it's not clear how ObjC blocks
    // handle types with copy / move constructors being referenced in the block.
    id<MTLCommandBuffer> pendingCommandsPointer = pendingCommands.Get();
    [*pendingCommands addScheduledHandler:^(id<MTLCommandBuffer>) {
        // This is data-race free because we hold the mutex for mLastSubmittedCommands and
        // pendingCommandsPointer is a local value (and not the member itself).
        std::lock_guard<std::mutex> lock(mLastSubmittedCommandsMutex);
        if (this->mLastSubmittedCommands.Get() == pendingCommandsPointer) {
            this->mLastSubmittedCommands = nullptr;
        }
    }];

    // Update the completed serial once the completed handler is fired. Make a local copy of
    // mLastSubmittedSerial so it is captured by value.
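    // The completed handler runs on a different thread (see the comment below), while
    // mCompletedSerial is also read from CheckAndUpdateCompletedSerials above, which is why it
    // is accessed through atomic loads and stores.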
    ExecutionSerial pendingSerial = GetLastSubmittedCommandSerial();
    // This ObjC block runs on a different thread.
    [*pendingCommands addCompletedHandler:^(id<MTLCommandBuffer>) {
        TRACE_EVENT_ASYNC_END0(GetPlatform(), GPUWork, "DeviceMTL::SubmitPendingCommandBuffer",
                               uint64_t(pendingSerial));
        ASSERT(uint64_t(pendingSerial) > mCompletedSerial.load());
        this->mCompletedSerial = uint64_t(pendingSerial);
    }];

    TRACE_EVENT_ASYNC_BEGIN0(GetPlatform(), GPUWork, "DeviceMTL::SubmitPendingCommandBuffer",
                             uint64_t(pendingSerial));
    [*pendingCommands commit];

    return mCommandContext.PrepareNextCommandBuffer(*mCommandQueue);
}

ResultOrError<std::unique_ptr<StagingBufferBase>> Device::CreateStagingBuffer(size_t size) {
    std::unique_ptr<StagingBufferBase> stagingBuffer =
        std::make_unique<StagingBuffer>(size, this);
    DAWN_TRY(stagingBuffer->Initialize());
    return std::move(stagingBuffer);
}

MaybeError Device::CopyFromStagingToBuffer(StagingBufferBase* source,
                                           uint64_t sourceOffset,
                                           BufferBase* destination,
                                           uint64_t destinationOffset,
                                           uint64_t size) {
    // The Metal validation layers forbid 0-sized copies; assert that they are skipped before
    // this function is called.
    ASSERT(size != 0);

    ToBackend(destination)
        ->EnsureDataInitializedAsDestination(GetPendingCommandContext(), destinationOffset,
                                             size);

    id<MTLBuffer> uploadBuffer = ToBackend(source)->GetBufferHandle();
    id<MTLBuffer> buffer = ToBackend(destination)->GetMTLBuffer();
    [GetPendingCommandContext()->EnsureBlit() copyFromBuffer:uploadBuffer
                                                sourceOffset:sourceOffset
                                                    toBuffer:buffer
                                           destinationOffset:destinationOffset
                                                        size:size];
    return {};
}

// In Metal we don't write from the CPU to the texture directly, which could be done with
// replaceRegion, because that function requires a non-private storage mode and Dawn sets the
// private storage mode by default for all textures except IOSurfaces on macOS.
MaybeError Device::CopyFromStagingToTexture(const StagingBufferBase* source,
                                            const TextureDataLayout& dataLayout,
                                            TextureCopy* dst,
                                            const Extent3D& copySizePixels) {
    Texture* texture = ToBackend(dst->texture.Get());
    EnsureDestinationTextureInitialized(GetPendingCommandContext(), texture, *dst,
                                        copySizePixels);

    RecordCopyBufferToTexture(GetPendingCommandContext(), ToBackend(source)->GetBufferHandle(),
                              source->GetSize(), dataLayout.offset, dataLayout.bytesPerRow,
                              dataLayout.rowsPerImage, texture, dst->mipLevel, dst->origin,
                              dst->aspect, copySizePixels);
    return {};
}

Ref<TextureBase> Device::CreateTextureWrappingIOSurface(const ExternalImageDescriptor* descriptor,
                                                        IOSurfaceRef ioSurface,
                                                        uint32_t plane) {
    const TextureDescriptor* textureDescriptor = FromAPI(descriptor->cTextureDescriptor);
    if (ConsumedError(ValidateTextureDescriptor(this, textureDescriptor))) {
        return nullptr;
    }
    if (ConsumedError(
            ValidateIOSurfaceCanBeWrapped(this, textureDescriptor, ioSurface, plane))) {
        return nullptr;
    }

    Ref<Texture> result;
    if (ConsumedError(Texture::CreateFromIOSurface(this, descriptor, ioSurface, plane),
                      &result)) {
        return nullptr;
    }
    return result;
}

void Device::WaitForCommandsToBeScheduled() {
    if (ConsumedError(SubmitPendingCommandBuffer())) {
        return;
    }

    // Only lock the object while we take a reference to it, otherwise we could block further
    // progress if the driver calls the scheduled handler (which also acquires the lock) before
    // finishing the waitUntilScheduled.
    NSPRef<id<MTLCommandBuffer>> lastSubmittedCommands;
    {
        std::lock_guard<std::mutex> lock(mLastSubmittedCommandsMutex);
        lastSubmittedCommands = mLastSubmittedCommands;
    }
    [*lastSubmittedCommands waitUntilScheduled];
}

MaybeError Device::WaitForIdleForDestruction() {
    // Forget all pending commands.
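    // Dropping the NSPRef returned by AcquireCommands() releases the recorded but unsubmitted
    // command buffer without committing it, so the loop below only waits on work that was
    // already submitted.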
    mCommandContext.AcquireCommands();

    DAWN_TRY(CheckPassedSerials());

    // Wait for all commands to be finished so we can free resources.
    while (GetCompletedCommandSerial() != GetLastSubmittedCommandSerial()) {
        usleep(100);
        DAWN_TRY(CheckPassedSerials());
    }

    return {};
}

void Device::DestroyImpl() {
    ASSERT(GetState() == State::Disconnected);

    // Forget all pending commands.
    mCommandContext.AcquireCommands();

    mCommandQueue = nullptr;
    mMtlDevice = nullptr;
}

uint32_t Device::GetOptimalBytesPerRowAlignment() const {
    return 1;
}

uint64_t Device::GetOptimalBufferToTextureCopyOffsetAlignment() const {
    return 1;
}

float Device::GetTimestampPeriodInNS() const {
    return mTimestampPeriod;
}

}  // namespace dawn::native::metal