From 5a8d8dad9a890a19ecf4734a28043d5188fdb33b Mon Sep 17 00:00:00 2001 From: Sil Vilerino Date: Fri, 30 Jun 2023 15:37:19 -0400 Subject: [PATCH] d3d12: Video Process - Implement get_processor_fence and async queuing Reviewed-by: Jesse Natalie Part-of: --- .../drivers/d3d12/d3d12_video_proc.cpp | 232 ++++++++++++++---- src/gallium/drivers/d3d12/d3d12_video_proc.h | 21 +- 2 files changed, 210 insertions(+), 43 deletions(-) diff --git a/src/gallium/drivers/d3d12/d3d12_video_proc.cpp b/src/gallium/drivers/d3d12/d3d12_video_proc.cpp index 3aba46dec18..24614aada1a 100644 --- a/src/gallium/drivers/d3d12/d3d12_video_proc.cpp +++ b/src/gallium/drivers/d3d12/d3d12_video_proc.cpp @@ -40,6 +40,25 @@ d3d12_video_processor_begin_frame(struct pipe_video_codec * codec, "fenceValue: %d\n", pD3D12Proc->m_fenceValue); + /// + /// Wait here to make sure the next in flight resource set is empty before using it + /// + uint64_t fenceValueToWaitOn = static_cast(std::max(static_cast(0l), static_cast(pD3D12Proc->m_fenceValue) - static_cast(D3D12_VIDEO_PROC_ASYNC_DEPTH) )); + + debug_printf("[d3d12_video_processor] d3d12_video_processor_begin_frame Waiting for completion of in flight resource sets with previous work with fenceValue: %" PRIu64 "\n", + fenceValueToWaitOn); + + ASSERTED bool wait_res = d3d12_video_processor_sync_completion(codec, fenceValueToWaitOn, OS_TIMEOUT_INFINITE); + assert(wait_res); + + HRESULT hr = pD3D12Proc->m_spCommandList->Reset(pD3D12Proc->m_spCommandAllocators[d3d12_video_processor_pool_current_index(pD3D12Proc)].Get()); + if (FAILED(hr)) { + debug_printf( + "[d3d12_video_processor] resetting ID3D12GraphicsCommandList failed with HR %x\n", + hr); + assert(false); + } + // Setup process frame arguments for output/target texture. 
struct d3d12_video_buffer *pOutputVideoBuffer = (struct d3d12_video_buffer *) target; @@ -159,6 +178,10 @@ d3d12_video_processor_end_frame(struct pipe_video_codec * codec, std::swap(BarrierDesc.Transition.StateBefore, BarrierDesc.Transition.StateAfter); pD3D12Proc->m_spCommandList->ResourceBarrier(static_cast(barrier_transitions.size()), barrier_transitions.data()); + + pD3D12Proc->m_PendingFences[d3d12_video_processor_pool_current_index(pD3D12Proc)].value = pD3D12Proc->m_fenceValue; + pD3D12Proc->m_PendingFences[d3d12_video_processor_pool_current_index(pD3D12Proc)].cmdqueue_fence = pD3D12Proc->m_spFence.Get(); + *picture->fence = (pipe_fence_handle*) &pD3D12Proc->m_PendingFences[d3d12_video_processor_pool_current_index(pD3D12Proc)]; } void @@ -168,6 +191,9 @@ d3d12_video_processor_process_frame(struct pipe_video_codec *codec, { struct d3d12_video_processor * pD3D12Proc = (struct d3d12_video_processor *) codec; + // begin_frame gets only called once so wouldn't update process_properties->src_surface_fence correctly + pD3D12Proc->input_surface_fence = (struct d3d12_fence*) process_properties->src_surface_fence; + // Get the underlying resources from the pipe_video_buffers struct d3d12_video_buffer *pInputVideoBuffer = (struct d3d12_video_buffer *) input_texture; @@ -265,10 +291,17 @@ d3d12_video_processor_destroy(struct pipe_video_codec * codec) if (codec == nullptr) { return; } - d3d12_video_processor_flush(codec); // Flush pending work before destroying. + // Flush pending work before destroying. 
+ struct d3d12_video_processor *pD3D12Proc = (struct d3d12_video_processor *) codec; + + uint64_t curBatchFence = pD3D12Proc->m_fenceValue; + if (pD3D12Proc->m_needsGPUFlush) + { + d3d12_video_processor_flush(codec); + d3d12_video_processor_sync_completion(codec, curBatchFence, OS_TIMEOUT_INFINITE); + } // Call dtor to make ComPtr work - struct d3d12_video_processor * pD3D12Proc = (struct d3d12_video_processor *) codec; delete pD3D12Proc; } @@ -292,10 +325,6 @@ d3d12_video_processor_flush(struct pipe_video_codec * codec) // Make the resources permanently resident for video use d3d12_promote_to_permanent_residency(pD3D12Proc->m_pD3D12Screen, pD3D12Proc->m_OutputArguments.buffer->texture); - // Synchronize against the resources that are going to be read/written to - d3d12_resource_wait_idle(d3d12_context(pD3D12Proc->base.context), - pD3D12Proc->m_OutputArguments.buffer->texture, - true /*wantToWrite*/); for(auto curInput : pD3D12Proc->m_InputBuffers) { @@ -303,10 +332,6 @@ d3d12_video_processor_flush(struct pipe_video_codec * codec) curInput->texture); // Make the resources permanently resident for video use d3d12_promote_to_permanent_residency(pD3D12Proc->m_pD3D12Screen, curInput->texture); - // Synchronize against the resources that are going to be read/written to - d3d12_resource_wait_idle(d3d12_context(pD3D12Proc->base.context), - curInput->texture, - false /*wantToWrite*/); } HRESULT hr = pD3D12Proc->m_pD3D12Screen->dev->GetDeviceRemovedReason(); @@ -333,29 +358,20 @@ d3d12_video_processor_flush(struct pipe_video_codec * codec) goto flush_fail; } + // Flush any work batched in the d3d12_screen and Wait on the m_spCommandQueue + struct pipe_fence_handle *completion_fence = NULL; + pD3D12Proc->base.context->flush(pD3D12Proc->base.context, &completion_fence, PIPE_FLUSH_ASYNC | PIPE_FLUSH_HINT_FINISH); + struct d3d12_fence *casted_completion_fence = d3d12_fence(completion_fence); + pD3D12Proc->m_spCommandQueue->Wait(casted_completion_fence->cmdqueue_fence, 
casted_completion_fence->value); + pD3D12Proc->m_pD3D12Screen->base.fence_reference(&pD3D12Proc->m_pD3D12Screen->base, &completion_fence, NULL); + + struct d3d12_fence *input_surface_fence = pD3D12Proc->input_surface_fence; + if (input_surface_fence) + pD3D12Proc->m_spCommandQueue->Wait(input_surface_fence->cmdqueue_fence, input_surface_fence->value); + ID3D12CommandList *ppCommandLists[1] = { pD3D12Proc->m_spCommandList.Get() }; pD3D12Proc->m_spCommandQueue->ExecuteCommandLists(1, ppCommandLists); pD3D12Proc->m_spCommandQueue->Signal(pD3D12Proc->m_spFence.Get(), pD3D12Proc->m_fenceValue); - pD3D12Proc->m_spFence->SetEventOnCompletion(pD3D12Proc->m_fenceValue, nullptr); - debug_printf("[d3d12_video_processor] d3d12_video_processor_flush - ExecuteCommandLists finished on signal with " - "fenceValue: %d\n", - pD3D12Proc->m_fenceValue); - - hr = pD3D12Proc->m_spCommandAllocator->Reset(); - if (FAILED(hr)) { - debug_printf( - "[d3d12_video_processor] d3d12_video_processor_flush - resetting ID3D12CommandAllocator failed with HR %x\n", - hr); - goto flush_fail; - } - - hr = pD3D12Proc->m_spCommandList->Reset(pD3D12Proc->m_spCommandAllocator.Get()); - if (FAILED(hr)) { - debug_printf( - "[d3d12_video_processor] d3d12_video_processor_flush - resetting ID3D12GraphicsCommandList failed with HR %x\n", - hr); - goto flush_fail; - } // Validate device was not removed hr = pD3D12Proc->m_pD3D12Screen->dev->GetDeviceRemovedReason(); @@ -395,6 +411,7 @@ d3d12_video_processor_create(struct pipe_context *context, const struct pipe_vid // Not using new doesn't call ctor and the initializations in the class declaration are lost struct d3d12_video_processor *pD3D12Proc = new d3d12_video_processor; + pD3D12Proc->m_PendingFences.resize(D3D12_VIDEO_PROC_ASYNC_DEPTH); pD3D12Proc->base = *codec; pD3D12Proc->base.context = context; @@ -405,6 +422,7 @@ d3d12_video_processor_create(struct pipe_context *context, const struct pipe_vid pD3D12Proc->base.process_frame = 
d3d12_video_processor_process_frame; pD3D12Proc->base.end_frame = d3d12_video_processor_end_frame; pD3D12Proc->base.flush = d3d12_video_processor_flush; + pD3D12Proc->base.get_processor_fence = d3d12_video_processor_get_processor_fence; /// @@ -651,22 +669,32 @@ d3d12_video_processor_create_command_objects(struct d3d12_video_processor *pD3D1 return false; } - hr = pD3D12Proc->m_pD3D12Screen->dev->CreateCommandAllocator( - D3D12_COMMAND_LIST_TYPE_VIDEO_PROCESS, - IID_PPV_ARGS(pD3D12Proc->m_spCommandAllocator.GetAddressOf())); + pD3D12Proc->m_spCommandAllocators.resize(D3D12_VIDEO_PROC_ASYNC_DEPTH); + for (uint32_t i = 0; i < pD3D12Proc->m_spCommandAllocators.size() ; i++) { + hr = pD3D12Proc->m_pD3D12Screen->dev->CreateCommandAllocator( + D3D12_COMMAND_LIST_TYPE_VIDEO_PROCESS, + IID_PPV_ARGS(pD3D12Proc->m_spCommandAllocators[i].GetAddressOf())); - if (FAILED(hr)) { - debug_printf("[d3d12_video_processor] d3d12_video_processor_create_command_objects - Call to " - "CreateCommandAllocator failed with HR %x\n", - hr); + if (FAILED(hr)) { + debug_printf("[d3d12_video_processor] d3d12_video_processor_create_command_objects - Call to " + "CreateCommandAllocator failed with HR %x\n", + hr); + return false; + } + } + + ComPtr spD3D12Device4; + if (FAILED(pD3D12Proc->m_pD3D12Screen->dev->QueryInterface( + IID_PPV_ARGS(spD3D12Device4.GetAddressOf())))) { + debug_printf( + "[d3d12_video_processor] d3d12_video_processor_create_processor - D3D12 Device has no ID3D12Device4 support\n"); return false; } - hr = pD3D12Proc->m_pD3D12Screen->dev->CreateCommandList(0, - D3D12_COMMAND_LIST_TYPE_VIDEO_PROCESS, - pD3D12Proc->m_spCommandAllocator.Get(), - nullptr, - IID_PPV_ARGS(pD3D12Proc->m_spCommandList.GetAddressOf())); + hr = spD3D12Device4->CreateCommandList1(0, + D3D12_COMMAND_LIST_TYPE_VIDEO_PROCESS, + D3D12_COMMAND_LIST_FLAG_NONE, + IID_PPV_ARGS(pD3D12Proc->m_spCommandList.GetAddressOf())); if (FAILED(hr)) { debug_printf("[d3d12_video_processor] 
d3d12_video_processor_create_command_objects - Call to CreateCommandList " @@ -711,3 +739,123 @@ d3d12_video_processor_convert_pipe_rotation(enum pipe_video_vpp_orientation orie return result; } + +uint64_t +d3d12_video_processor_pool_current_index(struct d3d12_video_processor *pD3D12Proc) +{ + return pD3D12Proc->m_fenceValue % D3D12_VIDEO_PROC_ASYNC_DEPTH; +} + + +bool +d3d12_video_processor_ensure_fence_finished(struct pipe_video_codec *codec, + uint64_t fenceValueToWaitOn, + uint64_t timeout_ns) +{ + bool wait_result = true; + struct d3d12_video_processor *pD3D12Proc = (struct d3d12_video_processor *) codec; + HRESULT hr = S_OK; + uint64_t completedValue = pD3D12Proc->m_spFence->GetCompletedValue(); + + debug_printf( + "[d3d12_video_processor] d3d12_video_processor_ensure_fence_finished - Waiting for fence (with timeout_ns %" PRIu64 + ") to finish with " + "fenceValue: %" PRIu64 " - Current Fence Completed Value %" PRIu64 "\n", + timeout_ns, + fenceValueToWaitOn, + completedValue); + + if (completedValue < fenceValueToWaitOn) { + + HANDLE event = {}; + int event_fd = 0; + event = d3d12_fence_create_event(&event_fd); + + hr = pD3D12Proc->m_spFence->SetEventOnCompletion(fenceValueToWaitOn, event); + if (FAILED(hr)) { + debug_printf("[d3d12_video_processor] d3d12_video_processor_ensure_fence_finished - SetEventOnCompletion for " + "fenceValue %" PRIu64 " failed with HR %x\n", + fenceValueToWaitOn, + hr); + goto ensure_fence_finished_fail; + } + + wait_result = d3d12_fence_wait_event(event, event_fd, timeout_ns); + d3d12_fence_close_event(event, event_fd); + + debug_printf("[d3d12_video_processor] d3d12_video_processor_ensure_fence_finished - Waiting on fence to be done with " + "fenceValue: %" PRIu64 " - current CompletedValue: %" PRIu64 "\n", + fenceValueToWaitOn, + completedValue); + } else { + debug_printf("[d3d12_video_processor] d3d12_video_processor_ensure_fence_finished - Fence already done with " + "fenceValue: %" PRIu64 " - current CompletedValue: %" 
PRIu64 "\n", + fenceValueToWaitOn, + completedValue); + } + return wait_result; + +ensure_fence_finished_fail: + debug_printf("[d3d12_video_processor] d3d12_video_processor_sync_completion failed for fenceValue: %" PRIu64 "\n", + fenceValueToWaitOn); + assert(false); + return false; +} + +bool +d3d12_video_processor_sync_completion(struct pipe_video_codec *codec, uint64_t fenceValueToWaitOn, uint64_t timeout_ns) +{ + struct d3d12_video_processor *pD3D12Proc = (struct d3d12_video_processor *) codec; + assert(pD3D12Proc); + assert(pD3D12Proc->m_spD3D12VideoDevice); + assert(pD3D12Proc->m_spCommandQueue); + HRESULT hr = S_OK; + + ASSERTED bool wait_result = d3d12_video_processor_ensure_fence_finished(codec, fenceValueToWaitOn, timeout_ns); + assert(wait_result); + + hr = + pD3D12Proc->m_spCommandAllocators[fenceValueToWaitOn % D3D12_VIDEO_PROC_ASYNC_DEPTH]->Reset(); + if (FAILED(hr)) { + debug_printf("m_spCommandAllocator->Reset() failed with %x.\n", hr); + goto sync_with_token_fail; + } + + // Validate device was not removed + hr = pD3D12Proc->m_pD3D12Screen->dev->GetDeviceRemovedReason(); + if (hr != S_OK) { + debug_printf("[d3d12_video_processor] d3d12_video_processor_sync_completion" + " - D3D12Device was removed AFTER d3d12_video_processor_ensure_fence_finished " + "execution with HR %x, but wasn't before.\n", + hr); + goto sync_with_token_fail; + } + + debug_printf( + "[d3d12_video_processor] d3d12_video_processor_sync_completion - GPU execution finalized for fenceValue: %" PRIu64 + "\n", + fenceValueToWaitOn); + + return wait_result; + +sync_with_token_fail: + debug_printf("[d3d12_video_processor] d3d12_video_processor_sync_completion failed for fenceValue: %" PRIu64 "\n", + fenceValueToWaitOn); + assert(false); + return false; +} + +int d3d12_video_processor_get_processor_fence(struct pipe_video_codec *codec, + struct pipe_fence_handle *fence, + uint64_t timeout) +{ + struct d3d12_fence *fenceValueToWaitOn = (struct d3d12_fence *) fence; + 
assert(fenceValueToWaitOn); + + ASSERTED bool wait_res = d3d12_video_processor_sync_completion(codec, fenceValueToWaitOn->value, timeout); + + // Return semantics based on p_video_codec interface + // ret == 0 -> work in progress + // ret != 0 -> work completed + return wait_res ? 1 : 0; +} diff --git a/src/gallium/drivers/d3d12/d3d12_video_proc.h b/src/gallium/drivers/d3d12/d3d12_video_proc.h index 92eaad6edd4..5631a71982e 100644 --- a/src/gallium/drivers/d3d12/d3d12_video_proc.h +++ b/src/gallium/drivers/d3d12/d3d12_video_proc.h @@ -109,7 +109,8 @@ struct d3d12_video_processor std::vector m_inputStreamDescs; ComPtr m_spVideoProcessor; ComPtr m_spCommandQueue; - ComPtr m_spCommandAllocator; + std::vector> m_spCommandAllocators; + std::vector m_PendingFences; ComPtr m_spCommandList; std::vector m_transitionsBeforeCloseCmdList; @@ -123,6 +124,8 @@ struct d3d12_video_processor bool m_needsGPUFlush = false; D3D12_FEATURE_DATA_VIDEO_PROCESS_MAX_INPUT_STREAMS m_vpMaxInputStreams = { }; + + struct d3d12_fence* input_surface_fence = NULL; }; struct pipe_video_codec * @@ -141,6 +144,22 @@ d3d12_video_processor_create_command_objects(struct d3d12_video_processor *pD3D1 D3D12_VIDEO_PROCESS_ORIENTATION d3d12_video_processor_convert_pipe_rotation(enum pipe_video_vpp_orientation orientation); +bool +d3d12_video_processor_ensure_fence_finished(struct pipe_video_codec *codec, uint64_t fenceValueToWaitOn, uint64_t timeout_ns); + +bool +d3d12_video_processor_sync_completion(struct pipe_video_codec *codec, uint64_t fenceValueToWaitOn, uint64_t timeout_ns); + +uint64_t +d3d12_video_processor_pool_current_index(struct d3d12_video_processor *codec); + +int d3d12_video_processor_get_processor_fence(struct pipe_video_codec *codec, + struct pipe_fence_handle *fence, + uint64_t timeout); + +// We need enough depth so that the next item in the pipeline doesn't ask for a fence value we lost +const uint64_t D3D12_VIDEO_PROC_ASYNC_DEPTH = 36; + /// /// d3d12_video_processor functions ends ///