/*
 * D3D12 Hardware-Accelerated Motion Estimation Filter
 *
 * Copyright (c) 2025 Advanced Micro Devices, Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/buffer.h"
#include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_d3d12va_internal.h"
#include "libavutil/hwcontext_d3d12va.h"
#include "libavutil/internal.h"
#include "libavutil/opt.h"
#include "libavutil/motion_vector.h"
#include "libavutil/mem.h"

#include "avfilter.h"
#include "filters.h"
#include "video.h"

/**
 * Private state of the mestimate_d3d12 filter.
 *
 * All D3D12 objects are created in config_props() (through the create_*
 * helpers) and released in uninit().
 */
typedef struct MEstimateD3D12Context {
    const AVClass *class;

    AVBufferRef *hw_device_ref;
    AVBufferRef *hw_frames_ref;

    AVD3D12VADeviceContext *device_ctx;
    AVD3D12VAFramesContext *frames_ctx;

    ID3D12Device *device;
    ID3D12VideoDevice1 *video_device;   // obtained via QueryInterface, owned by us

    ID3D12VideoMotionEstimator *motion_estimator;
    ID3D12VideoMotionVectorHeap *motion_vector_heap;

    ID3D12VideoEncodeCommandList *command_list;
    ID3D12CommandQueue *command_queue;
    ID3D12CommandAllocator *command_allocator;

    // Graphics command list and queue for copy operations
    ID3D12GraphicsCommandList *copy_command_list;
    ID3D12CommandAllocator *copy_command_allocator;
    ID3D12CommandQueue *copy_command_queue;

    // Synchronization
    ID3D12Fence *fence;
    HANDLE fence_event;
    uint64_t fence_value;

    // Motion estimation parameters
    int block_size; // 8 or 16
    D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE d3d12_block_size;
    D3D12_VIDEO_MOTION_ESTIMATOR_VECTOR_PRECISION precision;

    // Frame buffer
    AVFrame *prev_frame;
    AVFrame *cur_frame;
    AVFrame *next_frame;

    // Output textures for resolved motion vectors (GPU-side, DEFAULT heap)
    ID3D12Resource *resolved_mv_texture_back;
    ID3D12Resource *resolved_mv_texture_fwd;

    // Readback buffers for CPU access (READBACK heap)
    ID3D12Resource *readback_buffer_back;
    ID3D12Resource *readback_buffer_fwd;
    size_t readback_buffer_size;
    // Byte distance between two rows of motion vectors inside a readback
    // buffer, as reported by GetCopyableFootprints() for the MV texture.
    UINT readback_row_pitch;

    int initialized;
} MEstimateD3D12Context;

/**
 * Filter init callback: validate the user options and translate them to
 * their D3D12 equivalents. No D3D12 object is created here.
 *
 * @return 0 on success, AVERROR(EINVAL) if block_size is neither 8 nor 16
 */
static int mestimate_d3d12_init(AVFilterContext *ctx)
{
    MEstimateD3D12Context *s = ctx->priv;

    s->initialized = 0;
    s->fence_value = 0;

    // Validate block size - only 8 and 16 are valid
    if (s->block_size != 8 && s->block_size != 16) {
        av_log(ctx, AV_LOG_ERROR, "Invalid block_size %d. Only 8 and 16 are supported.\n", s->block_size);
        return AVERROR(EINVAL);
    }

    // Set D3D12 block size based on user option
    if (s->block_size == 8)
        s->d3d12_block_size = D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_8X8;
    else
        s->d3d12_block_size = D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_16X16;

    // Use quarter-pel precision
    s->precision = D3D12_VIDEO_MOTION_ESTIMATOR_VECTOR_PRECISION_QUARTER_PEL;

    return 0;
}

/**
 * Create the fence, fence event and the VIDEO_ENCODE queue/allocator/list
 * used to record the motion estimation commands. The command list is left
 * in the closed state so that it can be Reset() for every frame.
 *
 * Objects created before a failure stay in the context and are released by
 * mestimate_d3d12_uninit().
 *
 * @return 0 on success, AVERROR(EINVAL) on any D3D12/Win32 failure
 */
static int mestimate_d3d12_create_objects(AVFilterContext *ctx)
{
    MEstimateD3D12Context *s = ctx->priv;
    HRESULT hr;

    D3D12_COMMAND_QUEUE_DESC queue_desc = {
        .Type     = D3D12_COMMAND_LIST_TYPE_VIDEO_ENCODE,
        .Priority = 0,
        .Flags    = D3D12_COMMAND_QUEUE_FLAG_NONE,
        .NodeMask = 0,
    };

    // Create fence for synchronization
    hr = ID3D12Device_CreateFence(s->device, 0, D3D12_FENCE_FLAG_NONE,
                                  &IID_ID3D12Fence, (void **)&s->fence);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create fence\n");
        return AVERROR(EINVAL);
    }

    s->fence_event = CreateEvent(NULL, FALSE, FALSE, NULL);
    if (!s->fence_event) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create fence event\n");
        return AVERROR(EINVAL);
    }

    // Create command queue
    hr = ID3D12Device_CreateCommandQueue(s->device, &queue_desc,
                                         &IID_ID3D12CommandQueue, (void **)&s->command_queue);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create command queue\n");
        return AVERROR(EINVAL);
    }

    // Create command allocator
    hr = ID3D12Device_CreateCommandAllocator(s->device, D3D12_COMMAND_LIST_TYPE_VIDEO_ENCODE,
                                             &IID_ID3D12CommandAllocator, (void **)&s->command_allocator);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create command allocator\n");
        return AVERROR(EINVAL);
    }

    // Create command list
    hr = ID3D12Device_CreateCommandList(s->device, 0, D3D12_COMMAND_LIST_TYPE_VIDEO_ENCODE,
                                        s->command_allocator, NULL,
                                        &IID_ID3D12VideoEncodeCommandList, (void **)&s->command_list);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create command list\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12VideoEncodeCommandList_Close(s->command_list);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to close command list\n");
        return AVERROR(EINVAL);
    }

    return 0;
}

/**
 * Query motion estimation support for the given frame size and create the
 * estimator, the motion vector heap, the resolved MV textures, the readback
 * buffers and the DIRECT queue/allocator/list used for the readback copies.
 *
 * @param width  input frame width in pixels
 * @param height input frame height in pixels
 * @return 0 on success, AVERROR(ENOSYS) if the device lacks the requested
 *         block size or quarter-pel precision, AVERROR(EINVAL) on any other
 *         D3D12 failure
 */
static int mestimate_d3d12_create_motion_estimator(AVFilterContext *ctx, int width, int height)
{
    MEstimateD3D12Context *s = ctx->priv;
    HRESULT hr;
    D3D12_FEATURE_DATA_VIDEO_MOTION_ESTIMATOR feature_data = {0};
    D3D12_VIDEO_MOTION_ESTIMATOR_DESC me_desc = {0};
    D3D12_VIDEO_MOTION_VECTOR_HEAP_DESC heap_desc = {0};

    // Check if motion estimation is supported
    // Set the input parameters for what we want to query
    feature_data.NodeIndex = 0;
    feature_data.InputFormat = s->frames_ctx->format;
    feature_data.BlockSizeFlags = 0; // Will be filled by CheckFeatureSupport with supported flags
    feature_data.PrecisionFlags = 0; // Will be filled by CheckFeatureSupport with supported flags
    feature_data.SizeRange.MaxWidth = width;
    feature_data.SizeRange.MaxHeight = height;
    feature_data.SizeRange.MinWidth = width;
    feature_data.SizeRange.MinHeight = height;

    hr = ID3D12VideoDevice1_CheckFeatureSupport(s->video_device, D3D12_FEATURE_VIDEO_MOTION_ESTIMATOR,
                                                &feature_data, sizeof(feature_data));
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to check motion estimator support (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    // Verify the requested features are actually supported (check returned flags)
    D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_FLAGS requested_block_flag =
        (s->d3d12_block_size == D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_8X8) ?
        D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_FLAG_8X8 :
        D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_FLAG_16X16;

    if (!(feature_data.BlockSizeFlags & requested_block_flag)) {
        av_log(ctx, AV_LOG_ERROR, "Requested block size (%dx%d) not supported by device (supported flags: 0x%x)\n",
               s->block_size, s->block_size, (unsigned)feature_data.BlockSizeFlags);
        return AVERROR(ENOSYS);
    }

    if (!(feature_data.PrecisionFlags & D3D12_VIDEO_MOTION_ESTIMATOR_VECTOR_PRECISION_FLAG_QUARTER_PEL)) {
        av_log(ctx, AV_LOG_ERROR, "Quarter-pel precision not supported by device (supported flags: 0x%x)\n",
               (unsigned)feature_data.PrecisionFlags);
        return AVERROR(ENOSYS);
    }

    av_log(ctx, AV_LOG_VERBOSE, "Motion estimator support confirmed: block_size=%dx%d, precision=quarter-pel\n",
           s->block_size, s->block_size);

    // Create motion estimator
    me_desc.NodeMask = 0;
    me_desc.InputFormat = s->frames_ctx->format;
    me_desc.BlockSize = s->d3d12_block_size;
    me_desc.Precision = s->precision;
    me_desc.SizeRange = feature_data.SizeRange;

    hr = ID3D12VideoDevice1_CreateVideoMotionEstimator(s->video_device, &me_desc, NULL,
                                                       &IID_ID3D12VideoMotionEstimator,
                                                       (void **)&s->motion_estimator);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create motion estimator\n");
        return AVERROR(EINVAL);
    }

    // Create motion vector heap
    heap_desc.NodeMask = 0;
    heap_desc.InputFormat = s->frames_ctx->format;
    heap_desc.BlockSize = s->d3d12_block_size;
    heap_desc.Precision = s->precision;
    heap_desc.SizeRange = feature_data.SizeRange;

    hr = ID3D12VideoDevice1_CreateVideoMotionVectorHeap(s->video_device, &heap_desc, NULL,
                                                        &IID_ID3D12VideoMotionVectorHeap,
                                                        (void **)&s->motion_vector_heap);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create motion vector heap\n");
        return AVERROR(EINVAL);
    }

    // Create resolved motion vector textures in DEFAULT heap (GPU writable)
    // ResolveMotionVectorHeap outputs to TEXTURE2D with DXGI_FORMAT_R16G16_SINT
    int mb_width  = (width  + s->block_size - 1) / s->block_size;
    int mb_height = (height + s->block_size - 1) / s->block_size;

    D3D12_HEAP_PROPERTIES heap_props_default = {.Type = D3D12_HEAP_TYPE_DEFAULT};
    D3D12_RESOURCE_DESC texture_desc = {
        .Dimension        = D3D12_RESOURCE_DIMENSION_TEXTURE2D,
        .Alignment        = 0,
        .Width            = mb_width,
        .Height           = mb_height,
        .DepthOrArraySize = 1,
        .MipLevels        = 1,
        .Format           = DXGI_FORMAT_R16G16_SINT, // Motion vector format: signed 16-bit X,Y
        .SampleDesc       = {.Count = 1, .Quality = 0},
        .Layout           = D3D12_TEXTURE_LAYOUT_UNKNOWN,
        .Flags            = D3D12_RESOURCE_FLAG_NONE,
    };

    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_default, D3D12_HEAP_FLAG_NONE,
                                              &texture_desc, D3D12_RESOURCE_STATE_COMMON, NULL,
                                              &IID_ID3D12Resource, (void **)&s->resolved_mv_texture_back);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create backward motion vector texture (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_default, D3D12_HEAP_FLAG_NONE,
                                              &texture_desc, D3D12_RESOURCE_STATE_COMMON, NULL,
                                              &IID_ID3D12Resource, (void **)&s->resolved_mv_texture_fwd);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create forward motion vector texture (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    // Create READBACK buffers for CPU access
    // Need to calculate proper size accounting for D3D12 row pitch alignment
    // Get the footprint to determine the actual required buffer size
    D3D12_PLACED_SUBRESOURCE_FOOTPRINT temp_layout;
    UINT64 temp_total_size;
    ID3D12Device_GetCopyableFootprints(s->device, &texture_desc, 0, 1, 0,
                                       &temp_layout, NULL, NULL, &temp_total_size);

    s->readback_buffer_size = temp_total_size;
    // Remember the padded row pitch: the CPU-side parser must step through
    // the readback buffer with this stride, not with mb_width * 4 bytes.
    s->readback_row_pitch   = temp_layout.Footprint.RowPitch;

    av_log(ctx, AV_LOG_DEBUG, "Readback buffer size: %llu bytes (texture: %dx%d, pitch: %u)\n",
           (unsigned long long)s->readback_buffer_size, mb_width, mb_height,
           temp_layout.Footprint.RowPitch);

    D3D12_HEAP_PROPERTIES heap_props_readback = {.Type = D3D12_HEAP_TYPE_READBACK};
    D3D12_RESOURCE_DESC buffer_desc = {
        .Dimension        = D3D12_RESOURCE_DIMENSION_BUFFER,
        .Alignment        = 0,
        .Width            = s->readback_buffer_size,
        .Height           = 1,
        .DepthOrArraySize = 1,
        .MipLevels        = 1,
        .Format           = DXGI_FORMAT_UNKNOWN,
        .SampleDesc       = {.Count = 1, .Quality = 0},
        .Layout           = D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
        .Flags            = D3D12_RESOURCE_FLAG_NONE,
    };

    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_readback, D3D12_HEAP_FLAG_NONE,
                                              &buffer_desc, D3D12_RESOURCE_STATE_COPY_DEST, NULL,
                                              &IID_ID3D12Resource, (void **)&s->readback_buffer_back);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create backward readback buffer (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_readback, D3D12_HEAP_FLAG_NONE,
                                              &buffer_desc, D3D12_RESOURCE_STATE_COPY_DEST, NULL,
                                              &IID_ID3D12Resource, (void **)&s->readback_buffer_fwd);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create forward readback buffer (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    // Create graphics command queue, allocator and list for copy operations
    D3D12_COMMAND_QUEUE_DESC copy_queue_desc = {
        .Type     = D3D12_COMMAND_LIST_TYPE_DIRECT,
        .Priority = 0,
        .Flags    = D3D12_COMMAND_QUEUE_FLAG_NONE,
        .NodeMask = 0,
    };

    hr = ID3D12Device_CreateCommandQueue(s->device, &copy_queue_desc,
                                         &IID_ID3D12CommandQueue, (void **)&s->copy_command_queue);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create copy command queue\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12Device_CreateCommandAllocator(s->device, D3D12_COMMAND_LIST_TYPE_DIRECT,
                                             &IID_ID3D12CommandAllocator,
                                             (void **)&s->copy_command_allocator);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create copy command allocator\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12Device_CreateCommandList(s->device, 0, D3D12_COMMAND_LIST_TYPE_DIRECT,
                                        s->copy_command_allocator, NULL,
                                        &IID_ID3D12GraphicsCommandList,
                                        (void **)&s->copy_command_list);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to create copy command list\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12GraphicsCommandList_Close(s->copy_command_list);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to close copy command list\n");
        return AVERROR(EINVAL);
    }

    return 0;
}

/**
 * Output link configuration: take references on the input hardware frames
 * and device contexts, propagate the frames context to the output, obtain
 * the ID3D12VideoDevice1 interface and create all D3D12 objects.
 *
 * Also called lazily from filter_frame() when the filter is not yet marked
 * as initialized.
 *
 * @return 0 on success, a negative AVERROR code otherwise
 */
static int mestimate_d3d12_config_props(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    AVFilterLink *inlink = ctx->inputs[0];
    FilterLink *inl = ff_filter_link(inlink);
    FilterLink *outl = ff_filter_link(outlink);
    MEstimateD3D12Context *s = ctx->priv;
    AVHWFramesContext *hw_frames_ctx;
    HRESULT hr;
    int err;

    if (!inl->hw_frames_ctx) {
        av_log(ctx, AV_LOG_ERROR, "D3D12 hardware frames context required\n");
        return AVERROR(EINVAL);
    }

    hw_frames_ctx = (AVHWFramesContext *)inl->hw_frames_ctx->data;
    if (hw_frames_ctx->format != AV_PIX_FMT_D3D12) {
        av_log(ctx, AV_LOG_ERROR, "Input must be D3D12 frames\n");
        return AVERROR(EINVAL);
    }

    s->hw_frames_ref = av_buffer_ref(inl->hw_frames_ctx);
    if (!s->hw_frames_ref)
        return AVERROR(ENOMEM);

    s->frames_ctx = hw_frames_ctx->hwctx;

    s->hw_device_ref = av_buffer_ref(hw_frames_ctx->device_ref);
    if (!s->hw_device_ref)
        return AVERROR(ENOMEM);

    s->device_ctx = ((AVHWDeviceContext *)s->hw_device_ref->data)->hwctx;
    s->device = s->device_ctx->device;

    // Propagate hardware frames context to output
    outl->hw_frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
    if (!outl->hw_frames_ctx)
        return AVERROR(ENOMEM);

    // Query for ID3D12VideoDevice1 interface from the base video device.
    // QueryInterface hands out a new reference; it is dropped in uninit().
    hr = ID3D12VideoDevice_QueryInterface(s->device_ctx->video_device, &IID_ID3D12VideoDevice1,
                                          (void **)&s->video_device);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "ID3D12VideoDevice1 interface not supported\n");
        return AVERROR(ENOSYS);
    }

    err = mestimate_d3d12_create_objects(ctx);
    if (err < 0)
        return err;

    err = mestimate_d3d12_create_motion_estimator(ctx, inlink->w, inlink->h);
    if (err < 0)
        return err;

    s->initialized = 1;

    return 0;
}

/**
 * Block the calling thread until the filter fence has reached
 * s->fence_value, i.e. until the most recently signalled GPU work is done.
 *
 * @return 0 on success, AVERROR(EINVAL) if the completion event could not
 *         be armed or the wait itself failed
 */
static int mestimate_d3d12_sync_gpu(MEstimateD3D12Context *s)
{
    uint64_t completion = ID3D12Fence_GetCompletedValue(s->fence);

    if (completion < s->fence_value) {
        if (FAILED(ID3D12Fence_SetEventOnCompletion(s->fence, s->fence_value, s->fence_event)))
            return AVERROR(EINVAL);

        // With an INFINITE, non-alertable wait anything other than
        // WAIT_OBJECT_0 means the wait failed; do not touch GPU output then.
        if (WaitForSingleObjectEx(s->fence_event, INFINITE, FALSE) != WAIT_OBJECT_0)
            return AVERROR(EINVAL);
    }

    return 0;
}

/**
 * Fill a transition barrier covering all subresources of a resource.
 */
static inline void d3d12_barrier_transition(D3D12_RESOURCE_BARRIER *barrier,
                                            ID3D12Resource *resource,
                                            D3D12_RESOURCE_STATES state_before,
                                            D3D12_RESOURCE_STATES state_after)
{
    barrier->Type  = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
    barrier->Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
    barrier->Transition.pResource   = resource;
    barrier->Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
    barrier->Transition.StateBefore = state_before;
    barrier->Transition.StateAfter  = state_after;
}

/**
 * Fill one AVMotionVector entry for a square block.
 *
 * @param mv      entry to fill
 * @param mb_size block edge length in pixels
 * @param x,y     top-left corner of the block in the current frame
 * @param x_mv    top-left x of the matching block in the reference frame
 * @param y_mv    top-left y of the matching block in the reference frame
 * @param dir     0 selects source = -1, anything else source = 1
 */
static void add_mv_data(AVMotionVector *mv, int mb_size,
                        int x, int y, int x_mv, int y_mv, int dir)
{
    mv->w = mb_size;
    mv->h = mb_size;
    // dst/src are reported at the block centre
    mv->dst_x = x + (mb_size >> 1);
    mv->dst_y = y + (mb_size >> 1);
    mv->src_x = x_mv + (mb_size >> 1);
    mv->src_y = y_mv + (mb_size >> 1);
    mv->source = dir ? 1 : -1;
    mv->flags = 0;
    mv->motion_x = x_mv - x;
    mv->motion_y = y_mv - y;
    mv->motion_scale = 1;
}

/**
 * Map one readback buffer and convert its contents into AVMotionVector
 * entries inside the AV_FRAME_DATA_MOTION_VECTORS side data of @p out.
 *
 * The side data must already exist and hold room for two directions; the
 * vectors of direction d are written starting at index
 * d * mb_width * mb_height.
 *
 * @param out       frame carrying the motion vector side data
 * @param direction 0 = backward (readback_buffer_back),
 *                  otherwise forward (readback_buffer_fwd)
 * @return 0 on success, AVERROR(EINVAL) if the side data is missing or the
 *         buffer cannot be mapped
 */
static int mestimate_d3d12_read_motion_vectors(AVFilterContext *ctx, AVFrame *out, int direction)
{
    MEstimateD3D12Context *s = ctx->priv;
    // The CPU only reads the readback buffer; an empty written range tells
    // the driver so on Unmap.
    const D3D12_RANGE no_write = { 0, 0 };
    uint8_t *mapped_data = NULL;
    HRESULT hr;
    AVFrameSideData *sd;
    AVMotionVector *mvs;
    int mb_x, mb_y, mv_idx;
    int mb_width, mb_height;
    ID3D12Resource *buffer = (direction == 0) ? s->readback_buffer_back : s->readback_buffer_fwd;

    // Map the readback buffer
    hr = ID3D12Resource_Map(buffer, 0, NULL, (void **)&mapped_data);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to map readback buffer (dir=%d, hr=0x%lx)\n",
               direction, (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    // Get the motion vector side data
    sd = av_frame_get_side_data(out, AV_FRAME_DATA_MOTION_VECTORS);
    if (!sd) {
        av_log(ctx, AV_LOG_ERROR, "No motion vector side data found\n");
        ID3D12Resource_Unmap(buffer, 0, &no_write);
        return AVERROR(EINVAL);
    }

    mvs = (AVMotionVector *)sd->data;
    mb_width  = (out->width  + s->block_size - 1) / s->block_size;
    mb_height = (out->height + s->block_size - 1) / s->block_size;

    // Calculate offset for this direction (0 = backward, 1 = forward)
    mv_idx = direction * mb_width * mb_height;

    // The buffer holds the DXGI_FORMAT_R16G16_SINT texture as copied by
    // CopyTextureRegion: per block two int16_t values (R = X, G = Y) in
    // quarter-pel units. Rows are NOT tightly packed: consecutive rows are
    // readback_row_pitch bytes apart, which is the padded pitch reported by
    // GetCopyableFootprints and used as the copy destination footprint.
    for (mb_y = 0; mb_y < mb_height; mb_y++) {
        const int16_t *row = (const int16_t *)(mapped_data + (size_t)mb_y * s->readback_row_pitch);
        const int y_mb = mb_y * s->block_size;

        for (mb_x = 0; mb_x < mb_width; mb_x++) {
            const int x_mb = mb_x * s->block_size;

            // R component (index 0) = X motion, G component (index 1) = Y motion
            int16_t mv_x_qpel = row[mb_x * 2 + 0];
            int16_t mv_y_qpel = row[mb_x * 2 + 1];

            // Convert from quarter-pel to full pixel coordinates:
            // the stored value is 4x the pixel displacement.
            int src_x = x_mb + (mv_x_qpel / 4);
            int src_y = y_mb + (mv_y_qpel / 4);

            // Store dst (current position) and src (where it came from)
            add_mv_data(&mvs[mv_idx++], s->block_size, x_mb, y_mb, src_x, src_y, direction);

            av_log(ctx, AV_LOG_TRACE, "Block[%d,%d] dir=%d: MV=(%d,%d) qpel -> (%d,%d) pixels\n",
                   mb_x, mb_y, direction, mv_x_qpel, mv_y_qpel, mv_x_qpel / 4, mv_y_qpel / 4);
        }
    }

    ID3D12Resource_Unmap(buffer, 0, &no_write);

    av_log(ctx, AV_LOG_DEBUG, "Parsed %d motion vectors for direction %d\n",
           mb_width * mb_height, direction);

    return 0;
}

/**
 * Record, submit and wait for the motion estimation of the current frame
 * against the previous frame and, when @p next is non-NULL, against the
 * next frame. Results are resolved into resolved_mv_texture_back/_fwd.
 *
 * @return 0 on success, a negative AVERROR code otherwise
 */
static int mestimate_d3d12_submit_estimation(AVFilterContext *ctx,
                                             AVD3D12VAFrame *cur,
                                             AVD3D12VAFrame *prev,
                                             AVD3D12VAFrame *next)
{
    MEstimateD3D12Context *s = ctx->priv;
    D3D12_RESOURCE_BARRIER barriers[3];
    int barrier_count = 2;
    HRESULT hr;

    // Reset command allocator and list ONCE for both estimations
    hr = ID3D12CommandAllocator_Reset(s->command_allocator);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to reset command allocator\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12VideoEncodeCommandList_Reset(s->command_list, s->command_allocator);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to reset command list\n");
        return AVERROR(EINVAL);
    }

    // Transition the involved frames to VIDEO_ENCODE_READ
    d3d12_barrier_transition(&barriers[0], cur->texture,
                             D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
    d3d12_barrier_transition(&barriers[1], prev->texture,
                             D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
    if (next) {
        d3d12_barrier_transition(&barriers[2], next->texture,
                                 D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);
        barrier_count = 3;
    }

    ID3D12VideoEncodeCommandList_ResourceBarrier(s->command_list, barrier_count, barriers);

    // Backward motion estimation (cur -> prev)
    D3D12_VIDEO_MOTION_ESTIMATOR_INPUT input_back = {
        .pInputTexture2D           = cur->texture,
        .InputSubresourceIndex     = 0,
        .pReferenceTexture2D       = prev->texture,
        .ReferenceSubresourceIndex = 0,
        .pHintMotionVectorHeap     = NULL,
    };

    D3D12_VIDEO_MOTION_ESTIMATOR_OUTPUT output = {
        .pMotionVectorHeap = s->motion_vector_heap,
    };

    ID3D12VideoEncodeCommandList_EstimateMotion(s->command_list, s->motion_estimator,
                                                &output, &input_back);

    D3D12_RESOLVE_VIDEO_MOTION_VECTOR_HEAP_INPUT resolve_input = {
        .pMotionVectorHeap = s->motion_vector_heap,
        .PixelWidth        = s->cur_frame->width,
        .PixelHeight       = s->cur_frame->height,
    };

    D3D12_RESOLVE_VIDEO_MOTION_VECTOR_HEAP_OUTPUT resolve_output_back = {
        .pMotionVectorTexture2D = s->resolved_mv_texture_back,
        .MotionVectorCoordinate = {.X = 0, .Y = 0, .Z = 0, .SubresourceIndex = 0},
    };

    ID3D12VideoEncodeCommandList_ResolveMotionVectorHeap(s->command_list,
                                                         &resolve_output_back, &resolve_input);

    // The copy to the readback buffers is recorded later on a DIRECT
    // command list (see mestimate_d3d12_copy_to_readback()).

    // Forward motion estimation (cur -> next) if next frame exists
    if (next) {
        D3D12_VIDEO_MOTION_ESTIMATOR_INPUT input_fwd = {
            .pInputTexture2D           = cur->texture,
            .InputSubresourceIndex     = 0,
            .pReferenceTexture2D       = next->texture,
            .ReferenceSubresourceIndex = 0,
            .pHintMotionVectorHeap     = NULL,
        };

        ID3D12VideoEncodeCommandList_EstimateMotion(s->command_list, s->motion_estimator,
                                                    &output, &input_fwd);

        D3D12_RESOLVE_VIDEO_MOTION_VECTOR_HEAP_OUTPUT resolve_output_fwd = {
            .pMotionVectorTexture2D = s->resolved_mv_texture_fwd,
            .MotionVectorCoordinate = {.X = 0, .Y = 0, .Z = 0, .SubresourceIndex = 0},
        };

        ID3D12VideoEncodeCommandList_ResolveMotionVectorHeap(s->command_list,
                                                             &resolve_output_fwd, &resolve_input);
    }

    // Transition resources back to COMMON (reuse barriers by swapping states)
    for (int i = 0; i < barrier_count; i++)
        FFSWAP(D3D12_RESOURCE_STATES, barriers[i].Transition.StateBefore,
               barriers[i].Transition.StateAfter);

    ID3D12VideoEncodeCommandList_ResourceBarrier(s->command_list, barrier_count, barriers);

    // Close command list ONCE
    hr = ID3D12VideoEncodeCommandList_Close(s->command_list);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to close command list (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    // Make our queue wait for the producers of the input frames
    hr = ID3D12CommandQueue_Wait(s->command_queue, cur->sync_ctx.fence,
                                 cur->sync_ctx.fence_value);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to wait for current frame\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12CommandQueue_Wait(s->command_queue, prev->sync_ctx.fence,
                                 prev->sync_ctx.fence_value);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to wait for previous frame\n");
        return AVERROR(EINVAL);
    }

    if (next) {
        hr = ID3D12CommandQueue_Wait(s->command_queue, next->sync_ctx.fence,
                                     next->sync_ctx.fence_value);
        if (FAILED(hr)) {
            av_log(ctx, AV_LOG_ERROR, "Failed to wait for next frame\n");
            return AVERROR(EINVAL);
        }
    }

    // Execute command list ONCE
    ID3D12CommandQueue_ExecuteCommandLists(s->command_queue, 1,
                                           (ID3D12CommandList **)&s->command_list);

    // Signal completion
    hr = ID3D12CommandQueue_Signal(s->command_queue, s->fence, ++s->fence_value);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to signal fence\n");
        return AVERROR(EINVAL);
    }

    // Wait for GPU to complete
    return mestimate_d3d12_sync_gpu(s);
}

/**
 * Record the copy of one resolved MV texture into its readback buffer on
 * the (already reset) copy command list.
 *
 * @param tag       capitalised short label used in the debug logs
 *                  ("Back" / "Fwd")
 * @param direction direction name used in the debug logs
 *                  ("backward" / "forward")
 */
static void mestimate_d3d12_record_mv_copy(AVFilterContext *ctx,
                                           ID3D12Resource *texture,
                                           ID3D12Resource *readback,
                                           const char *tag,
                                           const char *direction)
{
    MEstimateD3D12Context *s = ctx->priv;
    D3D12_RESOURCE_DESC desc;
    D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout;

    // Get the resource description of the texture
    texture->lpVtbl->GetDesc(texture, &desc);

    av_log(ctx, AV_LOG_DEBUG, "%s texture desc: Width=%llu, Height=%u, Format=%d\n",
           tag, (unsigned long long)desc.Width, desc.Height, desc.Format);

    // Only the placed footprint is needed; the other outputs are optional
    ID3D12Device_GetCopyableFootprints(s->device, &desc, 0, 1, 0,
                                       &layout, NULL, NULL, NULL);

    av_log(ctx, AV_LOG_DEBUG, "%s layout: Offset=%llu, Width=%u, Height=%u, Depth=%u, RowPitch=%u\n",
           tag, (unsigned long long)layout.Offset, layout.Footprint.Width,
           layout.Footprint.Height, layout.Footprint.Depth, layout.Footprint.RowPitch);

    D3D12_TEXTURE_COPY_LOCATION src = {
        .pResource        = texture,
        .Type             = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX,
        .SubresourceIndex = 0
    };

    D3D12_TEXTURE_COPY_LOCATION dst = {
        .pResource       = readback,
        .Type            = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT,
        .PlacedFootprint = {
            .Offset    = 0,
            .Footprint = layout.Footprint
        }
    };

    av_log(ctx, AV_LOG_DEBUG, "Copying %s MVs...\n", direction);
    ID3D12GraphicsCommandList_CopyTextureRegion(s->copy_command_list, &dst, 0, 0, 0, &src, NULL);
}

/**
 * Copy the resolved MV texture(s) into the readback buffer(s) using the
 * DIRECT queue and wait for the copy to finish.
 *
 * @param with_forward non-zero to also copy the forward MV texture
 * @return 0 on success, a negative AVERROR code otherwise
 */
static int mestimate_d3d12_copy_to_readback(AVFilterContext *ctx, int with_forward)
{
    MEstimateD3D12Context *s = ctx->priv;
    D3D12_RESOURCE_BARRIER copy_barriers[2];
    int copy_barrier_count = 1;
    HRESULT hr;

    hr = ID3D12CommandAllocator_Reset(s->copy_command_allocator);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to reset copy command allocator\n");
        return AVERROR(EINVAL);
    }

    hr = ID3D12GraphicsCommandList_Reset(s->copy_command_list, s->copy_command_allocator, NULL);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to reset copy command list\n");
        return AVERROR(EINVAL);
    }

    // Transition resolved textures to COPY_SOURCE state
    d3d12_barrier_transition(&copy_barriers[0], s->resolved_mv_texture_back,
                             D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);
    if (with_forward) {
        d3d12_barrier_transition(&copy_barriers[1], s->resolved_mv_texture_fwd,
                                 D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);
        copy_barrier_count = 2;
    }

    ID3D12GraphicsCommandList_ResourceBarrier(s->copy_command_list, copy_barrier_count, copy_barriers);

    mestimate_d3d12_record_mv_copy(ctx, s->resolved_mv_texture_back,
                                   s->readback_buffer_back, "Back", "backward");
    if (with_forward)
        mestimate_d3d12_record_mv_copy(ctx, s->resolved_mv_texture_fwd,
                                       s->readback_buffer_fwd, "Fwd", "forward");

    // Transition back to COMMON state (reuse barriers by swapping states)
    for (int i = 0; i < copy_barrier_count; i++)
        FFSWAP(D3D12_RESOURCE_STATES, copy_barriers[i].Transition.StateBefore,
               copy_barriers[i].Transition.StateAfter);

    ID3D12GraphicsCommandList_ResourceBarrier(s->copy_command_list, copy_barrier_count, copy_barriers);

    hr = ID3D12GraphicsCommandList_Close(s->copy_command_list);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to close copy command list (hr=0x%lx)\n", (unsigned long)hr);
        return AVERROR(EINVAL);
    }

    // Execute copy command list on the copy queue
    ID3D12CommandQueue_ExecuteCommandLists(s->copy_command_queue, 1,
                                           (ID3D12CommandList **)&s->copy_command_list);

    // Signal and wait for copy completion
    hr = ID3D12CommandQueue_Signal(s->copy_command_queue, s->fence, ++s->fence_value);
    if (FAILED(hr)) {
        av_log(ctx, AV_LOG_ERROR, "Failed to signal fence for copy\n");
        return AVERROR(EINVAL);
    }

    return mestimate_d3d12_sync_gpu(s);
}

/**
 * Input pad callback. Takes ownership of @p frame.
 *
 * Keeps a three-frame window (prev, cur, next). Once a previous frame is
 * available, a clone of the current frame is sent downstream with
 * AV_FRAME_DATA_MOTION_VECTORS side data holding the backward
 * (cur -> prev) vectors followed by the forward (cur -> next) vectors.
 *
 * @return 0 or the result of ff_filter_frame() on success, a negative
 *         AVERROR code otherwise
 */
static int mestimate_d3d12_filter_frame(AVFilterLink *inlink, AVFrame *frame)
{
    AVFilterContext *ctx = inlink->dst;
    MEstimateD3D12Context *s = ctx->priv;
    AVFrame *out;
    AVFrameSideData *sd;
    AVD3D12VAFrame *cur_hwframe, *prev_hwframe, *next_hwframe = NULL;
    int err;
    int mb_width, mb_height, mb_count;
    int with_forward;

    if (!s->initialized) {
        err = mestimate_d3d12_config_props(ctx->outputs[0]);
        if (err < 0) {
            av_frame_free(&frame);
            return err;
        }
    }

    // Manage frame buffer: shift the window; the context now owns 'frame'
    av_frame_free(&s->prev_frame);
    s->prev_frame = s->cur_frame;
    s->cur_frame  = s->next_frame;
    s->next_frame = frame;

    if (!s->cur_frame) {
        // Very first frame: use it as the current frame too
        s->cur_frame = av_frame_clone(frame);
        if (!s->cur_frame)
            return AVERROR(ENOMEM);
    }

    if (!s->prev_frame)
        return 0;

    // Clone current frame for output
    out = av_frame_clone(s->cur_frame);
    if (!out)
        return AVERROR(ENOMEM);

    mb_width  = (frame->width  + s->block_size - 1) / s->block_size;
    mb_height = (frame->height + s->block_size - 1) / s->block_size;
    mb_count  = mb_width * mb_height;

    // Allocate side data for motion vectors (2 directions)
    sd = av_frame_new_side_data(out, AV_FRAME_DATA_MOTION_VECTORS,
                                2 * mb_count * sizeof(AVMotionVector));
    if (!sd) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    // Get hardware frame pointers
    cur_hwframe  = (AVD3D12VAFrame *)s->cur_frame->data[0];
    prev_hwframe = (AVD3D12VAFrame *)s->prev_frame->data[0];
    if (s->next_frame)
        next_hwframe = (AVD3D12VAFrame *)s->next_frame->data[0];

    // One flag drives resolve, copy and parse of the forward direction so
    // that the three stages can never disagree.
    with_forward = next_hwframe != NULL;

    err = mestimate_d3d12_submit_estimation(ctx, cur_hwframe, prev_hwframe, next_hwframe);
    if (err < 0)
        goto fail;

    // Now copy the resolved textures to readback buffers using graphics command list
    err = mestimate_d3d12_copy_to_readback(ctx, with_forward);
    if (err < 0)
        goto fail;

    // Read motion vectors for both directions
    err = mestimate_d3d12_read_motion_vectors(ctx, out, 0);
    if (err < 0)
        goto fail;

    if (with_forward) {
        err = mestimate_d3d12_read_motion_vectors(ctx, out, 1);
        if (err < 0)
            goto fail;
    }

    return ff_filter_frame(ctx->outputs[0], out);

fail:
    av_frame_free(&out);
    return err;
}

/**
 * Filter uninit callback: free the buffered frames and release every D3D12
 * object, the fence event and the hardware context references.
 */
static av_cold void mestimate_d3d12_uninit(AVFilterContext *ctx)
{
    MEstimateD3D12Context *s = ctx->priv;

    av_frame_free(&s->prev_frame);
    av_frame_free(&s->cur_frame);
    av_frame_free(&s->next_frame);

    D3D12_OBJECT_RELEASE(s->copy_command_list);
    D3D12_OBJECT_RELEASE(s->copy_command_allocator);
    D3D12_OBJECT_RELEASE(s->copy_command_queue);
    D3D12_OBJECT_RELEASE(s->readback_buffer_back);
    D3D12_OBJECT_RELEASE(s->readback_buffer_fwd);
    D3D12_OBJECT_RELEASE(s->resolved_mv_texture_back);
    D3D12_OBJECT_RELEASE(s->resolved_mv_texture_fwd);
    D3D12_OBJECT_RELEASE(s->motion_vector_heap);
    D3D12_OBJECT_RELEASE(s->motion_estimator);
    // Drop the reference taken by QueryInterface in config_props()
    D3D12_OBJECT_RELEASE(s->video_device);
    D3D12_OBJECT_RELEASE(s->command_list);
    D3D12_OBJECT_RELEASE(s->command_allocator);
    D3D12_OBJECT_RELEASE(s->command_queue);
    D3D12_OBJECT_RELEASE(s->fence);

    if (s->fence_event) {
        CloseHandle(s->fence_event);
        s->fence_event = NULL;
    }

    av_buffer_unref(&s->hw_frames_ref);
    av_buffer_unref(&s->hw_device_ref);
}

static const AVFilterPad mestimate_d3d12_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .filter_frame = mestimate_d3d12_filter_frame,
    },
};

static const AVFilterPad mestimate_d3d12_outputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = mestimate_d3d12_config_props,
    },
};

#define OFFSET(x) offsetof(MEstimateD3D12Context, x)
#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM

static const AVOption mestimate_d3d12_options[] = {
    { "mb_size", "macroblock size, only 8 and 16 are supported", OFFSET(block_size), AV_OPT_TYPE_INT, {.i64 = 16}, 8, 16, FLAGS },
    { NULL }
};

AVFILTER_DEFINE_CLASS(mestimate_d3d12);

const FFFilter ff_vf_mestimate_d3d12 = {
    .p.name         = "mestimate_d3d12",
    .p.description  = NULL_IF_CONFIG_SMALL("Generate motion vectors using D3D12 hardware acceleration."),
    .p.priv_class   = &mestimate_d3d12_class,
    .p.flags        = AVFILTER_FLAG_METADATA_ONLY | AVFILTER_FLAG_HWDEVICE,
    .priv_size      = sizeof(MEstimateD3D12Context),
    .init           = mestimate_d3d12_init,
    .uninit         = mestimate_d3d12_uninit,
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
    FILTER_INPUTS(mestimate_d3d12_inputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_D3D12),
    FILTER_OUTPUTS(mestimate_d3d12_outputs),
};