;******************************************************************************
;* Copyright (c) 2025 Niklas Haas
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; High-level explanation of how the x86 backend works:
;
; sws_processN is the shared entry point for all operation chains. This
; function is responsible for the block loop, as well as for initializing the
; plane pointers. It jumps directly into the first operation kernel, and each
; operation kernel jumps directly into the next one, with the final kernel
; jumping back to the sws_process return point. (See the label
; `sws_process.return` in ops_int.asm)
;
; To handle the jump back to the return point, we append an extra address,
; corresponding to the correct sws_process.return label, to the SwsOpChain,
; and have the WRITE kernel jump into it as usual. (See the FINISH macro)
;
; Inside an operation chain, we use a custom calling convention to preserve
; registers between kernels. The exact register allocation is found further
; below in this file, but we basically reserve (and share) the following
; registers:
;
; - execq: read-only, shared execution data (see SwsOpExec); stores the
;   static metadata for this call and describes the image layouts
;
; - implq: read-only, operation chain (see SwsOpChain); stores the private
;   data for each operation, as well as the pointer to the next kernel in
;   the sequence. This register is automatically incremented by the CONTINUE
;   macro, and is reset back to the first operation kernel by sws_process.
;
; - bxd, yd: current block and line number, used as loop counters in
;   sws_process; also used by e.g. the dithering code to do
;   position-dependent dithering
;
; - tmp0, tmp1: two temporary registers which are NOT preserved between
;   kernels
;
; - inNq, outNq: plane pointers; these are incremented automatically after
;   the corresponding read/write operation, by the read/write kernels
;   themselves. sws_process takes care of resetting them to the next line
;   after the block loop is done.
;
; Additionally, we pass data between kernels by keeping it directly inside
; vector registers. For this, we reserve the following registers:
;
; - mx, my, mz, mw:     low half of the X, Y, Z and W components
; - mx2, my2, mz2, mw2: high half of the X, Y, Z and W components
;   (as well as sized variants xmx, ymx, etc.)
;
; The "high half" registers are only sometimes used, in order to process
; more pixels at the same time. See `decl_v2` below, which allows assembling
; the same operation twice, once with only the lower half (V2=0) and once
; with both halves (V2=1). The remaining vector registers are free for use
; inside operation kernels, starting from m8.
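;
; For illustration, the control flow of a minimal chain of two hypothetical
; kernels (the kernel names here are made up) looks roughly like this:
;
;   sws_process:                      ; block loop, see ops_int.asm
;       ...                           ; set up execq, implq, bxd, yd and the
;                                     ; plane pointers
;       jmp [implq + SwsOpImpl.cont]  ; enter the first kernel
;
;   read_kernel:
;       ...                           ; load pixels into mx/my/mz/mw (and the
;                                     ; high halves when V2=1), bump in0q
;       CONTINUE                      ; implq += SwsOpImpl.next, tail call
;                                     ; into the next kernel's .cont
;
;   write_kernel:
;       ...                           ; store the components, bump out0q
;       LOAD_CONT tmp0q               ; the final .cont entry holds the
;                                     ; address of sws_process.return
;       FINISH tmp0q                  ; jump back, without advancing implq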
;
; The basic rule is that we always use the full set of both vector registers
; when processing the largest element size within a pixel chain. For example,
; if we load 8-bit values and convert them to 32-bit floats internally, then
; we would have an operation chain which combines an SSE4 V2=0 u8 kernel (128
; bits = 16 pixels) with an AVX2 V2=1 f32 kernel (512 bits = 16 pixels). This
; keeps the number of pixels being processed (the block size) constant. The
; V2 setting is suffixed to the operation name (_m1 or _m2) during name
; mangling.
;
; This design leaves us with the following set of possibilities:
;
; SSE4:
; - max element is 32-bit: currently unsupported
; - max element is 16-bit: currently unsupported
; - max element is 8-bit:  block size 32, u8_m2_sse4
;
; AVX2:
; - max element is 32-bit: block size 16, u32_m2_avx2, u16_m1_avx2, u8_m1_sse4
; - max element is 16-bit: block size 32, u16_m2_avx2, u8_m1_avx2
; - max element is 8-bit:  block size 64, u8_m2_avx2
;
; Meaning we need to cover the following code paths for each bit depth:
;
; - 8-bit kernels:  m1_sse4, m2_sse4, m1_avx2, m2_avx2
; - 16-bit kernels: m1_avx2, m2_avx2
; - 32-bit kernels: m2_avx2
;
; This is achieved by wrapping each operation kernel in a macro and declaring
; it once per SIMD version, and (if needed) once per V2 setting using
; decl_v2. (See the bottom of ops_int.asm for an example)
;
; Finally, we specialize some operation kernels for different numbers of
; active components, using the `decl_pattern` and `decl_common_patterns`
; macros. Inside these kernels, the variables X, Y, Z and W are each set to
; 0 or 1, depending on which components are active for this particular
; kernel instance. Such kernels receive a pXYZW_ prefix (e.g. p1110_ for a
; yuv kernel) during name mangling.

struc SwsOpExec
    .in0            resq 1
    .in1            resq 1
    .in2            resq 1
    .in3            resq 1
    .out0           resq 1
    .out1           resq 1
    .out2           resq 1
    .out3           resq 1
    .in_stride0     resq 1
    .in_stride1     resq 1
    .in_stride2     resq 1
    .in_stride3     resq 1
    .out_stride0    resq 1
    .out_stride1    resq 1
    .out_stride2    resq 1
    .out_stride3    resq 1
    .in_bump0       resq 1
    .in_bump1       resq 1
    .in_bump2       resq 1
    .in_bump3       resq 1
    .out_bump0      resq 1
    .out_bump1      resq 1
    .out_bump2      resq 1
    .out_bump3      resq 1
    .width          resd 1
    .height         resd 1
    .slice_y        resd 1
    .slice_h        resd 1
    .block_size_in  resd 1
    .block_size_out resd 1
    .in_sub_y4      resb 4
    .out_sub_y4     resb 4
    .in_sub_x4      resb 4
    .out_sub_x4     resb 4
endstruc

struc SwsOpImpl
    .cont resb 16
    .priv resb 16
    .next resb 0
endstruc
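
; For illustration: a kernel finds 16 bytes of per-operation private data
; (filled in by the C setup code) at implq + SwsOpImpl.priv, and the address
; of the next kernel at implq + SwsOpImpl.cont. The layout of .priv is
; specific to each operation; the offsets and constants sketched below are
; hypothetical:
;
;   vbroadcastss m8, [implq + SwsOpImpl.priv]      ; e.g. a per-op scale
;   vbroadcastss m9, [implq + SwsOpImpl.priv + 4]  ; e.g. a per-op offset
;   LOAD_CONT tmp0q                                ; next kernel's address
;   ...                                            ; the actual work
;   CONTINUE tmp0q                                 ; advance implq, tail call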

;---------------------------------------------------------
; Common macros for declaring operations

; Declare an operation kernel with the correct name mangling.
%macro op 1 ; name
%ifdef X
    %define ADD_PAT(name) p %+ X %+ Y %+ Z %+ W %+ _ %+ name
%else
    %define ADD_PAT(name) name
%endif
%ifdef V2
    %if V2
        %define ADD_MUL(name) name %+ _m2
    %else
        %define ADD_MUL(name) name %+ _m1
    %endif
%else
    %define ADD_MUL(name) name
%endif
cglobal ADD_PAT(ADD_MUL(%1)), 0, 0, 0 ; already allocated by entry point
%undef ADD_PAT
%undef ADD_MUL
%endmacro

; Declare an operation kernel with an explicit V2 setting; invoke once with
; V2=0 and once with V2=1 to assemble both variants
%macro decl_v2 2+ ; v2, func
    %xdefine V2 %1
    %2
    %undef V2
%endmacro

; Declare an operation kernel specialized to a given subset of active
; components
%macro decl_pattern 5+ ; X, Y, Z, W, func
    %xdefine X %1
    %xdefine Y %2
    %xdefine Z %3
    %xdefine W %4
    %5
    %undef X
    %undef Y
    %undef Z
    %undef W
%endmacro

; Declare an operation kernel specialized to each common component pattern
%macro decl_common_patterns 1+ ; func
    decl_pattern 1, 0, 0, 0, %1 ; y
    decl_pattern 1, 0, 0, 1, %1 ; ya
    decl_pattern 1, 1, 1, 0, %1 ; yuv
    decl_pattern 1, 1, 1, 1, %1 ; yuva
%endmacro

;---------------------------------------------------------
; Common names for the internal calling convention

%define mx   m0
%define my   m1
%define mz   m2
%define mw   m3
%define xmx  xm0
%define xmy  xm1
%define xmz  xm2
%define xmw  xm3
%define ymx  ym0
%define ymy  ym1
%define ymz  ym2
%define ymw  ym3

%define mx2  m4
%define my2  m5
%define mz2  m6
%define mw2  m7
%define xmx2 xm4
%define xmy2 xm5
%define xmz2 xm6
%define xmw2 xm7
%define ymx2 ym4
%define ymy2 ym5
%define ymz2 ym6
%define ymw2 ym7

; Reserved in this order by the signature of SwsOpFunc
%define execq r0q
%define implq r1q
%define bxd   r2d
%define yd    r3d

; Extra registers for free use by kernels, not saved between ops
%define tmp0q r4q
%define tmp1q r5q
%define tmp0d r4d
%define tmp1d r5d
%define tmp0w r4w
%define tmp1w r5w

; Registers for plane pointers; put at the end (and in ascending plane order)
; so that we can avoid reserving them when not necessary
%define out0q r6q
%define in0q  r7q
%define out1q r8q
%define in1q  r9q
%define out2q r10q
%define in2q  r11q
%define out3q r12q
%define in3q  r13q

;---------------------------------------------------------
; Common macros for linking together different kernels

; Load the next operation kernel's address into a register
%macro LOAD_CONT 1 ; reg
    mov %1, [implq + SwsOpImpl.cont]
%endmacro

; Tail call into the next operation kernel, given that kernel's address
%macro CONTINUE 1 ; reg
    add implq, SwsOpImpl.next
    jmp %1
    annotate_function_size
%endmacro

; Convenience macro to load and continue to the next kernel in one step
%macro CONTINUE 0
    LOAD_CONT tmp0q
    CONTINUE tmp0q
%endmacro

; Final macro ending the operation chain, used by WRITE kernels to jump back
; to the process function's return point. Very similar to CONTINUE, but skips
; incrementing the implq pointer, and also clears the upper halves of the AVX
; registers to avoid phantom dependencies between loop iterations.
%macro FINISH 1 ; reg
%if vzeroupper_required
    ; we may jump back into an SSE read, so always zero upper regs here
    vzeroupper
%endif
    jmp %1
    annotate_function_size
%endmacro

; Helper for inline conditionals, used to conditionally include single lines
%macro IF 2+ ; cond, body
%if %1
    %2
%endif
%endmacro

; Alternate name, for nested usage (to work around NASM limitations)
%macro IF1 2+
%if %1
    %2
%endif
%endmacro
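
; As a purely illustrative sketch (this kernel does not exist in the real
; operation set), a hypothetical "clear the active components" kernel,
; declared for all common component patterns and both V2 settings, would tie
; the above macros together like so:
;
;   %macro clear_kernel 0
;   op clear ; expands to cglobal p1000_clear_m1_sse4, etc.
;       IF X, pxor mx, mx
;       IF Y, pxor my, my
;       IF Z, pxor mz, mz
;       IF W, pxor mw, mw
;   %if V2 ; also clear the high halves when processing both
;       IF X, pxor mx2, mx2
;       IF Y, pxor my2, my2
;       IF Z, pxor mz2, mz2
;       IF W, pxor mw2, mw2
;   %endif
;       CONTINUE
;   %endmacro
;
;   INIT_XMM sse4
;   decl_common_patterns decl_v2 0, clear_kernel
;   decl_common_patterns decl_v2 1, clear_kernel
;   INIT_YMM avx2
;   decl_common_patterns decl_v2 0, clear_kernel
;   decl_common_patterns decl_v2 1, clear_kernel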