;*****************************************************************************
;* SSE2-optimized CAVS QPEL code
;*****************************************************************************
;* Copyright (c) 2006 Stefan Gehrer
;* based on H.264 optimizations by Michael Niedermayer and Loren Merritt
;* Copyright (c) 2025 Andreas Rheinhardt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_5
cextern pw_7
cextern pw_64

; CAVS quarter-pel filter coefficients without a shared libavutil constant
pw_42: times 8 dw 42
pw_96: times 8 dw 96

SECTION .text

; store helpers: op_puth just writes the low half of %1 to %2;
; op_avgh additionally averages it with the existing destination
; pixels (used by the avg variants), clobbering %3 as a temporary
%macro op_avgh 3
    movh       %3, %2
    pavgb      %1, %3
    movh       %2, %1
%endmacro

%macro op_puth 2-3
    movh       %2, %1
%endmacro

; horizontal half-pel filter:
; dst[x] = (5*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 4) >> 3
%macro CAVS_QPEL_H 1
; ff_put_cavs_qpel8_mc20(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc20, 3,4,6
    mov       r3d, 8
    jmp %1_cavs_qpel8_h_after_prologue

; ff_put_cavs_qpel8_h(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_h, 4,4,6
%1_cavs_qpel8_h_after_prologue:
    mova       m3, [pw_4]
    mova       m4, [pw_5]
    pxor       m5, m5
.loop:
    movh       m0, [r1]
    movh       m1, [r1+1]
    punpcklbw  m0, m5
    punpcklbw  m1, m5
    paddw      m0, m1
    movh       m1, [r1-1]
    movh       m2, [r1+2]
    pmullw     m0, m4
    punpcklbw  m1, m5
    punpcklbw  m2, m5
    paddw      m0, m3
    add        r1, r2
    paddw      m1, m2
    psubw      m0, m1
    psraw      m0, 3
    packuswb   m0, m5
    op_%1h     m0, [r0], m1
    add        r0, r2
    dec        r3d
    jne .loop
    RET
%endmacro

INIT_XMM sse2
CAVS_QPEL_H avg
CAVS_QPEL_H put

; one row of the vertical half-pel filter; m0..m2 hold the context rows,
; m3 is loaded fresh and the trailing SWAP slides the window:
; dst = (5*(m1 + m2) - (m0 + m3) + 4) >> 3
%macro FILT_V 1
    movh       m3, [r1]
    punpcklbw  m3, m7
    mova       m4, m1
    paddw      m4, m2
    paddw      m0, m3
    add        r1, r2
    pmullw     m4, m5
    psubw      m4, m0
    paddw      m4, m6
    psraw      m4, 3
    packuswb   m4, m7
    op_%1h     m4, [r0], m0
    add        r0, r2
    SWAP 0, 1, 2, 3
%endmacro

%macro CAVS_QPEL_MC02 1
; ff_put_cavs_qpel8_mc02(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc02, 3,4,8
    mov       r3d, 8
    jmp %1_cavs_qpel8_v2_after_prologue

; ff_put_cavs_qpel8_v2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_v2, 4,4,8
%1_cavs_qpel8_v2_after_prologue:
    movh       m1, [r1]
    sub        r1, r2
    movh       m0, [r1]
    lea        r1, [r1+2*r2]
    pxor       m7, m7
    movh       m2, [r1]
    add        r1, r2
    punpcklbw  m1, m7
    punpcklbw  m0, m7
    punpcklbw  m2, m7
    mova       m5, [pw_5]
    mova       m6, [pw_4]
.loop:
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    ; four rotations of the four-element SWAP cycle restore the
    ; original register names, so the loop body is self-consistent
    sub        r3d, 4
    jne .loop
    RET
%endmacro

INIT_XMM sse2
CAVS_QPEL_MC02 avg
CAVS_QPEL_MC02 put

; one row of the vertical quarter-pel filter; m0..m3 hold the context
; rows, m4 is loaded fresh:
; dst = (42*m1 + 96*m2 - 7*m0 - 2*m3 - m4 + 64) >> 7
%macro FILT_V3 1
    pmullw     m0, PW_7
    movh       m4, [r1]
    mova       m5, m1
    mova       m6, m2
    pmullw     m5, PW_42
    punpcklbw  m4, m7
    pmullw     m6, PW_96
    paddw      m0, m3
    add        r1, r2
    paddw      m0, m3
    paddw      m5, m6
    paddw      m0, m4
    ; m5-m0 can be in the range -10*255..(42+96)*255 and is therefore
    ; not guaranteed to fit into either a signed or an unsigned word.
    ; Because we need to clamp the result to 0..255 anyway, we use
    ; saturated subtraction and a logical right shift for rescaling.
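    ; E.g. for a flat block of 255s, m5 = (42+96)*255 = 35190, which
    ; already exceeds the signed word maximum of 32767, while
    ; m0 = (7+2+1)*255 = 2550; psubusw leaves 32640 = 128*255, and
    ; (32640 + 64) >> 7 = 255, as expected.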
    psubusw    m5, m0
    paddw      m5, PW_64
    psrlw      m5, 7
    packuswb   m5, m7
    op_%1h     m5, [r0], m0
    add        r0, r2
    SWAP 0, 1, 2, 3, 4
%endmacro

%macro CAVS_QPEL_MC03 1
; ff_put_cavs_qpel8_mc03(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
cglobal %1_cavs_qpel8_mc03, 3,4,8+4*ARCH_X86_64
    mov       r3d, 8
    jmp %1_cavs_qpel8_v3_after_prologue

; ff_put_cavs_qpel8_v3(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h)
cglobal %1_cavs_qpel8_v3, 4,4,8+4*ARCH_X86_64
%1_cavs_qpel8_v3_after_prologue:
    movh       m1, [r1]
    movh       m2, [r1+r2]
    movh       m3, [r1+2*r2]
    sub        r1, r2
    pxor       m7, m7
    movh       m0, [r1]
    lea        r1, [r1+4*r2]
    punpcklbw  m1, m7
    punpcklbw  m2, m7
%if ARCH_X86_64
; on x86-64, keep the four filter constants in registers
%define PW_7  m8
%define PW_42 m9
%define PW_96 m10
%define PW_64 m11
    mova       m8,  [pw_7]
    mova       m9,  [pw_42]
    mova       m10, [pw_96]
    mova       m11, [pw_64]
%else
; on x86-32 there are not enough XMM registers, so the constants
; are loaded from memory in every iteration
%define PW_7  [pw_7]
%define PW_42 [pw_42]
%define PW_96 [pw_96]
%define PW_64 [pw_64]
%endif
    punpcklbw  m3, m7
    punpcklbw  m0, m7
.loop:
    FILT_V3 %1
    FILT_V3 %1
    FILT_V3 %1
    FILT_V3 %1
    ; SWAP only renames registers at assembly time; the fifth rotation
    ; of the five-element cycle restores the original names, and the
    ; moves below put the four context rows back into the registers
    ; the loop entry expects
    SWAP 0, 1, 2, 3, 4
    mova       m3, m2
    mova       m2, m1
    mova       m1, m0
    mova       m0, m4
    sub        r3d, 4
    jne .loop
    RET
%endmacro

INIT_XMM sse2
CAVS_QPEL_MC03 avg
CAVS_QPEL_MC03 put