; /*
;  * Provide SIMD MC functions for VVC decoding
;  *
;  * Copyright © 2021, VideoLAN and dav1d authors
;  * Copyright © 2021, Two Orioles, LLC
;  * All rights reserved.
;  *
;  * Copyright (c) 2023-2024 Nuo Mi
;  * Copyright (c) 2023-2024 Wu Jianhua
;  *
;  * This file is part of FFmpeg.
;  *
;  * FFmpeg is free software; you can redistribute it and/or
;  * modify it under the terms of the GNU Lesser General Public
;  * License as published by the Free Software Foundation; either
;  * version 2.1 of the License, or (at your option) any later version.
;  *
;  * FFmpeg is distributed in the hope that it will be useful,
;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  * Lesser General Public License for more details.
;  *
;  * You should have received a copy of the GNU Lesser General Public
;  * License along with FFmpeg; if not, write to the Free Software
;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;  */

%include "libavutil/x86/x86util.asm"

%define MAX_PB_SIZE 128

SECTION_RODATA

%if ARCH_X86_64

%if HAVE_AVX2_EXTERNAL

%macro AVG_JMP_TABLE 4-* ; op, bpc, bit depth, cpu, widths...
    %xdefine %1_%2_%4_table (%%table - 2*%5)
    %xdefine %%base %1_%2_%4_table
    %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4)
    %%table:
    %rep %0 - 4
        dd %%prefix %+ .w%5 - %%base
        %rotate 1
    %endrep
%endmacro

AVG_JMP_TABLE   avg,    8,  8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE   avg,   16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE   w_avg,  8,  8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE   w_avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128

SECTION .text

%macro AVG_W16_FN 3 ; bpc, op, count
    %assign %%i 0
    %rep %3
        %define off %%i
        AVG_LOAD_W16        0, off
        %2                  %1, 16
        AVG_SAVE_W16        %1, 0, off

        AVG_LOAD_W16        1, off
        %2                  %1, 16
        AVG_SAVE_W16        %1, 1, off

        %assign %%i %%i+1
    %endrep
%endmacro

%macro AVG_FN 2-3 1 ; bpc, op, instantiate implementation
    jmp                     wq

%if %3
INIT_XMM cpuname
.w2:
    movd                   xm0, [src0q]
    pinsrd                 xm0, [src0q + AVG_SRC_STRIDE], 1
    movd                   xm1, [src1q]
    pinsrd                 xm1, [src1q + AVG_SRC_STRIDE], 1
    %2                      %1, 2
    AVG_SAVE_W2             %1
    AVG_LOOP_END            .w2

.w4:
    movq                   xm0, [src0q]
    pinsrq                 xm0, [src0q + AVG_SRC_STRIDE], 1
    movq                   xm1, [src1q]
    pinsrq                 xm1, [src1q + AVG_SRC_STRIDE], 1
    %2                      %1, 4
    AVG_SAVE_W4             %1
    AVG_LOOP_END            .w4

INIT_YMM cpuname
.w8:
    movu                   xm0, [src0q]
    movu                   xm1, [src1q]
    vinserti128             m0, m0, [src0q + AVG_SRC_STRIDE], 1
    vinserti128             m1, m1, [src1q + AVG_SRC_STRIDE], 1
    %2                      %1, 8
    AVG_SAVE_W8             %1
    AVG_LOOP_END            .w8

.w16:
    AVG_W16_FN              %1, %2, 1
    AVG_LOOP_END            .w16

.w32:
    AVG_W16_FN              %1, %2, 2
    AVG_LOOP_END            .w32

.w64:
    AVG_W16_FN              %1, %2, 4
    AVG_LOOP_END            .w64

.w128:
    AVG_W16_FN              %1, %2, 8
    AVG_LOOP_END            .w128

.ret:
    RET
%endif
%endmacro

%macro AVG 2 ; bpc, width
    paddsw                  m0, m1
    pmulhrsw                m0, m2
%if %1 != 8
    CLIPW                   m0, m3, m4
%endif
%endmacro

%macro W_AVG 2 ; bpc, width
%if %2 > 2
    punpckhwd               m5, m0, m1
    pmaddwd                 m5, m3
    paddd                   m5, m4
    psrad                   m5, xm2
%endif
    punpcklwd               m0, m0, m1
    pmaddwd                 m0, m3
    paddd                   m0, m4
    psrad                   m0, xm2
%if %2 == 2
    packssdw                m0, m0
%else
    packssdw                m0, m5
%endif
%if %1 != 8
    CLIPW                   m0, m6, m7
%endif
%endmacro

%macro AVG_LOAD_W16 2 ; line, offset
    movu                    m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32]
    movu                    m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
%endmacro

%macro AVG_SAVE_W2 1 ; bpc
%if %1 == 16
    movd               [dstq], xm0
    pextrd   [dstq + strideq], xm0, 1
%else
    packuswb                m0, m0
    pextrw             [dstq], xm0, 0
    pextrw   [dstq + strideq], xm0, 1
%endif
%endmacro

%macro AVG_SAVE_W4 1 ; bpc
%if %1 == 16
    movq               [dstq], xm0
    pextrq   [dstq + strideq], xm0, 1
%else
    packuswb                m0, m0
    movd               [dstq], xm0
    pextrd   [dstq + strideq], xm0, 1
%endif
%endmacro

%macro AVG_SAVE_W8 1 ; bpc
%if %1 == 16
    movu               [dstq], xm0
    vextracti128 [dstq + strideq], m0, 1
%else
    packuswb                m0, m0
    vpermq                  m0, m0, 1000b
    movq               [dstq], xm0
    pextrq   [dstq + strideq], xm0, 1
%endif
%endmacro

%macro AVG_SAVE_W16 3 ; bpc, line, offset
%if %1 == 16
    movu  [dstq + %2 * strideq + %3 * 32], m0
%else
    packuswb                m0, m0
    vpermq                  m0, m0, 1000b
    movu  [dstq + %2 * strideq + %3 * 16], xm0
%endif
%endmacro

%macro AVG_LOOP_END 1 ; label to loop back to
    sub                     hd, 2
    je                      .ret

    lea                  src0q, [src0q + 2 * AVG_SRC_STRIDE]
    lea                  src1q, [src1q + 2 * AVG_SRC_STRIDE]
    lea                   dstq, [dstq + 2 * strideq]
    jmp                     %1
%endmacro

%define AVG_SRC_STRIDE MAX_PB_SIZE*2

;void ff_vvc_avg_%2_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0,
;    const int16_t *src1, int width, int height);
%macro VVC_AVG_AVX2 3 ; bpc, bit depth, instantiate implementation
cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
    movifnidn               hd, hm
    pcmpeqw                 m2, m2
%if %1 != 8
    pxor                    m3, m3              ; pixel min
%endif
    lea                     r6, [avg_%1 %+ SUFFIX %+ _table]
    tzcnt                   wd, wm
    movsxd                  wq, dword [r6+wq*4]
    psrlw                   m4, m2, 16-%2       ; pixel max
    psubw                   m2, m4, m2          ; 1 << bpp
    add                     wq, r6
    AVG_FN                  %1, AVG, %3
%endmacro

;void ff_vvc_w_avg_%2_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;    const int16_t *src0, const int16_t *src1, int width, int height,
;    int denom, intptr_t w0, int w1, int o);
%macro VVC_W_AVG_AVX2 3 ; bpc, bit depth, instantiate implementation
cglobal vvc_w_avg_%2, 4, 7+2*UNIX64, 6+2*(%1 != 8), dst, stride, src0, src1, w, h
%if UNIX64
    ; r6-r8 are volatile and not used for parameter passing
    DECLARE_REG_TMP 6, 7, 8
%else ; Win64
    ; r4-r6 are volatile and not used for parameter passing
    DECLARE_REG_TMP 4, 5, 6
%endif
    mov                    t1d, r6m             ; denom
    mov                    t0d, r9m             ; o0 + o1
    movifnidn              t2d, r8m             ; w1
    add                    t1d, 15-%2
%if %2 != 8
    shl                    t0d, %2 - 8
%endif
    movd                   xm2, t1d             ; shift
    inc                    t0d                  ; ((o0 + o1) << (BIT_DEPTH - 8)) + 1
    shl                    t2d, 16
    movd                   xm4, t0d
    mov                    t2w, r7m             ; w0
    movd                   xm3, t2d
    vpbroadcastd            m3, xm3             ; w0, w1
%if %1 != 8
    pcmpeqw                 m7, m7
    pxor                    m6, m6              ; pixel min
    psrlw                   m7, 16-%2           ; pixel max
%endif
    lea                     r6, [w_avg_%1 %+ SUFFIX %+ _table]
    tzcnt                   wd, wm
    movsxd                  wq, dword [r6+wq*4]
    pslld                  xm4, xm2
    psrad                  xm4, 1
    vpbroadcastd            m4, xm4             ; offset
    movifnidn               hd, hm
    add                     wq, r6
    AVG_FN                  %1, W_AVG, %3
%endmacro

INIT_YMM avx2

VVC_AVG_AVX2    16, 12, 0
VVC_W_AVG_AVX2  16, 12, 0

VVC_AVG_AVX2    16, 10, 1
VVC_W_AVG_AVX2  16, 10, 1

VVC_AVG_AVX2     8,  8, 1
VVC_W_AVG_AVX2   8,  8, 1

%endif ; HAVE_AVX2_EXTERNAL
%endif ; ARCH_X86_64
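
; The following is a rough scalar sketch of what the two kernels above compute,
; derived from the instructions in this file rather than from FFmpeg's C
; reference templates, and kept as comments so the file still assembles.
; Here bd is the output bit depth, and o is the o0 + o1 sum passed as the last
; w_avg argument; the SIMD code uses saturating adds/packs, so extreme
; intermediate values clamp instead of wrapping.
;
;   // avg (paddsw + pmulhrsw by 1 << bd, then clip/pack):
;   //     dst[x] = clip((src0[x] + src1[x] + (1 << (14 - bd))) >> (15 - bd), 0, (1 << bd) - 1);
;
;   // w_avg (pmaddwd by [w0, w1], add offset, shift, then clip/pack):
;   //     shift  = denom + 15 - bd;
;   //     offset = (((o << (bd - 8)) + 1) << shift) >> 1;
;   //     dst[x] = clip((w0 * src0[x] + w1 * src1[x] + offset) >> shift, 0, (1 << bd) - 1);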