/*
 * Optimized for ia32 CPUs by Nick Kurshev
 * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mpegvideodata.h"
#include "libavcodec/mpegvideo_unquantize.h"

#if HAVE_SSE2_INLINE

// Broadcast the low 16-bit word of an XMM register to all eight word lanes:
// punpcklwd reg,reg duplicates the low words, pshufd $0 then replicates the
// low dword across the whole register.
#define SPLATW(reg) "punpcklwd %%" #reg ", %%" #reg "\n\t" \
    "pshufd $0, %%" #reg ", %%" #reg "\n\t"

#if HAVE_SSSE3_INLINE
/**
 * H.263 intra dequantizer (SSSE3): block[i] = block[i] * qmul + sgn(block[i]) * qadd.
 * The DC coefficient (block[0]) is scaled separately with y/c_dc_scale unless
 * AIC is in use; the SIMD loop below also rewrites block[0], so the
 * precomputed DC value is stored back afterwards.
 */
static void dct_unquantize_h263_intra_ssse3(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
{
    x86_reg qmul = (unsigned)qscale << 1;
    int level, qadd;

    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

    if (!s->h263_aic) {
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1;   // forced odd, per the H.263 dequant formula
    }else{
        qadd = 0;                  // with AIC the DC is left as-is
        level= block[0];
    }

    // Byte offset of the last coefficient to process (raster order); with
    // AC prediction all 64 coefficients may be meaningful, so use the maximum.
    x86_reg offset = s->ac_pred ? 63 << 1
                                : s->intra_scantable.raster_end[s->block_last_index[n]] << 1;

    // %1 is repointed at block+offset and %0 negated, so (%1,%0) walks the
    // block from the start; 16 coefficients (two XMM loads) per iteration,
    // looping while the index is <= 0 ("jng").
    __asm__ volatile(
        "movd %k1, %%xmm0 \n\t" //qmul
        "lea (%2, %0), %1 \n\t"
        "neg %0 \n\t"
        "movd %3, %%xmm1 \n\t" //qadd
        SPLATW(xmm0)
        SPLATW(xmm1)
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqa (%1, %0), %%xmm2 \n\t"
        "movdqa 16(%1, %0), %%xmm3 \n\t"
        "movdqa %%xmm1, %%xmm4 \n\t"
        "movdqa %%xmm1, %%xmm5 \n\t"
        "psignw %%xmm2, %%xmm4 \n\t" // sgn(block[i])*qadd
        "psignw %%xmm3, %%xmm5 \n\t" // sgn(block[i])*qadd
        "pmullw %%xmm0, %%xmm2 \n\t"
        "pmullw %%xmm0, %%xmm3 \n\t"
        "paddw %%xmm4, %%xmm2 \n\t"
        "paddw %%xmm5, %%xmm3 \n\t"
        "movdqa %%xmm2, (%1, %0) \n\t"
        "movdqa %%xmm3, 16(%1, %0) \n\t"
        "add $32, %0 \n\t"
        "jng 1b \n\t"
        : "+r"(offset), "+r"(qmul)
        : "r" (block), "rm" (qadd)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",)
          "memory"
    );
    // Restore the separately computed DC value clobbered by the loop above.
    block[0]= level;
}

/**
 * H.263 inter dequantizer (SSSE3): block[i] = block[i] * qmul + sgn(block[i]) * qadd.
 * Same SIMD kernel as the intra variant, but without DC special-casing.
 */
static void dct_unquantize_h263_inter_ssse3(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
{
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;

    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

    // Byte offset of the last coefficient, derived from the inter scantable.
    x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 1;

    __asm__ volatile(
        "movd %2, %%xmm0 \n\t" //qmul
        "movd %3, %%xmm1 \n\t" //qadd
        "add %1, %0 \n\t"
        "neg %1 \n\t"
        SPLATW(xmm0)
        SPLATW(xmm1)
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqa (%0, %1), %%xmm2 \n\t"
        "movdqa 16(%0, %1), %%xmm3 \n\t"
        "movdqa %%xmm1, %%xmm4 \n\t"
        "movdqa %%xmm1, %%xmm5 \n\t"
        "psignw %%xmm2, %%xmm4 \n\t" // sgn(block[i])*qadd
        "psignw %%xmm3, %%xmm5 \n\t" // sgn(block[i])*qadd
        "pmullw %%xmm0, %%xmm2 \n\t"
        "pmullw %%xmm0, %%xmm3 \n\t"
        "paddw %%xmm4, %%xmm2 \n\t"
        "paddw %%xmm5, %%xmm3 \n\t"
        "movdqa %%xmm2, (%0, %1) \n\t"
        "movdqa %%xmm3, 16(%0, %1) \n\t"
        "add $32, %1 \n\t"
        "jng 1b \n\t"
        : "+r" (block), "+r" (offset)
        : "rm"(qmul), "rm" (qadd)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",)
          "memory"
    );
}

/**
 * MPEG-1 intra dequantizer (SSSE3):
 * block[i] = sgn(block[i]) * (((abs(block[i]) * qscale * quant_matrix[i]) >> 3 - 1) | 1)
 * i.e. the magnitude is scaled, shifted, then forced odd (the psubw/por pair
 * with xmm7 == 1 computes (v - 1) | 1). DC is scaled with y/c_dc_scale and
 * written back after the loop.
 */
static void dct_unquantize_mpeg1_intra_ssse3(const MPVContext *s,
                                             int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n]>=0);

    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    /* XXX: only MPEG-1 */
    quant_matrix = s->intra_matrix;

    // Negative byte offset; the loop runs it up towards zero ("js").
    x86_reg offset = -2 * nCoeffs;

    __asm__ volatile(
        "movd %3, %%xmm6 \n\t"
        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "psrlw $15, %%xmm7 \n\t"                // xmm7 = eight words of 1
        SPLATW(xmm6)
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqa (%2, %0), %%xmm4 \n\t"
        "movdqa 16(%2, %0), %%xmm5 \n\t"
        "movdqa (%1, %0), %%xmm0 \n\t"
        "movdqa 16(%1, %0), %%xmm1 \n\t"
        "pmullw %%xmm6, %%xmm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%xmm6, %%xmm5 \n\t" // q=qscale*quant_matrix[i]
        "pabsw %%xmm0, %%xmm2 \n\t" // abs(block[i])
        "pabsw %%xmm1, %%xmm3 \n\t" // abs(block[i])
        "pmullw %%xmm4, %%xmm2 \n\t" // abs(block[i])*q
        "pmullw %%xmm5, %%xmm3 \n\t" // abs(block[i])*q
        "psraw $3, %%xmm2 \n\t"
        "psraw $3, %%xmm3 \n\t"
        "psubw %%xmm7, %%xmm2 \n\t"             // (v - 1) ...
        "psubw %%xmm7, %%xmm3 \n\t"
        "por %%xmm7, %%xmm2 \n\t"               // ... | 1: force odd
        "por %%xmm7, %%xmm3 \n\t"
        "psignw %%xmm0, %%xmm2 \n\t"            // reapply original sign
        "psignw %%xmm1, %%xmm3 \n\t"
        "movdqa %%xmm2, (%1, %0) \n\t"
        "movdqa %%xmm3, 16(%1, %0) \n\t"
        "add $32, %0 \n\t"
        "js 1b \n\t"
        : "+r" (offset)
        : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
          "memory"
    );
    // Restore the separately computed DC value clobbered by the loop above.
    block[0]= block0;
}

/**
 * MPEG-1 inter dequantizer (SSSE3):
 * block[i] = sgn(block[i]) * ((((abs(block[i])*2 + 1) * qscale * quant_matrix[i]) >> 4 - 1) | 1)
 * NOTE(review): the scan end comes from intra_scantable here, matching the
 * upstream C reference — intra and inter use the same raster-order table.
 */
static void dct_unquantize_mpeg1_inter_ssse3(const MPVContext *s,
                                             int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n]>=0);

    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

    quant_matrix = s->inter_matrix;

    // Negative byte offset; the loop runs it up towards zero ("js").
    x86_reg offset = -2 * nCoeffs;

    __asm__ volatile(
        "movd %3, %%xmm6 \n\t"
        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "psrlw $15, %%xmm7 \n\t"                // xmm7 = eight words of 1
        SPLATW(xmm6)
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqa (%2, %0), %%xmm4 \n\t"
        "movdqa 16(%2, %0), %%xmm5 \n\t"
        "movdqa (%1, %0), %%xmm0 \n\t"
        "movdqa 16(%1, %0), %%xmm1 \n\t"
        "pmullw %%xmm6, %%xmm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%xmm6, %%xmm5 \n\t" // q=qscale*quant_matrix[i]
        "pabsw %%xmm0, %%xmm2 \n\t" // abs(block[i])
        "pabsw %%xmm1, %%xmm3 \n\t" // abs(block[i])
        "paddw %%xmm2, %%xmm2 \n\t" // abs(block[i])*2
        "paddw %%xmm3, %%xmm3 \n\t" // abs(block[i])*2
        "paddw %%xmm7, %%xmm2 \n\t" // abs(block[i])*2 + 1
        "paddw %%xmm7, %%xmm3 \n\t" // abs(block[i])*2 + 1
        "pmullw %%xmm4, %%xmm2 \n\t" // (abs(block[i])*2 + 1)*q
        "pmullw %%xmm5, %%xmm3 \n\t" // (abs(block[i])*2 + 1)*q
        "psraw $4, %%xmm2 \n\t"
        "psraw $4, %%xmm3 \n\t"
        "psubw %%xmm7, %%xmm2 \n\t"             // (v - 1) ...
        "psubw %%xmm7, %%xmm3 \n\t"
        "por %%xmm7, %%xmm2 \n\t"               // ... | 1: force odd
        "por %%xmm7, %%xmm3 \n\t"
        "psignw %%xmm0, %%xmm2 \n\t"            // reapply original sign
        "psignw %%xmm1, %%xmm3 \n\t"
        "movdqa %%xmm2, (%1, %0) \n\t"
        "movdqa %%xmm3, 16(%1, %0) \n\t"
        "add $32, %0 \n\t"
        "js 1b \n\t"
        : "+r" (offset)
        : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
          "memory"
    );
}
#endif /* HAVE_SSSE3_INLINE */

/**
 * MPEG-2 intra dequantizer (SSE2): block[i] = (block[i] * qscale * quant_matrix[i]) / 16
 * with rounding towards zero (the psrlw $12 trick adds 0xf to negative
 * values before the arithmetic right shift). qscale is first mapped through
 * the non-linear table or doubled, depending on q_scale_type.
 */
static void dct_unquantize_mpeg2_intra_sse2(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n]>=0);

    if (s->q_scale_type)
        qscale = ff_mpeg2_non_linear_qscale[qscale];
    else
        qscale <<= 1;

    // NOTE: no "+1" here, unlike the MPEG-1 paths; the loop below uses "jng"
    // (<= 0), so the coefficient at raster_end is still processed.
    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    quant_matrix = s->intra_matrix;

    x86_reg offset = -2 * nCoeffs;

    __asm__ volatile(
        "movd %3, %%xmm6 \n\t"
        SPLATW(xmm6)
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqa (%1, %0), %%xmm0 \n\t"
        "movdqa 16(%1, %0), %%xmm1 \n\t"
        "movdqa (%2, %0), %%xmm4 \n\t"
        "movdqa 16(%2, %0), %%xmm5 \n\t"
        "pmullw %%xmm6, %%xmm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%xmm6, %%xmm5 \n\t" // q=qscale*quant_matrix[i]
        "movdqa %%xmm0, %%xmm2 \n\t"
        "movdqa %%xmm1, %%xmm3 \n\t"
        "psrlw $12, %%xmm2 \n\t" // block[i] < 0 ? 0xf : 0
        "psrlw $12, %%xmm3 \n\t" // (block[i] is in the -2048..2047 range)
        "pmullw %%xmm4, %%xmm0 \n\t" // block[i]*q
        "pmullw %%xmm5, %%xmm1 \n\t" // block[i]*q
        "paddw %%xmm2, %%xmm0 \n\t" // bias negative block[i]
        "paddw %%xmm3, %%xmm1 \n\t" // so that a right-shift
        "psraw $4, %%xmm0 \n\t" // is equivalent to divide
        "psraw $4, %%xmm1 \n\t" // with rounding towards zero
        "movdqa %%xmm0, (%1, %0) \n\t"
        "movdqa %%xmm1, 16(%1, %0) \n\t"
        "add $32, %0 \n\t"
        "jng 1b \n\t"
        : "+r" (offset)
        : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",)
          "memory"
    );
    // Restore the separately computed DC value clobbered by the loop above.
    block[0]= block0;
    //Note, we do not do mismatch control for intra as errors cannot accumulate
}

#if HAVE_SSSE3_INLINE
/**
 * MPEG-2 inter dequantizer (SSSE3):
 * block[i] = sgn(block[i]) * (((abs(block[i])*2 + 1) * qscale * quant_matrix[i]) >> 5)
 * followed by MPEG-2 mismatch control: xmm7 accumulates the XOR of all output
 * words (seeded with a 1 in its low word by pcmpeqw/psrldq $14); the tail
 * after the loop folds that down to a single parity bit, shifts it into the
 * LSB position of the last coefficient's word (bit 16 of the dword loaded
 * from byte offset 124, i.e. block[63]) and XORs it in, toggling block[63]'s
 * LSB when the coefficient sum is even.
 */
static void dct_unquantize_mpeg2_inter_ssse3(const MPVContext *s,
                                             int16_t *block, int n, int qscale)
{
    av_assert2(s->block_last_index[n]>=0);

    x86_reg qscale2 = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale]
                                      : (unsigned)qscale << 1;
    x86_reg offset = s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
    // Pre-biased matrix pointer so one negated index serves both arrays.
    const void *quant_matrix = (const char*)s->inter_matrix + offset;

    __asm__ volatile(
        "movd %k1, %%xmm6 \n\t"
        "lea (%2, %0), %1 \n\t"
        "neg %0 \n\t"
        SPLATW(xmm6)
        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "psrldq $14, %%xmm7 \n\t"               // parity accumulator, low word = 0xffff
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqa (%3, %0), %%xmm4 \n\t"
        "movdqa 16(%3, %0), %%xmm5 \n\t"
        "movdqa (%1, %0), %%xmm0 \n\t"
        "movdqa 16(%1, %0), %%xmm1 \n\t"
        "pmullw %%xmm6, %%xmm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%xmm6, %%xmm5 \n\t" // q=qscale*quant_matrix[i]
        "pabsw %%xmm0, %%xmm2 \n\t" // abs(block[i])
        "pabsw %%xmm1, %%xmm3 \n\t" // abs(block[i])
        "paddw %%xmm2, %%xmm2 \n\t" // abs(block[i])*2
        "paddw %%xmm3, %%xmm3 \n\t" // abs(block[i])*2
        "pmullw %%xmm4, %%xmm2 \n\t" // abs(block[i])*2*q
        "pmullw %%xmm5, %%xmm3 \n\t" // abs(block[i])*2*q
        "paddw %%xmm4, %%xmm2 \n\t" // (abs(block[i])*2 + 1)*q
        "paddw %%xmm5, %%xmm3 \n\t" // (abs(block[i])*2 + 1)*q
        "psrlw $5, %%xmm2 \n\t"
        "psrlw $5, %%xmm3 \n\t"
        "psignw %%xmm0, %%xmm2 \n\t"            // reapply original sign
        "psignw %%xmm1, %%xmm3 \n\t"
        "movdqa %%xmm2, (%1, %0) \n\t"
        "movdqa %%xmm3, 16(%1, %0) \n\t"
        "pxor %%xmm2, %%xmm7 \n\t"              // accumulate parity of outputs
        "pxor %%xmm3, %%xmm7 \n\t"
        "add $32, %0 \n\t"
        "jng 1b \n\t"
        // Mismatch control: horizontal XOR-reduce xmm7, isolate the parity
        // bit, move it into block[63]'s LSB and toggle it.
        "movd 124(%2), %%xmm0 \n\t"
        "movhlps %%xmm7, %%xmm6 \n\t"
        "pxor %%xmm6, %%xmm7 \n\t"
        "pshufd $1, %%xmm7, %%xmm6 \n\t"
        "pxor %%xmm6, %%xmm7 \n\t"
        "pshuflw $1, %%xmm7, %%xmm6 \n\t"
        "pxor %%xmm6, %%xmm7 \n\t"
        "pslld $31, %%xmm7 \n\t"
        "psrld $15, %%xmm7 \n\t"
        "pxor %%xmm7, %%xmm0 \n\t"
        "movd %%xmm0, 124(%2) \n\t"
        : "+r"(offset), "+r" (qscale2)
        : "r" (block), "r"(quant_matrix)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
          "memory"
    );
}
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_SSE2_INLINE */

/**
 * Install the x86 SIMD dequantizers into the MPVUnquantDSPContext, gated on
 * runtime CPU flags. The SSE2 MPEG-2 intra routine is only installed when
 * bit-exact output is not required.
 */
av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
{
#if HAVE_SSE2_INLINE
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_SSE2(cpu_flags)) {
        if (!bitexact)
            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_sse2;
    }
#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        s->dct_unquantize_h263_intra  = dct_unquantize_h263_intra_ssse3;
        s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
    }
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_SSE2_INLINE */
}