/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "hpeldsp_lasx.h"

static av_always_inline void
put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                     int dst_stride, int src_stride1, int src_stride2, int h)
{
    int stride1_2, stride1_3, stride1_4;
    int stride2_2, stride2_3, stride2_4;
    __asm__ volatile (
        "slli.d     %[stride1_2],  %[srcStride1],  1             \n\t"
        "slli.d     %[stride2_2],  %[srcStride2],  1             \n\t"
        "add.d      %[stride1_3],  %[stride1_2],   %[srcStride1] \n\t"
        "add.d      %[stride2_3],  %[stride2_2],   %[srcStride2] \n\t"
        "slli.d     %[stride1_4],  %[stride1_2],   1             \n\t"
        "slli.d     %[stride2_4],  %[stride2_2],   1             \n\t"
        "1:                                                      \n\t"
        "vld        $vr0,          %[src1],        0             \n\t"
        "vldx       $vr1,          %[src1],        %[srcStride1] \n\t"
        "vldx       $vr2,          %[src1],        %[stride1_2]  \n\t"
        "vldx       $vr3,          %[src1],        %[stride1_3]  \n\t"
        "add.d      %[src1],       %[src1],        %[stride1_4]  \n\t"

        "vld        $vr4,          %[src2],        0             \n\t"
        "vldx       $vr5,          %[src2],        %[srcStride2] \n\t"
        "vldx       $vr6,          %[src2],        %[stride2_2]  \n\t"
        "vldx       $vr7,          %[src2],        %[stride2_3]  \n\t"
        "add.d      %[src2],       %[src2],        %[stride2_4]  \n\t"

        "addi.d     %[h],          %[h],           -4            \n\t"

        "vavgr.bu   $vr0,          $vr4,           $vr0          \n\t"
        "vavgr.bu   $vr1,          $vr5,           $vr1          \n\t"
        "vavgr.bu   $vr2,          $vr6,           $vr2          \n\t"
        "vavgr.bu   $vr3,          $vr7,           $vr3          \n\t"

        "vstelm.d   $vr0,          %[dst],         0,  0         \n\t"
        "add.d      %[dst],        %[dst],         %[dstStride]  \n\t"
        "vstelm.d   $vr1,          %[dst],         0,  0         \n\t"
        "add.d      %[dst],        %[dst],         %[dstStride]  \n\t"
        "vstelm.d   $vr2,          %[dst],         0,  0         \n\t"
        "add.d      %[dst],        %[dst],         %[dstStride]  \n\t"
        "vstelm.d   $vr3,          %[dst],         0,  0         \n\t"
        "add.d      %[dst],        %[dst],         %[dstStride]  \n\t"
        "bnez       %[h],          1b                            \n\t"
        : [dst]"+&r"(dst),               [src2]"+&r"(src2),
          [src1]"+&r"(src1),             [h]"+&r"(h),
          [stride1_2]"=&r"(stride1_2),   [stride1_3]"=&r"(stride1_3),
          [stride1_4]"=&r"(stride1_4),   [stride2_2]"=&r"(stride2_2),
          [stride2_3]"=&r"(stride2_3),   [stride2_4]"=&r"(stride2_4)
        : [dstStride]"r"(dst_stride),    [srcStride1]"r"(src_stride1),
          [srcStride2]"r"(src_stride2)
        : "memory"
    );
}
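/*
 * Both l2 helpers compute a rounded byte-wise average of two sources, as
 * vavgr.bu evaluates (a + b + 1) >> 1.  An illustrative scalar sketch of
 * the 8-wide case (not part of the build):
 *
 *     for (int y = 0; y < h; y++) {
 *         for (int x = 0; x < 8; x++)
 *             dst[x] = (src1[x] + src2[x] + 1) >> 1;
 *         dst  += dst_stride;
 *         src1 += src_stride1;
 *         src2 += src_stride2;
 *     }
 */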
static av_always_inline void
put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                      int dst_stride, int src_stride1, int src_stride2, int h)
{
    int stride1_2, stride1_3, stride1_4;
    int stride2_2, stride2_3, stride2_4;
    int dststride2, dststride3, dststride4;
    __asm__ volatile (
        "slli.d     %[stride1_2],  %[srcStride1],  1             \n\t"
        "slli.d     %[stride2_2],  %[srcStride2],  1             \n\t"
        "slli.d     %[dststride2], %[dstStride],   1             \n\t"
        "add.d      %[stride1_3],  %[stride1_2],   %[srcStride1] \n\t"
        "add.d      %[stride2_3],  %[stride2_2],   %[srcStride2] \n\t"
        "add.d      %[dststride3], %[dststride2],  %[dstStride]  \n\t"
        "slli.d     %[stride1_4],  %[stride1_2],   1             \n\t"
        "slli.d     %[stride2_4],  %[stride2_2],   1             \n\t"
        "slli.d     %[dststride4], %[dststride2],  1             \n\t"
        "1:                                                      \n\t"
        "vld        $vr0,          %[src1],        0             \n\t"
        "vldx       $vr1,          %[src1],        %[srcStride1] \n\t"
        "vldx       $vr2,          %[src1],        %[stride1_2]  \n\t"
        "vldx       $vr3,          %[src1],        %[stride1_3]  \n\t"
        "add.d      %[src1],       %[src1],        %[stride1_4]  \n\t"

        "vld        $vr4,          %[src2],        0             \n\t"
        "vldx       $vr5,          %[src2],        %[srcStride2] \n\t"
        "vldx       $vr6,          %[src2],        %[stride2_2]  \n\t"
        "vldx       $vr7,          %[src2],        %[stride2_3]  \n\t"
        "add.d      %[src2],       %[src2],        %[stride2_4]  \n\t"

        "addi.d     %[h],          %[h],           -4            \n\t"

        "vavgr.bu   $vr0,          $vr4,           $vr0          \n\t"
        "vavgr.bu   $vr1,          $vr5,           $vr1          \n\t"
        "vavgr.bu   $vr2,          $vr6,           $vr2          \n\t"
        "vavgr.bu   $vr3,          $vr7,           $vr3          \n\t"

        "vst        $vr0,          %[dst],         0             \n\t"
        "vstx       $vr1,          %[dst],         %[dstStride]  \n\t"
        "vstx       $vr2,          %[dst],         %[dststride2] \n\t"
        "vstx       $vr3,          %[dst],         %[dststride3] \n\t"
        "add.d      %[dst],        %[dst],         %[dststride4] \n\t"
        "bnez       %[h],          1b                            \n\t"
        : [dst]"+&r"(dst),               [src2]"+&r"(src2),
          [src1]"+&r"(src1),             [h]"+&r"(h),
          [stride1_2]"=&r"(stride1_2),   [stride1_3]"=&r"(stride1_3),
          [stride1_4]"=&r"(stride1_4),   [stride2_2]"=&r"(stride2_2),
          [stride2_3]"=&r"(stride2_3),   [stride2_4]"=&r"(stride2_4),
          [dststride2]"=&r"(dststride2), [dststride3]"=&r"(dststride3),
          [dststride4]"=&r"(dststride4)
        : [dstStride]"r"(dst_stride),    [srcStride1]"r"(src_stride1),
          [srcStride2]"r"(src_stride2)
        : "memory"
    );
}

void ff_put_pixels8_8_lasx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    uint64_t tmp[8];
    int h_8 = h >> 3;
    int res = h & 7;
    ptrdiff_t stride2, stride3, stride4;
    __asm__ volatile (
        "beqz       %[h_8],        2f                            \n\t"
        "slli.d     %[stride2],    %[stride],      1             \n\t"
        "add.d      %[stride3],    %[stride2],     %[stride]     \n\t"
        "slli.d     %[stride4],    %[stride2],     1             \n\t"
        "1:                                                      \n\t"
        "ld.d       %[tmp0],       %[src],         0x0           \n\t"
        "ldx.d      %[tmp1],       %[src],         %[stride]     \n\t"
        "ldx.d      %[tmp2],       %[src],         %[stride2]    \n\t"
        "ldx.d      %[tmp3],       %[src],         %[stride3]    \n\t"
        "add.d      %[src],        %[src],         %[stride4]    \n\t"
        "ld.d       %[tmp4],       %[src],         0x0           \n\t"
        "ldx.d      %[tmp5],       %[src],         %[stride]     \n\t"
        "ldx.d      %[tmp6],       %[src],         %[stride2]    \n\t"
        "ldx.d      %[tmp7],       %[src],         %[stride3]    \n\t"
        "add.d      %[src],        %[src],         %[stride4]    \n\t"

        "addi.d     %[h_8],        %[h_8],         -1            \n\t"

        "st.d       %[tmp0],       %[dst],         0x0           \n\t"
        "stx.d      %[tmp1],       %[dst],         %[stride]     \n\t"
        "stx.d      %[tmp2],       %[dst],         %[stride2]    \n\t"
        "stx.d      %[tmp3],       %[dst],         %[stride3]    \n\t"
        "add.d      %[dst],        %[dst],         %[stride4]    \n\t"
        "st.d       %[tmp4],       %[dst],         0x0           \n\t"
        "stx.d      %[tmp5],       %[dst],         %[stride]     \n\t"
        "stx.d      %[tmp6],       %[dst],         %[stride2]    \n\t"
        "stx.d      %[tmp7],       %[dst],         %[stride3]    \n\t"
        "add.d      %[dst],        %[dst],         %[stride4]    \n\t"
        "bnez       %[h_8],        1b                            \n\t"

        "2:                                                      \n\t"
        "beqz       %[res],        4f                            \n\t"
        "3:                                                      \n\t"
        "ld.d       %[tmp0],       %[src],         0x0           \n\t"
        "add.d      %[src],        %[src],         %[stride]     \n\t"
        "addi.d     %[res],        %[res],         -1            \n\t"
        "st.d       %[tmp0],       %[dst],         0x0           \n\t"
        "add.d      %[dst],        %[dst],         %[stride]     \n\t"
        "bnez       %[res],        3b                            \n\t"
        "4:                                                      \n\t"
        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
          [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
          [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
          [dst]"+&r"(block),          [src]"+&r"(pixels),
          [h_8]"+&r"(h_8),            [res]"+&r"(res),
          [stride2]"=&r"(stride2),    [stride3]"=&r"(stride3),
          [stride4]"=&r"(stride4)
        : [stride]"r"(line_size)
        : "memory"
    );
}
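/*
 * ff_put_pixels8_8_lasx above is a plain copy: each 8-pixel row moves as
 * one 64-bit scalar load/store, eight rows per main-loop iteration, with a
 * one-row tail loop for h % 8.  Scalar equivalent (illustrative sketch):
 *
 *     for (int y = 0; y < h; y++, block += line_size, pixels += line_size)
 *         memcpy(block, pixels, 8);
 */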
/**
 * For width 16, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_pixels16_8_lsx(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    int h_4 = h >> 2;
    ptrdiff_t stride2 = line_size << 1;
    ptrdiff_t stride3 = stride2 + line_size;
    ptrdiff_t stride4 = line_size << 2;
    __m128i src0, src1, src2, src3;

    for (int i = 0; i < h_4; i++) {
        src0 = __lsx_vld(pixels, 0);
        src1 = __lsx_vldx(pixels, line_size);
        src2 = __lsx_vldx(pixels, stride2);
        src3 = __lsx_vldx(pixels, stride3);
        __lsx_vst(src0, block, 0);
        __lsx_vstx(src1, block, line_size);
        __lsx_vstx(src2, block, stride2);
        __lsx_vstx(src3, block, stride3);
        pixels += stride4;
        block  += stride4;
    }
}

void ff_put_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    put_pixels8_l2_8_lsx(block, pixels, pixels + 1, line_size,
                         line_size, line_size, h);
}

void ff_put_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    put_pixels8_l2_8_lsx(block, pixels, pixels + line_size, line_size,
                         line_size, line_size, h);
}

void ff_put_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    put_pixels16_l2_8_lsx(block, pixels, pixels + 1, line_size,
                          line_size, line_size, h);
}

void ff_put_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    put_pixels16_l2_8_lsx(block, pixels, pixels + line_size, line_size,
                          line_size, line_size, h);
}

/**
 * For width 16, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels16_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    int32_t h_4 = h >> 2;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t*)pixels + 1;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(pixels, 0);
        DUP2_ARG2(__lasx_xvldx, pixels, line_size, pixels, stride2x,
                  src1, src2);
        src3 = __lasx_xvldx(pixels, stride3x);
        src4 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, src5, src6);
        src7 = __lasx_xvldx(_src, stride3x);
        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20,
                  src5, src4, 0x20, src7, src6, 0x20, src0, src1, src2, src3);
        src0 = __lasx_xvavg_bu(src0, src2);
        src1 = __lasx_xvavg_bu(src1, src3);
        __lasx_xvstelm_d(src0, block, 0, 0);
        __lasx_xvstelm_d(src0, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        __lasx_xvstelm_d(src0, block, 8, 3);
        block += line_size;
        __lasx_xvstelm_d(src1, block, 0, 0);
        __lasx_xvstelm_d(src1, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src1, block, 0, 2);
        __lasx_xvstelm_d(src1, block, 8, 3);
        block += line_size;
        _src   += stride4x;
        pixels += stride4x;
    }
}
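/*
 * Note that the no_rnd variants below use xvavg.bu, the truncating average
 * (a + b) >> 1, whereas the rounded versions above rely on vavgr.bu, which
 * computes (a + b + 1) >> 1.
 */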
/**
 * For width 16, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t*)pixels;
    int32_t h_4 = h >> 2;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, src1, src2);
        src3 = __lasx_xvldx(_src, stride3x);
        _src += stride4x;
        src4 = __lasx_xvld(_src, 0);
        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20,
                  src3, src2, 0x20, src4, src3, 0x20, src0, src1, src2, src3);
        DUP2_ARG2(__lasx_xvavg_bu, src0, src1, src2, src3, src0, src2);
        __lasx_xvstelm_d(src0, block, 0, 0);
        __lasx_xvstelm_d(src0, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        __lasx_xvstelm_d(src0, block, 8, 3);
        block += line_size;
        __lasx_xvstelm_d(src2, block, 0, 0);
        __lasx_xvstelm_d(src2, block, 8, 1);
        block += line_size;
        __lasx_xvstelm_d(src2, block, 0, 2);
        __lasx_xvstelm_d(src2, block, 8, 3);
        block += line_size;
    }
}

void ff_put_no_rnd_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3;
    __m256i sum0, sum1, sum2;

    src0 = __lasx_xvld(pixels, 0);
    src1 = __lasx_xvld(pixels, 1);
    src2 = __lasx_vext2xv_hu_bu(src0);
    src3 = __lasx_vext2xv_hu_bu(src1);
    sum0 = __lasx_xvadd_h(src2, src3);
    sum0 = __lasx_xvaddi_hu(sum0, 1);

    for (int i = 0; i < h; i++) {
        pixels += line_size;
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvld(pixels, 1);
        src2 = __lasx_vext2xv_hu_bu(src0);
        src3 = __lasx_vext2xv_hu_bu(src1);
        sum1 = __lasx_xvadd_h(src2, src3);
        sum2 = __lasx_xvadd_h(sum0, sum1);
        sum2 = __lasx_xvsrani_b_h(sum2, sum2, 2);
        sum0 = __lasx_xvaddi_hu(sum1, 1);
        __lasx_xvstelm_d(sum2, block, 0, 0);
        __lasx_xvstelm_d(sum2, block, 8, 3);
        block += line_size;
    }
}
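/*
 * The xy2 kernels implement 2x2 bilinear half-pel interpolation.  Each row
 * and its x+1 neighbour are widened to halfwords (vext2xv.hu.bu or
 * vsllwil.hu.bu) so the four-term sum cannot overflow.  With r = 1 for the
 * no_rnd variants and r = 2 for the rounded ones, each output byte is
 * (illustrative sketch):
 *
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + stride] + src[x + stride + 1] + r) >> 2;
 *
 * The widened pair sum of the previous row is carried in sum0 across
 * iterations, so every source row is loaded and summed only once.
 */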
/**
 * For width 8, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels8_x2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    int32_t stride2x = line_size << 1;
    int32_t stride3x = stride2x + line_size;
    int32_t stride4x = line_size << 2;
    uint8_t *_src = (uint8_t*)pixels + 1;
    int32_t h_4 = h >> 2;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(pixels, 0);
        DUP2_ARG2(__lasx_xvldx, pixels, line_size, pixels, stride2x,
                  src1, src2);
        src3 = __lasx_xvldx(pixels, stride3x);
        src4 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x, src5, src6);
        src7 = __lasx_xvldx(_src, stride3x);
        DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src3, src2, src5, src4,
                  src7, src6, src0, src1, src2, src3);
        DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20,
                  src0, src1);
        src0 = __lasx_xvavg_bu(src0, src1);
        __lasx_xvstelm_d(src0, block, 0, 0);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 3);
        block += line_size;
        pixels += stride4x;
        _src   += stride4x;
    }
}

/**
 * For width 8, h is always a positive multiple of 4.
 * The function processes 4 rows per iteration.
 */
void ff_put_no_rnd_pixels8_y2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3, src4;
    int32_t stride2x = line_size << 1;
    int32_t stride4x = line_size << 2;
    int32_t stride3x = stride2x + line_size;
    uint8_t *_src = (uint8_t*)pixels;
    int32_t h_4 = h >> 2;

    for (int i = 0; i < h_4; i++) {
        src0 = __lasx_xvld(_src, 0);
        DUP4_ARG2(__lasx_xvldx, _src, line_size, _src, stride2x,
                  _src, stride3x, _src, stride4x, src1, src2, src3, src4);
        DUP4_ARG2(__lasx_xvpickev_d, src1, src0, src2, src1, src3, src2,
                  src4, src3, src0, src1, src2, src3);
        DUP2_ARG3(__lasx_xvpermi_q, src2, src0, 0x20, src3, src1, 0x20,
                  src0, src1);
        src0 = __lasx_xvavg_bu(src0, src1);
        __lasx_xvstelm_d(src0, block, 0, 0);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 1);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 2);
        block += line_size;
        __lasx_xvstelm_d(src0, block, 0, 3);
        block += line_size;
        _src += stride4x;
    }
}

void ff_put_no_rnd_pixels8_xy2_8_lsx(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    __m128i src0, src1, src2, src3;
    __m128i sum0, sum1, sum2;

    src0 = __lsx_vld(pixels, 0);
    src1 = __lsx_vld(pixels, 1);
    src2 = __lsx_vsllwil_hu_bu(src0, 0);
    src3 = __lsx_vsllwil_hu_bu(src1, 0);
    sum0 = __lsx_vadd_h(src2, src3);
    sum0 = __lsx_vaddi_hu(sum0, 1);

    for (int i = 0; i < h; i++) {
        pixels += line_size;
        src0 = __lsx_vld(pixels, 0);
        src1 = __lsx_vld(pixels, 1);
        src2 = __lsx_vsllwil_hu_bu(src0, 0);
        src3 = __lsx_vsllwil_hu_bu(src1, 0);
        sum1 = __lsx_vadd_h(src2, src3);
        sum2 = __lsx_vadd_h(sum0, sum1);
        sum2 = __lsx_vsrani_b_h(sum2, sum2, 2);
        sum0 = __lsx_vaddi_hu(sum1, 1);
        __lsx_vstelm_d(sum2, block, 0, 0);
        block += line_size;
    }
}

void ff_put_pixels16_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
{
    __m256i src0, src1, src2, src3;
    __m256i sum0, sum1, sum2;

    src0 = __lasx_xvld(pixels, 0);
    src1 = __lasx_xvld(pixels, 1);
    src2 = __lasx_vext2xv_hu_bu(src0);
    src3 = __lasx_vext2xv_hu_bu(src1);
    sum0 = __lasx_xvadd_h(src2, src3);
    sum0 = __lasx_xvaddi_hu(sum0, 2);

    for (int i = 0; i < h; i++) {
        pixels += line_size;
        src0 = __lasx_xvld(pixels, 0);
        src1 = __lasx_xvld(pixels, 1);
        src2 = __lasx_vext2xv_hu_bu(src0);
        src3 = __lasx_vext2xv_hu_bu(src1);
        sum1 = __lasx_xvadd_h(src2, src3);
        sum2 = __lasx_xvadd_h(sum0, sum1);
        sum2 = __lasx_xvsrani_b_h(sum2, sum2, 2);
        sum0 = __lasx_xvaddi_hu(sum1, 2);
        __lasx_xvstelm_d(sum2, block, 0, 0);
        __lasx_xvstelm_d(sum2, block, 8, 3);
        block += line_size;
    }
}
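/*
 * 8-wide bilinear horizontal+vertical core for the rounded xy2 filter.
 * Each row is interleaved with its one-pixel-shifted copy (xvilvl.b),
 * pair-summed into halfwords (xvhaddw.hu.bu), added to the next row's pair
 * sum, and narrowed with the rounding shift xvsrarni.b.h, which supplies
 * the + 2 bias of (a + b + c + d + 2) >> 2.
 */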
static void common_hv_bil_8w_lasx(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
    __m256i src8, src9, sum0, sum1;
    uint8_t loop_cnt;
    int32_t src_stride_2x = src_stride << 1;
    int32_t src_stride_4x = src_stride << 2;
    int32_t dst_stride_2x = dst_stride << 1;
    int32_t dst_stride_4x = dst_stride << 2;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    int32_t src_stride_3x = src_stride_2x + src_stride;
    uint8_t *_src = (uint8_t*)src;

    DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src0, src5);
    _src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src1 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
                  src2, src3);
        src4 = __lasx_xvldx(_src, src_stride_3x);
        _src += 1;
        src6 = __lasx_xvld(_src, 0);
        DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x,
                  src7, src8);
        src9 = __lasx_xvldx(_src, src_stride_3x);
        _src += (src_stride_4x - 1);
        DUP4_ARG2(__lasx_xvilvl_b, src5, src0, src6, src1, src7, src2,
                  src8, src3, src0, src1, src2, src3);
        src5 = __lasx_xvilvl_b(src9, src4);
        DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20,
                  src3, src2, 0x20, src5, src3, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvhaddw_hu_bu, src0, src0, src1, src1, src2, src2,
                  src3, src3, src0, src1, src2, src3);
        DUP2_ARG2(__lasx_xvadd_h, src0, src1, src2, src3, sum0, sum1);
        sum0 = __lasx_xvsrarni_b_h(sum1, sum0, 2);
        /* The narrowed output rows land in element order 0, 2, 1, 3. */
        __lasx_xvstelm_d(sum0, dst, 0, 0);
        __lasx_xvstelm_d(sum0, dst + dst_stride, 0, 2);
        __lasx_xvstelm_d(sum0, dst + dst_stride_2x, 0, 1);
        __lasx_xvstelm_d(sum0, dst + dst_stride_3x, 0, 3);
        dst += dst_stride_4x;
        src0 = src4;
        src5 = src9;
    }
}

void ff_put_pixels8_xy2_8_lasx(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_lasx(pixels, line_size, block, line_size, h);
}