File size: 4,878 Bytes
8ead80b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; VP9_IWHT4_1D: one 1-D pass over the four word vectors m0..m3, built only
; from packed 16-bit adds/subtracts and a single arithmetic shift right by 1.
; Judging by the name this is the 4-point inverse Walsh-Hadamard step; the
; reference transform is not visible in this file, so confirm against it.
; In/out: m0-m3 (updated in place).  Scratch: m4, m5.
; NOTE(review): SWAP is defined outside this file; presumably it only renames
; registers at assembly time and emits no instructions -- confirm.  After each
; SWAP the names m1..m5 refer to different registers, so the statement order
; below must not be changed.
%macro VP9_IWHT4_1D 0
SWAP 1, 2, 3
paddw m0, m2 ; m0 += m2
psubw m3, m1 ; m3 -= m1
psubw m4, m0, m3 ; m4 = m0 - m3
psraw m4, 1 ; m4 = (m0 - m3) >> 1, arithmetic shift
psubw m5, m4, m1 ; m5 = m4 - m1
SWAP 5, 1 ; from here on "m1" names the difference just computed
psubw m4, m2 ; m4 -= m2
SWAP 4, 2 ; from here on "m2" names the difference just computed
psubw m0, m1 ; m0 -= (renamed) m1
paddw m3, m2 ; m3 += (renamed) m2
SWAP 3, 2, 1
%endmacro
; (a*x + b*y + round) >> shift
; VP9_MULSUB_2W_2X: evaluates the expression above twice on the same source,
; once per coefficient operand.  The shift is hard-coded to 14.
;   dst1     - register number; receives the result for coefs1
;   dst2/src - register number; holds the word pairs (x, y) on entry and is
;              overwritten with the result for coefs2
;   round    - operand added to every 32-bit sum before the shift; used
;              verbatim, so pass a full register name or a memory operand
;   coefs1/2 - operands holding the word pairs (a, b); used verbatim
%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
pmaddwd m%1, m%2, %4 ; dst1 = a1*x + b1*y per 32-bit lane (src still intact)
pmaddwd m%2, %5 ; src  = a2*x + b2*y, overwriting the source
paddd m%1, %3 ; add the rounding term to both results
paddd m%2, %3
psrad m%1, 14 ; arithmetic shift right by 14
psrad m%2, 14
%endmacro
; VP9_MULSUB_2W_4X: runs VP9_MULSUB_2W_2X on two registers of interleaved
; word pairs and packs the 32-bit results back to words with signed saturation.
;   dst1, dst2 - register numbers for the packed results; dst2 must also hold
;                one register of interleaved input on entry (it is passed as
;                the dst2/src argument of the second 2X call)
;   coef1/2    - numbers pasted into the constant names pw_m<coef1>_<coef2>
;                and pw_<coef2>_<coef1>; those constants are defined outside
;                this file
;   rnd        - rounding operand, forwarded unchanged
;   tmp1/src   - register number holding the other register of interleaved
;                input on entry; overwritten
;   tmp2       - scratch register number; overwritten
%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
packssdw m%1, m%7 ; dst1 = words of dst1 (low half) then tmp2 (high half)
packssdw m%2, m%6 ; dst2 = words of dst2 (low half) then tmp1 (high half)
%endmacro
; VP9_UNPACK_MULSUB_2W_4X: interleaves the words of two source registers and
; hands the result to VP9_MULSUB_2W_4X.
; 7-argument form (dst1, dst2, coef1, coef2, rnd, tmp1, tmp2):
;   the sources are dst1 and dst2 themselves and are overwritten.
; Otherwise (dst1, dst2, src1, src2, coef1, coef2, rnd, tmp1, tmp2):
;   the sources are read from src1/src2 and the results go to dst1/dst2.
; In both forms the high-half interleave lands in tmp1 and the low-half
; interleave in dst2, which is where VP9_MULSUB_2W_4X reads its inputs.
; Note that any argument count other than 7 takes the second branch, which
; references the ninth argument.
%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if %0 == 7
punpckhwd m%6, m%2, m%1 ; tmp1 = high words of dst2/dst1 interleaved
punpcklwd m%2, m%1 ; dst2 = low words of dst2/dst1 interleaved (in place)
VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7
%else
punpckhwd m%8, m%4, m%3 ; tmp1 = high words of src2/src1 interleaved
punpcklwd m%2, m%4, m%3 ; dst2 = low words of src2/src1 interleaved
VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9
%endif
%endmacro
; VP9_IDCT4_1D_FINALIZE: final sum/difference stage of the 4-point pass.
; In:  m0=t1, m1=t2, m2=t0, m3=t3 (as the inline comments below imply).
; Out: m0-m3 after the closing SWAP; m4 is passed to SUMSUB_BA as well,
;      presumably as scratch.
; NOTE(review): SUMSUB_BA and SWAP are defined outside this file; the
; per-line comments describe their expected effect -- confirm there.
%macro VP9_IDCT4_1D_FINALIZE 0
SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0
SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1
SWAP 0, 3, 2 ; 3102 -> 0123
%endmacro
; VP9_IDCT4_1D: one 1-D pass of the 4-point transform on m0..m3, in place.
; In:    m0-m3 = IN(0)..IN(3)
;        m7    = rounding operand handed to the MULSUB macros, which add it
;                before their >> 14
;        m6    = pmulhrsw multiplier, read on the ssse3 path only
;                (presumably 11585*2 in every word -- TODO confirm at the
;                call site)
; Out:   m0-m3 (see VP9_IDCT4_1D_FINALIZE)
; Clobb: m4, m5
%macro VP9_IDCT4_1D 0
%if cpuflag(ssse3)
; ssse3: form the sum/difference first, then scale each with one pmulhrsw.
SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
pmulhrsw m2, m6 ; m2=t0
pmulhrsw m0, m6 ; m0=t1
%else ; <= sse2
; No pmulhrsw here: use the interleave + pmaddwd path for the even half too.
VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
%endif
; Odd half, shared by both paths.
VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
VP9_IDCT4_1D_FINALIZE
%endmacro
; VP9_IADST4_1D: one 1-D pass producing four word vectors from m0..m3.
; The name suggests the 4-point inverse ADST; the reference transform is not
; visible in this file, so confirm against it.
; m0-m3 are used as 64-bit MMX registers here (they are movq2dq/movdq2q
; operands); the arithmetic runs in xmm registers on 32-bit pmaddwd sums.
; In:    m0-m3 = in0..in3
;        xmm5  = value added to every 32-bit sum before the >> 14.  It is
;                read but never written in this macro, so the caller must
;                load it (presumably the rounding constant 1 << 13 -- TODO
;                confirm at the call site).
; Out:   m0-m3 (see the out0..out3 comments and the closing SWAP)
; Clobb: xmm0-xmm4, plus xmm6 and xmm7 when ssse3 is not available
; The pw_* coefficient constants are defined outside this file.
%macro VP9_IADST4_1D 0
; Copy the four MMX inputs into the low halves of xmm0-xmm3.
movq2dq xmm0, m0
movq2dq xmm1, m1
movq2dq xmm2, m2
movq2dq xmm3, m3
%if cpuflag(ssse3)
; ssse3 computes out2 in the MMX registers: start with m3 = in3 + in0.
paddw m3, m0
%endif
punpcklwd xmm0, xmm1 ; xmm0 = (in0, in1) word pairs
punpcklwd xmm2, xmm3 ; xmm2 = (in2, in3) word pairs
; Partial sums from the (in0, in1) pairs.
pmaddwd xmm1, xmm0, [pw_5283_13377]
pmaddwd xmm4, xmm0, [pw_9929_13377]
%if notcpuflag(ssse3)
pmaddwd xmm6, xmm0, [pw_13377_0] ; first half of the out2 sum
%endif
pmaddwd xmm0, [pw_15212_m13377] ; last use of the (in0, in1) pairs
; Partial sums from the (in2, in3) pairs.
pmaddwd xmm3, xmm2, [pw_15212_9929]
%if notcpuflag(ssse3)
pmaddwd xmm7, xmm2, [pw_m13377_13377] ; second half of the out2 sum
%endif
pmaddwd xmm2, [pw_m5283_m15212] ; last use of the (in2, in3) pairs
%if cpuflag(ssse3)
psubw m3, m2 ; m3 = in0 + in3 - in2 (m2 still holds in2)
%else
paddd xmm6, xmm7 ; complete the out2 sum
%endif
paddd xmm0, xmm2 ; uses xmm2 before the rounding term is added to it
paddd xmm3, xmm5 ; fold the rounding term into the shared partial sums
paddd xmm2, xmm5
%if notcpuflag(ssse3)
paddd xmm6, xmm5
%endif
; Each final sum below contains the rounding term exactly once.
paddd xmm1, xmm3
paddd xmm0, xmm3
paddd xmm4, xmm2
psrad xmm1, 14
psrad xmm0, 14
psrad xmm4, 14
%if cpuflag(ssse3)
pmulhrsw m3, [pw_13377x2] ; out2
%else
psrad xmm6, 14
%endif
; Narrow the 32-bit results back to words with signed saturation.
packssdw xmm0, xmm0
packssdw xmm1, xmm1
packssdw xmm4, xmm4
%if notcpuflag(ssse3)
packssdw xmm6, xmm6
%endif
; Move the low 64 bits of each result back into the MMX registers.
movdq2q m0, xmm0 ; out3
movdq2q m1, xmm1 ; out0
movdq2q m2, xmm4 ; out1
%if notcpuflag(ssse3)
movdq2q m3, xmm6 ; out2
%endif
; NOTE(review): presumably renames the registers so that out0..out3 end up
; in m0..m3 -- confirm against the SWAP definition.
SWAP 0, 1, 2, 3
%endmacro
|