; Listing metadata (not part of the program): File size: 7,151 Bytes, 8ead80b
;******************************************************************************
;* SIMD-optimized MLP DSP functions
;* Copyright (c) 2014 James Almer <[email protected]>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%if ARCH_X86_64
; SHLX reg, cnt
; Shift register %1 left, in place, by the count held in register %2.
; %2 is passed without a size suffix: the macro appends "b" (byte register,
; for the legacy shl) or "q" (full register, for the BMI2 shlx).
%macro SHLX 2
%if notcpuflag(bmi2)
    shl  %1, %2b                    ; legacy shift: count comes from the byte register of %2
%else
    shlx %1, %1, %2q                ; BMI2 shift: count is taken from a general-purpose register
%endif
%endmacro
; REMATRIX
; Multiply the int32 values at [samplesq] pairwise with the int32 values at
; [coeffsq] (signed 32x32 -> 64 bit) and accumulate the products.
; On exit the low and high qwords of xm0 each hold a partial sum; adding the
; two gives the full dot product (done by LOOP_END / LOOP_SHIFT_END).
; Uses aligned loads. Clobbers m1-m3, plus m4 when avx2 is not available.
%macro REMATRIX 0
    movdqa        m1, [coeffsq ]
    movdqa        m0, [samplesq]
    pshufd        m3, m1, q2301     ; swap the dwords of each qword: odd coeffs into the even slots
    pshufd        m2, m0, q2301     ; same for the samples
    pmuldq        m3, m2            ; products of the odd lanes
    pmuldq        m0, m1            ; products of the even lanes
    paddq         m0, m3
%if cpuflag(avx2)
    ; The single load above already covered the whole row: fold the upper
    ; 128 bits onto the lower 128 bits.
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%else
    ; Second 16-byte half of the row, same even/odd scheme.
    movdqa        m2, [coeffsq  + 16]
    movdqa        m1, [samplesq + 16]
    pshufd        m4, m2, q2301
    pshufd        m3, m1, q2301
    pmuldq        m4, m3
    pmuldq        m1, m2
    paddq         m0, m1
    paddq         m0, m4
%endif
%endmacro
; LOOP_END
; Finish one row of the path without noise:
;   accum = (low qword of xm0) + (high qword of xm0)
;   samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs
; then advance both pointers by one row. The final cmp leaves the flags for
; the loop branch that follows the macro (equal means the end was reached).
; Clobbers xm0, xm1, accum and blsbs.
%macro LOOP_END 0
    movzx     blsbsd, byte [blsbs_ptrq]             ; fetch *bypassed_lsbs before the pointer moves
    pshufd       xm1, xm0, q0032                    ; high qword of xm0 -> low qword of xm1
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS
    paddq        xm0, xm1                           ; low qword = complete 64-bit sum
    movq      accumq, xm0
    sar       accumq, 14                            ; arithmetic shift: accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum (dest_chq is a byte offset)
    add     samplesq, 32                            ; samples += MAX_CHANNELS
    cmp   blsbs_ptrq, cntq                          ; reached the end pointer?
%endmacro
; LOOP_SHIFT_END
; Finish one row of the path that adds noise (matrix_noise_shift != 0):
;   accum  = (low qword of xm0) + (high qword of xm0)
;   index &= ausp
;   accum += (int64_t)noise_buffer[index] << mns
;   index += index2
;   samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs
; then advance both pointers by one row. The final cmp leaves the flags for
; the loop branch that follows the macro.
; In MLP_REMATRIX_CHANNEL, ausp holds access_unit_size_pow2 - 1 and mns holds
; matrix_noise_shift + 7 by the time this macro runs.
; The noise register doubles as the temporary for the bypassed LSB byte, so
; the instruction order below matters.
%macro LOOP_SHIFT_END 0
pshufd xm1, xm0, q0032 ; high qword of xm0 -> low qword of xm1
paddq xm0, xm1 ; low qword of xm0 = complete 64-bit sum
movq accumq, xm0 ; accum = 64-bit sum
and indexd, auspd ; index &= ausp (ausp = access_unit_size_pow2 - 1, a mask)
movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index], sign-extended to 64 bit
add indexd, index2d ; index += index2
SHLX noiseq, mns ; noise <<= mns (mns = matrix_noise_shift + 7)
add accumq, noiseq ; accum += shifted noise
movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)
sar accumq, 14 ; accum >>= 14 (arithmetic)
and accumd, maskd ; accum &= mask
add accumd, noised ; accum += *bypassed_lsbs
mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum (dest_chq is a byte offset)
add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS
add samplesq, 32 ; samples += MAX_CHANNELS
cmp blsbs_ptrq, cntq ; reached the end pointer? (flags are consumed by the caller's jne)
%endmacro
;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                             int index, unsigned int dest_ch, uint16_t blockpos,
;                             unsigned int maxchan, int matrix_noise_shift,
;                             int access_unit_size_pow2, int32_t mask)
;
; Processes blockpos rows. A row is 32 bytes of samples and 8 bytes of
; bypassed_lsbs; coeffs is the same for every row. Per row:
;   accum = sum of samples[ch] * coeffs[ch] as signed 64-bit products,
;           over 4 lanes when maxchan < 4, otherwise over 8 lanes
;   if (matrix_noise_shift != 0) {
;       index &= access_unit_size_pow2 - 1;
;       accum += (int64_t)noise_buffer[index] << (matrix_noise_shift + 7);
;       index += index2;          // index2 = 2 * index + 1, computed once on entry
;   }
;   samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
;
; Notes:
; - samples and coeffs are read with aligned loads (movdqa), so both must be
;   aligned to the vector size in use.
; - Whole groups of 4 or 8 lanes are always multiplied, whatever maxchan is.
;   Presumably the coeffs beyond maxchan are zero - TODO confirm with the caller.
; - blockpos == 0 returns immediately without touching memory. The loops are
;   do-while loops that end on pointer equality, so they must not be entered
;   with an empty block.
; - Register names are remapped with DEFINE_ARGS on the noise path. Without
;   bmi2 the shift count has to live in cl, so the argument that arrived in
;   rcx is moved to r6 first (r0 on WIN64, r3 on UNIX64).
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    test   blockposd, blockposd                     ; is blockpos == 0?
    jz .done                                        ; nothing to process, skip the do-while loops
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch *= 4: byte offset of an int32 inside a row
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = value of bypassed_lsbs after the last row
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jl .loop4                                       ; jump if true

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8
    RET

align 16
.loop4:
    ; Process up to 4 channels
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4
    RET

.shift:
%if WIN64
    mov       indexd, indexm         ; load index (not needed on UNIX64)
%endif
    mov          r9d, r9m            ; load access_unit_size_pow2 (r9 is renamed to ausp below)
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7              ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub        auspd, 1              ; access_unit_size_pow2 -= 1 (turns it into the index mask)
    cmp          r7d, 4              ; is maxchan < 4? (r7 still holds maxchan here)
    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; overwrites r7, lea keeps the flags
    jl .loop4_shift                  ; jump if maxchan < 4

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift
.done:                               ; also reached directly when blockpos == 0
    RET
%endmacro
; SSE4 build. Without bmi2 the SHLX macro expands to a plain shl, and without
; avx2 REMATRIX loads each row as two 16-byte halves.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL
; AVX2 + BMI2 build, assembled only when HAVE_AVX2_EXTERNAL is non-zero.
; REMATRIX then uses a single load per row and SHLX expands to shlx.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif
%endif ; ARCH_X86_64
|