|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%include "libavutil/x86/x86util.asm" |
|
|
|
SECTION .text |
|
|
|
%if ARCH_X86_64 |
|
|
|
; SHLX dst, countreg -- logical shift left: dst <<= countreg.
; With BMI2 the SHLX form is used (count taken from the full 64-bit GPR).
; Otherwise this falls back to legacy SHL, whose variable count must come
; from cl -- NOTE(review): the non-BMI2 instantiation below relocates the
; shift amount into rcx before the loops run; confirm at each use site.
%macro SHLX 2

%if cpuflag(bmi2)

shlx %1, %1, %2q

%else

shl %1, %2b

%endif

%endmacro
|
|
|
; REMATRIX -- one block of the sample x coefficient dot product.
; Accumulates signed 32x32->64-bit products of [samplesq] and [coeffsq]
; into (x)m0 as two 64-bit partial sums.  PMULDQ multiplies only the even
; dword lanes, so each input is also processed with adjacent dwords
; swapped (pshufd q2301) to cover the odd lanes.
; SSE path:  two 16-byte loads  = 8 sample/coeff pairs.
; AVX2 path: one 32-byte ymm load = 8 pairs, then the high 128-bit half
; of the accumulator is folded into xm0.
; Inputs are loaded with movdqa -- both buffers are assumed 16/32-byte
; aligned (NOTE(review): guaranteed by the caller's allocation; confirm).
%macro REMATRIX 0

movdqa m0, [samplesq]

movdqa m1, [coeffsq ]

pshufd m2, m0, q2301 ; swap dword pairs so pmuldq can reach the odd lanes

pshufd m3, m1, q2301

pmuldq m0, m1 ; even lanes: sample * coeff, signed 64-bit products

pmuldq m3, m2 ; odd lanes

paddq m0, m3

%if notcpuflag(avx2)

movdqa m1, [samplesq + 16] ; second 16 bytes: pairs 4..7

movdqa m2, [coeffsq + 16]

pshufd m3, m1, q2301

pshufd m4, m2, q2301

pmuldq m1, m2

pmuldq m4, m3

paddq m0, m1

paddq m0, m4

%else

vextracti128 xm1, m0, 1 ; fold high 128 bits of the ymm accumulator

paddq xm0, xm1

%endif

%endmacro
|
|
|
; LOOP_END -- finish one output sample (no noise shift) and advance.
; Horizontally adds the two 64-bit lanes of xm0 into accumq, scales the
; fixed-point sum down (arithmetic >> 14), masks it, adds the bypassed-LSB
; byte from *blsbs_ptrq, and stores the 32-bit result at samples[dest_ch].
; Strides: blsbs_ptr += 8, samples += 32 -- presumably one byte / one
; int32 per channel with an 8-channel group (NOTE(review): confirm against
; the C caller's layout).
; Register names come from the enclosing cglobal/DEFINE_ARGS.  The macro
; deliberately ends on "cmp blsbs_ptrq, cntq": the invoking code's jne
; consumes those flags, so nothing may be inserted after it.
%macro LOOP_END 0

pshufd xm1, xm0, q0032 ; move the high qword down for the horizontal add

paddq xm0, xm1

movq accumq, xm0

movzx blsbsd, byte [blsbs_ptrq] ; bypassed LSBs for this sample

sar accumq, 14 ; fixed-point -> integer sample

and accumd, maskd

add accumd, blsbsd

mov [samplesq + dest_chq], accumd

add blsbs_ptrq, 8

add samplesq, 32

cmp blsbs_ptrq, cntq ; loop-termination flags for the caller's jne

%endmacro
|
|
|
; LOOP_SHIFT_END -- like LOOP_END, but injects shifted dither noise into
; the accumulator before the fixed-point conversion:
;   accum += (int64)(int8)noise_buffer[index & ausp] << mns
;   index += index2
; auspd is the power-of-two buffer size minus one (wrap mask) and mnsd the
; noise shift biased by +7, both prepared in MLP_REMATRIX_CHANNEL's .shift
; setup.  The noise register is reused afterwards to hold the bypassed-LSB
; byte.  As with LOOP_END, the trailing cmp's flags are consumed by the
; caller's jne -- nothing may follow it.
%macro LOOP_SHIFT_END 0

pshufd xm1, xm0, q0032 ; horizontal add of the two 64-bit lanes

paddq xm0, xm1

movq accumq, xm0

and indexd, auspd ; wrap the noise index (ausp = size - 1)

movsx noiseq, byte [noise_bufferq + indexq] ; signed 8-bit noise sample

add indexd, index2d ; advance noise index by 2*index0 + 1

SHLX noiseq, mns ; scale the noise by the matrix noise shift

add accumq, noiseq

movzx noised, byte [blsbs_ptrq] ; reg reuse: now the bypassed LSBs

sar accumq, 14

and accumd, maskd

add accumd, noised

mov [samplesq + dest_chq], accumd

add blsbs_ptrq, 8

add samplesq, 32

cmp blsbs_ptrq, cntq ; loop-termination flags for the caller's jne

%endmacro
|
|
|
|
|
|
|
|
|
|
|
|
|
; MLP_REMATRIX_CHANNEL -- emit one mlp_rematrix_channel function for the
; instruction set selected by the preceding INIT_XMM/INIT_YMM.
; Rewrites one destination channel of an MLP/TrueHD block:
;   for each sample group:
;     samples[dest_ch] = (((dot(samples, coeffs) [+ noise << shift]) >> 14)
;                         & mask) + bypassed_lsb
; Argument slots (names per the cglobal line below; NOTE(review): confirm
; against the C prototype in the matching header):
;   0 samples  1 coeffs  2 blsbs_ptr  3 blsbs (noise buffer in the .shift
;   path)  4 index  5 dest_ch  6 blockpos  7 maxchan  8 mns (matrix noise
;   shift)  9 ausp (access-unit size, power of two)  10 mask
; Four code paths: {8-pair, 4-pair} x {no-noise, noise-shift}, chosen by
; maxchan and by whether mns is nonzero.
%macro MLP_REMATRIX_CHANNEL 0

cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \

index, dest_ch, blockpos, maxchan, mns, \

accum, mask, cnt

mov mnsd, mnsm ; args past the register-passed ones are read from their slots

movzx blockposq, word blockposm ; blockpos is a 16-bit count

mov maxchand, maxchanm

mov maskd, maskm

%if WIN64

mov dest_chd, dest_chm ; 5th arg arrives on the stack on Win64

%endif

shl dest_chd, 2 ; channel index -> byte offset into int32 samples

lea cntq, [blsbs_ptrq + blockposq*8] ; end pointer (8 LSB bytes per group)

test mnsd, mnsd

jne .shift ; nonzero noise shift -> noise-injecting loops

cmp maxchand, 4

jl .loop4 ; few channels -> single 16-byte vector per group

; --- 8-pair loop, no noise ---

align 16

.loop8:

; flags from LOOP_END's trailing cmp feed the jne below

REMATRIX

LOOP_END

jne .loop8

RET

; --- 4-pair loop, no noise: first half of REMATRIX, xmm only ---

align 16

.loop4:

; assumes 16-byte alignment of samples/coeffs (movdqa)

movdqa xm0, [samplesq]

movdqa xm1, [coeffsq ]

pshufd xm2, xm0, q2301 ; swap dword pairs for the odd lanes

pshufd xm3, xm1, q2301

pmuldq xm0, xm1 ; even lanes

pmuldq xm3, xm2 ; odd lanes

paddq xm0, xm3

LOOP_END

jne .loop4

RET

; --- noise-shift setup: rename registers for the noise loops ---

.shift:

%if WIN64

mov indexd, indexm ; 5th slot is on the stack on Win64

%endif

mov r9d, r9m ; ausp (slot 9) from its stack slot

%if cpuflag(bmi2)

; BMI2: SHLX takes its count from any GPR, so no rcx shuffling needed.

; The blsbs slot is reinterpreted as the noise buffer pointer.

DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \

index, dest_ch, accum, index2, mns, \

ausp, mask, cnt, noise

add mnsd, 7 ; bias the noise shift by +7 (NOTE(review): matches C reference)

%else

mov r6, rcx ; legacy SHL needs cl: evacuate rcx's value to r6 first

%if WIN64

; Win64: rcx (slot 0) becomes mns; old samples pointer lives in r6.

DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \

index2, accum, ausp, mask, cnt, noise

%else

; SysV: rcx (slot 3) becomes mns; old rcx value (noise buffer) is in r6.

DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \

index2, accum, ausp, mask, cnt, noise

%endif

lea mnsd, [r8 + 7] ; relocate mns (was x86inc r8) into rcx, biased by +7

%endif

sub auspd, 1 ; power-of-two size -> wrap mask for the noise index

cmp r7d, 4 ; maxchan still lives in r7 after the renames

lea index2q, [indexq*2 + 1] ; per-sample noise index increment

jl .loop4_shift

; --- 8-pair loop with noise ---

align 16

.loop8_shift:

; flags from LOOP_SHIFT_END's trailing cmp feed the jne below

REMATRIX

LOOP_SHIFT_END

jne .loop8_shift

RET

; --- 4-pair loop with noise: first half of REMATRIX, xmm only ---

align 16

.loop4_shift:

; assumes 16-byte alignment of samples/coeffs (movdqa)

movdqa xm0, [samplesq]

movdqa xm1, [coeffsq ]

pshufd xm2, xm0, q2301 ; swap dword pairs for the odd lanes

pshufd xm3, xm1, q2301

pmuldq xm0, xm1 ; even lanes

pmuldq xm3, xm2 ; odd lanes

paddq xm0, xm3

LOOP_SHIFT_END

jne .loop4_shift

RET

%endmacro
|
|
|
; Instantiate the function for each target ISA.
; SSE4.1 is the baseline (pmuldq requires it); the AVX2 build additionally
; assumes BMI2 (shlx) and is only emitted when the assembler supports it.
INIT_XMM sse4

MLP_REMATRIX_CHANNEL

%if HAVE_AVX2_EXTERNAL

INIT_YMM avx2, bmi2

MLP_REMATRIX_CHANNEL

%endif
|
|
|
%endif |
|
|