;*****************************************************************************
;* x86-optimized functions for afir filter
;* Copyright (c) 2017 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; Pulls in the x86inc/x86util macro layer (cglobal, INIT_XMM/INIT_YMM, RET,
; the mN/xmN register aliases and mmsize) used by everything below.
%include "libavutil/x86/x86util.asm"

; Everything that follows is executable code.
SECTION .text
;------------------------------------------------------------------------------
; void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
;
; Complex multiply-accumulate over buffers of interleaved (re, im) floats.
; For every complex element n handled by the vector loop:
;     sum[2n]   += t[2n] * c[2n]   - t[2n+1] * c[2n+1]
;     sum[2n+1] += t[2n] * c[2n+1] + t[2n+1] * c[2n]
; After the loop a single scalar float is multiply-accumulated:
;     sum[k] += t[k] * c[k]
; where k is the float the running byte offset points at on loop exit.
;
; Notes derived from the code:
;   - len is scaled by 8, i.e. it counts complex elements (two floats each).
;   - The loop is a do-while: it always runs at least once and consumes
;     2*mmsize bytes of each buffer per iteration, so it may process more than
;     len elements when len*8 is not a multiple of 2*mmsize. In that case the
;     scalar tail also lands past element len. Callers are presumably expected
;     to size/pad the buffers accordingly -- TODO confirm against the C caller.
;   - c and sum are accessed with movaps and therefore must be mmsize-aligned;
;     t is read through movsldup/movshdup memory operands and is presumably
;     aligned the same way -- TODO confirm.
;
; Register roles:
;   sumq, tq, cq  one-past-the-end pointers (base + len*8)
;   lenq          negative byte offset from those ends, counting up towards 0
;   m0..m2        first vector of the iteration, m3..m5 second vector
;------------------------------------------------------------------------------
%macro FCMUL_ADD 0
cglobal fcmul_add, 4,4,6, sum, t, c, len
    shl       lend, 3                       ; len *= 8: complex count -> bytes
                                            ; (32-bit write also clears the
                                            ; upper half of lenq on x86-64)
    add         tq, lenq                    ; advance all three pointers to the
    add         cq, lenq                    ; end of the range so one negative
    add       sumq, lenq                    ; index can address all of them
    neg       lenq                          ; lenq = -bytes
ALIGN 16
.loop:
    movsldup    m0, [tq + lenq]             ; m0 = (t.re, t.re, ...)
    movsldup    m3, [tq + lenq+mmsize]
    movaps      m1, [cq + lenq]             ; m1 = (c.re, c.im, ...)
    movaps      m4, [cq + lenq+mmsize]
    mulps       m0, m0, m1                  ; m0 = (t.re*c.re, t.re*c.im, ...)
    mulps       m3, m3, m4
    shufps      m1, m1, m1, 0xb1            ; swap each pair: (c.im, c.re, ...)
    shufps      m4, m4, m4, 0xb1
    movshdup    m2, [tq + lenq]             ; m2 = (t.im, t.im, ...)
    movshdup    m5, [tq + lenq+mmsize]
    mulps       m2, m2, m1                  ; m2 = (t.im*c.im, t.im*c.re, ...)
    mulps       m5, m5, m4
    addsubps    m0, m0, m2                  ; even lanes subtract (real part),
    addsubps    m3, m3, m5                  ; odd lanes add (imaginary part)
    addps       m0, m0, [sumq + lenq]       ; accumulate into sum
    addps       m3, m3, [sumq + lenq+mmsize]
    movaps    [sumq + lenq], m0
    movaps    [sumq + lenq+mmsize], m3
    add       lenq, mmsize*2                ; two vectors consumed
    jl .loop                                ; signed: offset is still negative

    ; Scalar tail: one real-only multiply-accumulate at the current offset
    ; (offset 0, i.e. float 2*len, when len*8 is a multiple of 2*mmsize).
    movss      xm0, [tq + lenq]
    mulss      xm0, [cq + lenq]
    addss      xm0, [sumq + lenq]
    movss     [sumq + lenq], xm0
    RET
%endmacro
; Instantiate the kernel once per instruction set. Each INIT_* line selects
; the vector register width (mmsize) and CPU suffix used by the following
; macro expansion.
INIT_XMM sse3
FCMUL_ADD

INIT_YMM avx
FCMUL_ADD