camenduru
/

ffmpeg-cuda

Model card Files Files and versions Community

ffmpeg-cuda / libavfilter /x86 /af_afir.asm

camenduru's picture

thanks to ffmpeg ❤

8ead80b almost 2 years ago

history blame contribute delete

2.26 kB

	;*****************************************************************************
	;* x86-optimized functions for afir filter
	;* Copyright (c) 2017 Paul B Mahol
	;*
	;* This file is part of FFmpeg.
	;*
	;* FFmpeg is free software; you can redistribute it and/or
	;* modify it under the terms of the GNU Lesser General Public
	;* License as published by the Free Software Foundation; either
	;* version 2.1 of the License, or (at your option) any later version.
	;*
	;* FFmpeg is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	;* Lesser General Public License for more details.
	;*
	;* You should have received a copy of the GNU Lesser General Public
	;* License along with FFmpeg; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	;******************************************************************************

	%include "libavutil/x86/x86util.asm"

	SECTION .text

	;------------------------------------------------------------------------------
	; void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
	;------------------------------------------------------------------------------

	; FCMUL_ADD: emits one fcmul_add function for whatever register width and
	; instruction set the preceding INIT_XMM / INIT_YMM selected.
	;
	; Treating every two consecutive floats as one (a, b) pair, the vector loop
	; computes for each pair:
	;     sum.a += t.a * c.a - t.b * c.b
	;     sum.b += t.a * c.b + t.b * c.a
	; which is a complex multiply-accumulate when the pairs are (re, im).
	; After the loop a single extra float is accumulated: sum[i] += t[i] * c[i].
	;
	; Arguments (names taken from the cglobal line):
	;   sum - accumulator, read and written in place
	;   t   - first input, read only
	;   c   - second input, read only
	;   len - count of float pairs; scaled by 8 (bytes per pair) below
	;
	; Requirements implied by the instructions used:
	;   - c and sum are accessed with movaps, which faults on addresses that are
	;     not mmsize-aligned. The memory-operand forms used on t and sum
	;     (movsldup/movshdup/addps) have the same 16-byte requirement in the
	;     non-VEX SSE encoding.
	;   - The loop tests its condition only after the body, so 2*mmsize bytes are
	;     processed even if len is 0.
	;     NOTE(review): presumably callers always pass len > 0 - confirm.
	;   - The scalar tail uses the index left over by the loop. It addresses the
	;     float right after the len pairs only if len*8 is a multiple of
	;     2*mmsize. NOTE(review): confirm callers guarantee this and that the
	;     buffers hold that extra element.
	%macro FCMUL_ADD 0
	; x86inc prologue: 4 arguments, 4 general-purpose registers, 6 vector
	; registers (m0-m5 are the ones used below).
	cglobal fcmul_add, 4,4,6, sum, t, c, len
	; len *= 8: convert the pair count to a byte count (2 floats * 4 bytes).
	; Writing the 32-bit lend also zero-extends into lenq on x86-64.
	shl lend, 3
	; Advance all three pointers to the end of the vectorized range and negate
	; the length: lenq becomes a negative byte offset that counts up towards 0,
	; so a single add both steps the index and sets the flags for the loop test.
	add tq, lenq
	add cq, lenq
	add sumq, lenq
	neg lenq
	ALIGN 16
	; Each iteration handles two vectors (2*mmsize bytes) as two independent,
	; interleaved streams: m0/m1/m2 for the first vector, m3/m4/m5 for the second.
	.loop:
	; m0/m3 = even-indexed floats of t duplicated: [t.a, t.a, ...]
	movsldup m0, [tq + lenq]
	movsldup m3, [tq + lenq+mmsize]
	; m1/m4 = c as stored: [c.a, c.b, ...]
	movaps m1, [cq + lenq]
	movaps m4, [cq + lenq+mmsize]
	; m0/m3 = [t.a*c.a, t.a*c.b, ...]
	mulps m0, m0, m1
	mulps m3, m3, m4
	; 0xb1 swaps the two floats inside every pair: m1/m4 = [c.b, c.a, ...]
	shufps m1, m1, m1, 0xb1
	shufps m4, m4, m4, 0xb1
	; m2/m5 = odd-indexed floats of t duplicated: [t.b, t.b, ...]
	movshdup m2, [tq + lenq]
	movshdup m5, [tq + lenq+mmsize]
	; m2/m5 = [t.b*c.b, t.b*c.a, ...]
	mulps m2, m2, m1
	mulps m5, m5, m4
	; addsubps subtracts in even lanes and adds in odd lanes:
	; m0/m3 = [t.a*c.a - t.b*c.b, t.a*c.b + t.b*c.a, ...]
	addsubps m0, m0, m2
	addsubps m3, m3, m5
	; Accumulate onto the existing contents of sum and store back in place.
	addps m0, m0, [sumq + lenq]
	addps m3, m3, [sumq + lenq+mmsize]
	movaps [sumq + lenq], m0
	movaps [sumq + lenq+mmsize], m3
	; Step the byte offset by two vectors; keep looping while it is negative.
	add lenq, mmsize*2
	jl .loop
	; Scalar tail: one plain (non-complex) float multiply-accumulate at the
	; offset the loop ended on, using only the low lane of xm0.
	movss xm0, [tq + lenq]
	mulss xm0, [cq + lenq]
	addss xm0, [sumq + lenq]
	movss [sumq + lenq], xm0
	RET
	%endmacro

	; Expand the kernel once per supported instruction set. Each INIT_* line
	; must come before its FCMUL_ADD, because it fixes the register width
	; (m0..m5, mmsize) and the instruction-set tag that x86inc applies to the
	; function emitted by cglobal.
	;
	; 128-bit XMM build; movsldup, movshdup and addsubps are SSE3 instructions.
	INIT_XMM sse3
	FCMUL_ADD
	; 256-bit YMM build for AVX.
	INIT_YMM avx
	FCMUL_ADD