;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009  Sebastien Lucas <[email protected]>
;* Copyright (C) 2009  Zuxy Meng <[email protected]>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text
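
; DIAG4 emits one 8-pixel row of a 4-tap filter:
;   %1    - base pointer
;   %2-%5 - the four tap offsets relative to %1
;   %6    - destination pointer (8 bytes are written)
; SPLAT4REGS must already have broadcast the four weights (into m4/m5/m6/m3
; with SSE2, into the [rsp+8*11..8*14] scratch slots with MMX), and m7 must
; be zero. A loose C model of one expansion follows; src, dst, w and step
; are illustrative names, not part of this file:
;
;   /* taps sit at (i-1)*step: step is 1 horizontally, 8 vertically */
;   for (int x = 0; x < 8; x++) {
;       int v = 0;
;       for (int i = 0; i < 4; i++)
;           v += src[x + (i - 1) * step] * w[i];
;       v = (v + 64) >> 7;                       /* paddsw pw_64; psraw 7 */
;       dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* packuswb saturation   */
;   }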
%macro DIAG4 6
%if mmsize == 8
    movq  m0, [%1+%2]
    movq  m1, [%1+%3]
    movq  m3, m0
    movq  m4, m1
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpckhbw m3, m7
    punpckhbw m4, m7
    pmullw    m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m1, [rsp+8*12] ; src[x   ] * biweight [1]
    pmullw    m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m4, [rsp+8*12] ; src[x   ] * biweight [1]
    paddw     m0, m1
    paddw     m3, m4
    movq  m1, [%1+%4]
    movq  m2, [%1+%5]
    movq  m4, m1
    movq  m5, m2
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m4, m7
    punpckhbw m5, m7
    pmullw    m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m2, [rsp+8*14] ; src[x+16] * biweight [3]
    pmullw    m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m5, [rsp+8*14] ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddw     m4, m5
    paddsw    m0, m1
    paddsw    m3, m4
    paddsw    m0, m6         ; Add 64
    paddsw    m3, m6         ; Add 64
    psraw     m0, 7
    psraw     m3, 7
    packuswb  m0, m3
    movq  [%6], m0
%else ; mmsize == 16
    movq  m0, [%1+%2]
    movq  m1, [%1+%3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw    m0, m4         ; src[x-8 ] * biweight [0]
    pmullw    m1, m5         ; src[x   ] * biweight [1]
    paddw     m0, m1
    movq  m1, [%1+%4]
    movq  m2, [%1+%5]
    punpcklbw m1, m7
    punpcklbw m2, m7
    pmullw    m1, m6         ; src[x+8 ] * biweight [2]
    pmullw    m2, m3         ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddsw    m0, m1
    paddsw    m0, [pw_64]    ; Add 64
    psraw     m0, 7
    packuswb  m0, m0
    movq  [%6], m0
%endif ; mmsize == 8/16
%endmacro
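
; SPLAT4REGS broadcasts the four 16-bit weights packed in m3 so DIAG4 can
; multiply a full register of pixels by one weight at a time. With SSE2
; each weight is replicated eight times across m4/m5/m6/m3; the MMX variant
; has no registers to spare, so it replicates each weight four times and
; spills the results to the [rsp+8*11..8*14] scratch slots instead.
;   in:  m3 = w3 w2 w1 w0                                (words)
;   out: m4 = w0 x8, m5 = w1 x8, m6 = w2 x8, m3 = w3 x8  (SSE2 case)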
%macro SPLAT4REGS 0
%if mmsize == 8
    movq       m5, m3
    punpcklwd  m3, m3
    movq       m4, m3
    punpckldq  m3, m3
    punpckhdq  m4, m4
    punpckhwd  m5, m5
    movq       m2, m5
    punpckhdq  m2, m2
    punpckldq  m5, m5
    movq [rsp+8*11], m3
    movq [rsp+8*12], m4
    movq [rsp+8*13], m5
    movq [rsp+8*14], m2
%else ; mmsize == 16
    pshuflw    m4, m3, 0x0
    pshuflw    m5, m3, 0x55
    pshuflw    m6, m3, 0xAA
    pshuflw    m3, m3, 0xFF
    punpcklqdq m4, m4
    punpcklqdq m5, m5
    punpcklqdq m6, m6
    punpcklqdq m3, m3
%endif ; mmsize == 8/16
%endmacro
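
; The filter runs in two diagonal passes through an 8x11 intermediate
; buffer on the aligned stack: an 11-row horizontal pass over src (taps at
; x-1..x+2), then an 8-row vertical pass over that buffer into dst; buffer
; rows are 8 bytes apart, hence the -8/0/8/16 tap offsets. A loose C model
; of the driver, reusing the illustrative filter loop sketched above DIAG4
; as filter_row(dst, src, step, w):
;
;   uint8_t tmp[11][8];
;   for (int y = 0; y < 11; y++)   /* rows src[-1]..src[9], horizontal   */
;       filter_row(tmp[y], src + (y - 1) * stride, 1, h_weights);
;   for (int y = 0; y < 8; y++)    /* 4 buffer rows per output, vertical */
;       filter_row(dst + y * stride, tmp[y + 1], 8, v_weights);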
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
;                                const int16_t h_weights[4], const int16_t v_weights[4])
INIT_XMM sse2
cglobal vp6_filter_diag4, 5, 7, 8
    mov  r5, rsp               ; backup stack pointer
    and  rsp, ~(mmsize-1)      ; align stack
%if mmsize == 16
    sub  rsp, 8*11             ; 8x11 intermediate buffer
%else
    sub  rsp, 8*15             ; 8x11 buffer + 4 scratch slots for the weights
    movq m6, [pw_64]           ; rounding bias, kept in m6 on MMX
%endif

    sub  r1, r2                ; start one row above src

    pxor m7, m7                ; zero register for the byte->word unpacks
    movq m3, [r3]              ; h_weights
    SPLAT4REGS

    mov  r3, rsp
    mov  r6, 11                ; 8 output rows + 3 extra for the vertical taps
.nextrow:
    DIAG4 r1, -1, 0, 1, 2, r3  ; horizontal pass into the intermediate buffer
    add  r3, 8
    add  r1, r2
    dec  r6
    jnz .nextrow

    movq m3, [r4]              ; v_weights
    SPLAT4REGS

    lea  r3, [rsp+8]           ; second row of the intermediate buffer
    mov  r6, 8
.nextcol:
    DIAG4 r3, -8, 0, 8, 16, r0 ; vertical pass into dst
    add  r3, 8
    add  r0, r2
    dec  r6
    jnz .nextcol

    mov  rsp, r5               ; restore stack pointer
    RET
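
; The INIT_XMM sse2 instantiation exports this as ff_vp6_filter_diag4_sse2,
; matching the prototype comment above; FFmpeg picks it at runtime from the
; detected CPU flags. Hedged usage sketch (surrounding declarations and the
; hw/vw weight arrays are assumed, not part of this file):
;
;   void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src,
;                                 ptrdiff_t stride,
;                                 const int16_t h_weights[4],
;                                 const int16_t v_weights[4]);
;   ...
;   ff_vp6_filter_diag4_sse2(dst, src, stride, hw, vw);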