;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009  Sebastien Lucas <[email protected]>
;* Copyright (C) 2009  Zuxy Meng <[email protected]>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text
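
; DIAG4 emits one 8-pixel row of a 4-tap filter:
;   %1    - base pointer
;   %2-%5 - the four tap offsets relative to %1
;   %6    - destination pointer (8 bytes are written)
; SPLAT4REGS must already have broadcast the four weights (into m4/m5/m6/m3
; with SSE2, into the [rsp+8*11..8*14] scratch slots with MMX), and m7 must
; be zero. A loose C model of one expansion follows; src, dst, w and step
; are illustrative names, not part of this file:
;
;   /* taps sit at (i-1)*step: step is 1 horizontally, 8 vertically */
;   for (int x = 0; x < 8; x++) {
;       int v = 0;
;       for (int i = 0; i < 4; i++)
;           v += src[x + (i - 1) * step] * w[i];
;       v = (v + 64) >> 7;                       /* paddsw pw_64; psraw 7 */
;       dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* packuswb saturation   */
;   }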
%macro DIAG4 6
%if mmsize == 8
    movq  m0, [%1+%2]
    movq  m1, [%1+%3]
    movq  m3, m0
    movq  m4, m1
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpckhbw m3, m7
    punpckhbw m4, m7
    pmullw    m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m1, [rsp+8*12] ; src[x   ] * biweight [1]
    pmullw    m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m4, [rsp+8*12] ; src[x   ] * biweight [1]
    paddw     m0, m1
    paddw     m3, m4
    movq  m1, [%1+%4]
    movq  m2, [%1+%5]
    movq  m4, m1
    movq  m5, m2
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m4, m7
    punpckhbw m5, m7
    pmullw    m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m2, [rsp+8*14] ; src[x+16] * biweight [3]
    pmullw    m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m5, [rsp+8*14] ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddw     m4, m5
    paddsw    m0, m1
    paddsw    m3, m4
    paddsw    m0, m6         ; Add 64
    paddsw    m3, m6         ; Add 64
    psraw     m0, 7
    psraw     m3, 7
    packuswb  m0, m3
    movq  [%6], m0
%else ; mmsize == 16
    movq  m0, [%1+%2]
    movq  m1, [%1+%3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw    m0, m4         ; src[x-8 ] * biweight [0]
    pmullw    m1, m5         ; src[x   ] * biweight [1]
    paddw     m0, m1
    movq  m1, [%1+%4]
    movq  m2, [%1+%5]
    punpcklbw m1, m7
    punpcklbw m2, m7
    pmullw    m1, m6         ; src[x+8 ] * biweight [2]
    pmullw    m2, m3         ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddsw    m0, m1
    paddsw    m0, [pw_64]    ; Add 64
    psraw     m0, 7
    packuswb  m0, m0
    movq  [%6], m0
%endif ; mmsize == 8/16
%endmacro
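
; SPLAT4REGS broadcasts the four 16-bit weights packed in m3 so DIAG4 can
; multiply a full register of pixels by one weight at a time. With SSE2
; each weight is replicated eight times across m4/m5/m6/m3; the MMX variant
; has no registers to spare, so it replicates each weight four times and
; spills the results to the [rsp+8*11..8*14] scratch slots instead.
;   in:  m3 = w3 w2 w1 w0                                (words)
;   out: m4 = w0 x8, m5 = w1 x8, m6 = w2 x8, m3 = w3 x8  (SSE2 case)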
%macro SPLAT4REGS 0
%if mmsize == 8
    movq       m5, m3
    punpcklwd  m3, m3
    movq       m4, m3
    punpckldq  m3, m3
    punpckhdq  m4, m4
    punpckhwd  m5, m5
    movq       m2, m5
    punpckhdq  m2, m2
    punpckldq  m5, m5
    movq [rsp+8*11], m3
    movq [rsp+8*12], m4
    movq [rsp+8*13], m5
    movq [rsp+8*14], m2
%else ; mmsize == 16
    pshuflw    m4, m3, 0x0
    pshuflw    m5, m3, 0x55
    pshuflw    m6, m3, 0xAA
    pshuflw    m3, m3, 0xFF
    punpcklqdq m4, m4
    punpcklqdq m5, m5
    punpcklqdq m6, m6
    punpcklqdq m3, m3
%endif ; mmsize == 8/16
%endmacro
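
; The filter runs in two diagonal passes through an 8x11 intermediate
; buffer on the aligned stack: an 11-row horizontal pass over src (taps at
; x-1..x+2), then an 8-row vertical pass over that buffer into dst; buffer
; rows are 8 bytes apart, hence the -8/0/8/16 tap offsets. A loose C model
; of the driver, reusing the illustrative filter loop sketched above DIAG4
; as filter_row(dst, src, step, w):
;
;   uint8_t tmp[11][8];
;   for (int y = 0; y < 11; y++)   /* rows src[-1]..src[9], horizontal   */
;       filter_row(tmp[y], src + (y - 1) * stride, 1, h_weights);
;   for (int y = 0; y < 8; y++)    /* 4 buffer rows per output, vertical */
;       filter_row(dst + y * stride, tmp[y + 1], 8, v_weights);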
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
;                                const int16_t h_weights[4], const int16_t v_weights[4])
INIT_XMM sse2
cglobal vp6_filter_diag4, 5, 7, 8
    mov  r5, rsp               ; backup stack pointer
    and  rsp, ~(mmsize-1)      ; align stack
%if mmsize == 16
    sub  rsp, 8*11             ; 8x11 intermediate buffer
%else
    sub  rsp, 8*15             ; 8x11 buffer + 4 scratch slots for the weights
    movq m6, [pw_64]           ; rounding bias, kept in m6 on MMX
%endif

    sub  r1, r2                ; start one row above src

    pxor m7, m7                ; zero register for the byte->word unpacks
    movq m3, [r3]              ; h_weights
    SPLAT4REGS

    mov  r3, rsp
    mov  r6, 11                ; 8 output rows + 3 extra for the vertical taps
.nextrow:
    DIAG4 r1, -1, 0, 1, 2, r3  ; horizontal pass into the intermediate buffer
    add  r3, 8
    add  r1, r2
    dec  r6
    jnz .nextrow

    movq m3, [r4]              ; v_weights
    SPLAT4REGS

    lea  r3, [rsp+8]           ; second row of the intermediate buffer
    mov  r6, 8
.nextcol:
    DIAG4 r3, -8, 0, 8, 16, r0 ; vertical pass into dst
    add  r3, 8
    add  r0, r2
    dec  r6
    jnz .nextcol

    mov  rsp, r5               ; restore stack pointer
    RET
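
; The INIT_XMM sse2 instantiation exports this as ff_vp6_filter_diag4_sse2,
; matching the prototype comment above; FFmpeg picks it at runtime from the
; detected CPU flags. Hedged usage sketch (surrounding declarations and the
; hw/vw weight arrays are assumed, not part of this file):
;
;   void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src,
;                                 ptrdiff_t stride,
;                                 const int16_t h_weights[4],
;                                 const int16_t v_weights[4]);
;   ...
;   ff_vp6_filter_diag4_sse2(dst, src, stride, hw, vw);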