/*
* Alpha optimized DSP utils
* Copyright (c) 2002 Falk Hueffner <[email protected]>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "regdef.h"
/* Some nicer register names. */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument registers (a3-a5) and the return value register (v0) */
#define te a5
#define tf a4
#define tg a3
#define th v0
.set noat
.set noreorder
.arch pca56
.text
/*****************************************************************************
 * int pix_abs16x16_mvi_asm(const uint8_t *pix1, const uint8_t *pix2, int line_size)
 *
 * In the code below the arguments actually arrive in a1-a4 (a1 = pix1,
 * a2 = pix2, a3 = line_size, a4 = h, the number of lines to process);
 * a0 carries an unused first argument and is reused as a scratch
 * register in the aligned loop.  h is assumed to be a multiple of 2 on
 * the unaligned path and a multiple of 4 on the aligned path.
 *
* This code is written with a pca56 in mind. For ev6, one should
* really take the increased latency of 3 cycles for MVI instructions
* into account.
*
* It is important to keep the loading and first use of a register as
* far apart as possible, because if a register is accessed before it
* has been fetched from memory, the CPU will stall.
*/
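/*
 * For reference, the routine computes a 16-pixel-wide sum of absolute
 * differences.  A minimal C sketch (the helper names perr8 and load8
 * are illustrative, not part of any API; perr8 models the MVI perr
 * instruction, which sums the absolute differences of the eight byte
 * lanes of two quadwords):
 *
 *     static uint64_t perr8(uint64_t a, uint64_t b)
 *     {
 *         uint64_t sum = 0;
 *         for (int i = 0; i < 8; i++) {
 *             int x = (a >> (8 * i)) & 0xff;
 *             int y = (b >> (8 * i)) & 0xff;
 *             sum += x > y ? x - y : y - x;
 *         }
 *         return sum;
 *     }
 *
 *     int sad = 0;
 *     for (int line = 0; line < h; line++) {
 *         sad += perr8(load8(pix1 + 0), load8(pix2 + 0));
 *         sad += perr8(load8(pix1 + 8), load8(pix2 + 8));
 *         pix1 += line_size;
 *         pix2 += line_size;
 *     }
 *     return sad;
 *
 * where load8() stands for a little-endian 8-byte load (aligned for
 * pix1, possibly unaligned for pix2).
 */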
.align 4
.globl pix_abs16x16_mvi_asm
.ent pix_abs16x16_mvi_asm
pix_abs16x16_mvi_asm:
.frame sp, 0, ra, 0
.prologue 0
and a2, 7, t0 # pix2 quadword-aligned?
clr v0 # clear error accumulator
beq t0, $aligned
.align 4
$unaligned:
/* Registers:
line 0:
t0: left_u -> left lo -> left
t1: mid
t2: right_u -> right hi -> right
t3: ref left
t4: ref right
line 1:
t5: left_u -> left lo -> left
t6: mid
t7: right_u -> right hi -> right
t8: ref left
t9: ref right
temp:
ta: left hi
tb: right lo
tc: error left
td: error right */
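/* The ldq_u/extql/extqh/or sequences below are the usual Alpha idiom
   for unaligned 8-byte loads.  In rough C terms, with
   shift = 8 * (a2 & 7) (nonzero on this path):

       left  = (left_u >> shift) | (mid     << (64 - shift));
       right = (mid    >> shift) | (right_u << (64 - shift));

   Note that the extracts run after a2 has already been advanced to
   the next line; the shift stays correct only because line_size is
   evidently assumed to be a multiple of 8, which leaves the low three
   bits of a2 unchanged. */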
/* load line 0 */
ldq_u t0, 0(a2) # left_u
ldq_u t1, 8(a2) # mid
ldq_u t2, 16(a2) # right_u
ldq t3, 0(a1) # ref left
ldq t4, 8(a1) # ref right
addq a1, a3, a1 # pix1
addq a2, a3, a2 # pix2
/* load line 1 */
ldq_u t5, 0(a2) # left_u
ldq_u t6, 8(a2) # mid
ldq_u t7, 16(a2) # right_u
ldq t8, 0(a1) # ref left
ldq t9, 8(a1) # ref right
addq a1, a3, a1 # pix1
addq a2, a3, a2 # pix2
/* calc line 0 */
extql t0, a2, t0 # left lo
extqh t1, a2, ta # left hi
extql t1, a2, tb # right lo
or t0, ta, t0 # left
extqh t2, a2, t2 # right hi
perr t3, t0, tc # error left
or t2, tb, t2 # right
perr t4, t2, td # error right
addq v0, tc, v0 # add error left
addq v0, td, v0 # add error right
/* calc line 1 */
extql t5, a2, t5 # left lo
extqh t6, a2, ta # left hi
extql t6, a2, tb # right lo
or t5, ta, t5 # left
extqh t7, a2, t7 # right hi
perr t8, t5, tc # error left
or t7, tb, t7 # right
perr t9, t7, td # error right
addq v0, tc, v0 # add error left
addq v0, td, v0 # add error right
/* loop */
subq a4, 2, a4 # h -= 2
bne a4, $unaligned
ret
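/* With pix2 quadword-aligned, plain ldq loads suffice and no extracts
   are needed.  Four lines are loaded per iteration and the perr/addq
   pairs are interleaved so each result is consumed several cycles
   after it is produced, per the scheduling note above. */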
.align 4
$aligned:
/* load line 0 */
ldq t0, 0(a2) # left
ldq t1, 8(a2) # right
addq a2, a3, a2 # pix2
ldq t2, 0(a1) # ref left
ldq t3, 8(a1) # ref right
addq a1, a3, a1 # pix1
/* load line 1 */
ldq t4, 0(a2) # left
ldq t5, 8(a2) # right
addq a2, a3, a2 # pix2
ldq t6, 0(a1) # ref left
ldq t7, 8(a1) # ref right
addq a1, a3, a1 # pix1
/* load line 2 */
ldq t8, 0(a2) # left
ldq t9, 8(a2) # right
addq a2, a3, a2 # pix2
ldq ta, 0(a1) # ref left
ldq tb, 8(a1) # ref right
addq a1, a3, a1 # pix1
/* load line 3 */
ldq tc, 0(a2) # left
ldq td, 8(a2) # right
addq a2, a3, a2 # pix2
ldq te, 0(a1) # ref left
ldq a0, 8(a1) # ref right
/* calc line 0 */
perr t0, t2, t0 # error left
addq a1, a3, a1 # pix1
perr t1, t3, t1 # error right
addq v0, t0, v0 # add error left
/* calc line 1 */
perr t4, t6, t0 # error left
addq v0, t1, v0 # add error right
perr t5, t7, t1 # error right
addq v0, t0, v0 # add error left
/* calc line 2 */
perr t8, ta, t0 # error left
addq v0, t1, v0 # add error right
perr t9, tb, t1 # error right
addq v0, t0, v0 # add error left
/* calc line 3 */
perr tc, te, t0 # error left
addq v0, t1, v0 # add error right
perr td, a0, t1 # error right
addq v0, t0, v0 # add error left
addq v0, t1, v0 # add error right
/* loop */
subq a4, 4, a4 # h -= 4
bne a4, $aligned
ret
.end pix_abs16x16_mvi_asm