|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "libavutil/arm/asm.S" |
|
|
|
|
|
/* Fixed-point IDCT weights used as multipliers in the row and column passes. */
/* W1-W3 and W5-W7 equal cos(i*pi/16)*sqrt(2)*(1<<14), rounded to nearest, for i = 1..7. */
#define W1 22725 |
|
#define W2 21407 |
|
#define W3 19266 |
|
/* NOTE(review): W4 is 16383, one less than the 16384 the formula above gives; the reason is not visible here. */
#define W4 16383 |
|
#define W5 12873 |
|
#define W6 8867 |
|
#define W7 4520 |
|
/* Mask that keeps the upper 16 bits (most significant half-word) of a 32-bit value. */
#define MASK_MSHW 0xFFFF0000 |
|
|
|
/* Right shift applied to row-pass results (used as "asr #ROW_SHIFT"). */
#define ROW_SHIFT 11 |
|
/* 16 - ROW_SHIFT: left shift used with the 0xFFFF0000 mask so a row result, scaled by >>ROW_SHIFT, lands in the upper half-word. */
#define ROW_SHIFT2MSHW (16-11) |
|
/* Shift for the column pass; the column pseudo-code refers to it as 1 << (COL_SHIFT - 1). */
#define COL_SHIFT 20 |
|
/* 1 << (ROW_SHIFT - 1): rounding bias added to a0 in the row pass. */
#define ROW_SHIFTED_1 1024 |
|
/* 1 << (COL_SHIFT - 1): rounding bias added to a0 in the column pass. */
#define COL_SHIFTED_1 524288 |
|
|
|
|
|
function ff_simple_idct_arm, export=1 |
|
void simple_idct_arm(int16_t *block) |
|
save on the stack the registers that are needed (take all of them), |
|
R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block |
|
so it must not be overwritten, if it is not saved!! |
|
R12 is another scratch register, so it does not need to be saved either |
|
save all registers |
|
stmfd sp!, {r4-r11, r14} |
|
|
|
R14=&block[8*7]: it is better to start from the last row and decrease the pointer until row=0, i.e. R14=block. |
|
add 2 temporary variables in the stack: R0 and R14 |
|
sub sp, sp, #8 |
|
save block in sp[0] |
|
stack status |
|
sp+4 free |
|
sp+0 R0 (block) |
|
|
|
|
|
at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free |
|
|
|
|
|
__row_loop: |
|
read the row and check whether it is null, almost null, or neither. According to the StrongARM specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words); at least this leaves more usable registers :) |
|
ldr r1, [r14, #0] |
|
R2=(int32)(R12)[1]=ROWr32[1] |
|
ldr r3, [r14, #8] |
|
R4=ROWr32[3] |
|
check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), |
|
if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) |
|
else follow the complete algorithm. |
|
at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], |
|
R3=ROWr32[2], R4=ROWr32[3], R5-R11 free |
|
orr r5, r4, r3 |
|
R5=R4 | R3 | R2 |
|
orrs r6, r5, r1 |
|
|
|
R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) |
|
ldrsh r6, [r14, #0] |
|
R5=R4 | R3 | R2 | R7 |
|
beq __almost_empty_row |
|
|
|
__b_evaluation: |
|
at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], |
|
R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, |
|
R12=__const_ptr_, R14=&block[n] |
|
to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 |
|
|
|
MUL16(b0, W1, row[1]); |
|
MUL16(b1, W3, row[1]); |
|
MUL16(b2, W5, row[1]); |
|
MUL16(b3, W7, row[1]); |
|
MAC16(b0, W3, row[3]); |
|
MAC16(b1, -W7, row[3]); |
|
MAC16(b2, -W1, row[3]); |
|
MAC16(b3, -W5, row[3]); |
|
ldr r8, =W1 |
|
R2=ROWr16[3] |
|
mul r0, r8, r7 |
|
R9=W3 |
|
ldr r10, =W5 |
|
R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
|
ldr r11, =W7 |
|
R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
|
mul r7, r11, r7 |
|
if null avoid muls |
|
itttt ne |
|
mlane r0, r9, r2, r0 |
|
R2=-ROWr16[3] |
|
mlane r1, r11, r2, r1 |
|
R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
|
it ne |
|
mlane r7, r10, r2, r7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
R2=ROWr32[2] | ROWr32[3] |
|
beq __end_b_evaluation |
|
|
|
at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], |
|
R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, |
|
R12=__const_ptr_, R14=&block[n] |
|
MAC16(b0, W5, row[5]); |
|
MAC16(b2, W7, row[5]); |
|
MAC16(b3, W3, row[5]); |
|
MAC16(b1, -W1, row[5]); |
|
MAC16(b0, W7, row[7]); |
|
MAC16(b2, W3, row[7]); |
|
MAC16(b3, -W1, row[7]); |
|
MAC16(b1, -W5, row[7]); |
|
mov r3, r3, asr #16 |
|
if null avoid muls |
|
it ne |
|
mlane r0, r10, r3, r0 |
|
R4=ROWr16[7] |
|
itttt ne |
|
mlane r5, r11, r3, r5 |
|
R7+=W3*ROWr16[5]=b3 |
|
rsbne r3, r3, #0 |
|
R7-=W1*ROWr16[5]=b1 |
|
R3 is free now |
|
teq r4, #0 |
|
|
|
R0+=W7*ROWr16[7]=b0 |
|
mlane r5, r9, r4, r5 |
|
R4=-ROWr16[7] |
|
mlane r7, r8, r4, r7 |
|
|
|
R1-=W5*ROWr16[7]=b1 |
|
R4 is free now |
|
__end_b_evaluation: |
|
at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), |
|
R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), |
|
R12=__const_ptr_, R14=&block[n] |
|
|
|
__a_evaluation: |
|
a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); |
|
a1 = a0 + W6 * row[2]; |
|
a2 = a0 - W6 * row[2]; |
|
a3 = a0 - W2 * row[2]; |
|
a0 = a0 + W2 * row[2]; |
|
ldr r9, =W4 |
|
R6=W4*ROWr16[0] |
|
ldr r10, =W6 |
|
R4=ROWr16[2] (a3 not defined yet) |
|
add r6, r6, #ROW_SHIFTED_1 |
|
|
|
R11=W6*ROWr16[2] |
|
ldr r8, =W2 |
|
R3=a0-W6*ROWr16[2] (a2) |
|
temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; |
|
if (temp != 0) {} |
|
teq r2, #0 |
|
beq __end_bef_a_evaluation |
|
|
|
add r2, r6, r11 |
|
R11=W2*ROWr16[2] |
|
sub r4, r6, r11 |
|
R6=a0+W2*ROWr16[2] (a0) |
|
|
|
|
|
at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, |
|
R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), |
|
R12=__const_ptr_, R14=&block[n] |
|
|
|
|
|
a0 += W4*row[4] |
|
a1 -= W4*row[4] |
|
a2 -= W4*row[4] |
|
a3 += W4*row[4] |
|
ldrsh r11, [r14, #8] |
|
if null avoid muls |
|
it ne |
|
mulne r11, r9, r11 |
|
|
|
R9=ROWr16[6] |
|
itttt ne |
|
addne r6, r6, r11 |
|
R2-=W4*ROWr16[4] (a1) |
|
subne r3, r3, r11 |
|
R4+=W4*ROWr16[4] (a3) |
|
W6 alone is no longer needed, so keep W2*ROWr16[6] in its register instead |
|
teq r9, #0 |
|
|
|
R11=W6*ROWr16[6] |
|
addne r6, r6, r11 |
|
R10=W2*ROWr16[6] |
|
a0 += W6*row[6]; |
|
a3 -= W6*row[6]; |
|
a1 -= W2*row[6]; |
|
a2 += W2*row[6]; |
|
subne r4, r4, r11 |
|
|
|
R2-=W2*ROWr16[6] (a1) |
|
addne r3, r3, r10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
R8=a0+b0 |
|
add r9, r2, r1 |
|
|
|
|
|
R10=0xFFFF0000 |
|
and r9, r10, r9, lsl #ROW_SHIFT2MSHW |
|
R11= NOT R10= 0x0000FFFF |
|
and r8, r11, r8, asr #ROW_SHIFT |
|
|
|
|
|
|
|
R8=a2+b2 |
|
add r9, r4, r7 |
|
R9=0xFFFF0000 & ((a3+b3)<<5) |
|
and r8, r11, r8, asr #ROW_SHIFT |
|
|
|
|
|
|
|
R8=a3-b3 |
|
sub r9, r3, r5 |
|
R9=0xFFFF0000 & ((a2-b2)<<5) |
|
and r8, r11, r8, asr #ROW_SHIFT |
|
|
|
|
|
|
|
R8=a1-b1 |
|
sub r9, r6, r0 |
|
R9=0xFFFF0000 & ((a0-b0)<<5) |
|
and r8, r11, r8, asr #ROW_SHIFT |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). |
|
sub r8, r8, #1 |
|
R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF |
|
orr r5, r5, r5, lsl #16 |
|
R14[0]=ROWr32[0]=R5 |
|
str r5, [r14, #4] |
|
R14[8]=ROWr32[2]=R5 |
|
str r5, [r14, #12] |
|
|
|
|
|
|
|
|
|
R0=block |
|
teq r0, r14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. |
|
__col_loop: |
|
|
|
__b_evaluation2: |
|
at this point, R0=block (temp), R1-R11 (free) |
|
R12=__const_ptr_, R14=&block[n] |
|
proceed with b0-b3 first, followed by a0-a3 |
|
MUL16(b0, W1, col[8x1]); |
|
MUL16(b1, W3, col[8x1]); |
|
MUL16(b2, W5, col[8x1]); |
|
MUL16(b3, W7, col[8x1]); |
|
MAC16(b0, W3, col[8x3]); |
|
MAC16(b1, -W7, col[8x3]); |
|
MAC16(b2, -W1, col[8x3]); |
|
MAC16(b3, -W5, col[8x3]); |
|
ldr r8, =W1 |
|
|
|
R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
|
ldr r9, =W3 |
|
R10=W5 |
|
mul r1, r9, r7 |
|
R11=W7 |
|
mul r5, r10, r7 |
|
|
|
R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) |
|
teq r2, #0 |
|
|
|
R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
|
rsbne r2, r2, #0 |
|
R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
|
mlane r5, r8, r2, r5 |
|
|
|
R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) |
|
|
|
at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), |
|
R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, |
|
R12=__const_ptr_, R14=&block[n] |
|
MAC16(b0, W5, col[5x8]); |
|
MAC16(b2, W7, col[5x8]); |
|
MAC16(b3, W3, col[5x8]); |
|
MAC16(b1, -W1, col[5x8]); |
|
MAC16(b0, W7, col[7x8]); |
|
MAC16(b2, W3, col[7x8]); |
|
MAC16(b3, -W1, col[7x8]); |
|
MAC16(b1, -W5, col[7x8]); |
|
ldrsh r3, [r14, #80] |
|
if 0 then avoid muls |
|
itttt ne |
|
mlane r0, r10, r3, r0 |
|
R5+=W7*ROWr16[5x8]=b2 |
|
mlane r7, r9, r3, r7 |
|
R3=-ROWr16[5x8] |
|
ldrsh r4, [r14, #112] |
|
|
|
R7-=W1*ROWr16[5x8]=b1 |
|
R3 is free now |
|
teq r4, #0 |
|
|
|
R0+=W7*ROWr16[7x8]=b0 |
|
mlane r5, r9, r4, r5 |
|
R4=-ROWr16[7x8] |
|
mlane r7, r8, r4, r7 |
|
|
|
R1-=W5*ROWr16[7x8]=b1 |
|
R4 is free now |
|
__end_b_evaluation2: |
|
at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), |
|
R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), |
|
R12=__const_ptr_, R14=&block[n] |
|
|
|
__a_evaluation2: |
|
a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); |
|
a1 = a0 + W6 * row[2]; |
|
a2 = a0 - W6 * row[2]; |
|
a3 = a0 - W2 * row[2]; |
|
a0 = a0 + W2 * row[2]; |
|
ldrsh r6, [r14, #0] |
|
ldr r9, =W4 |
|
R6=W4*ROWr16[0] |
|
ldr r10, =W6 |
|
R4=ROWr16[2] (a3 not defined yet) |
|
add r6, r6, #COL_SHIFTED_1 |
|
R11=W6*ROWr16[2] |
|
ldr r8, =W2 |
|
R2=a0+W6*ROWr16[2] (a1) |
|
sub r3, r6, r11 |
|
R11=W2*ROWr16[2] |
|
sub r4, r6, r11 |
|
R6=a0+W2*ROWr16[2] (a0) |
|
|
|
at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, |
|
R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), |
|
R12=__const_ptr_, R14=&block[n] |
|
a0 += W4*row[4] |
|
a1 -= W4*row[4] |
|
a2 -= W4*row[4] |
|
a3 += W4*row[4] |
|
ldrsh r11, [r14, #64] |
|
if null avoid muls |
|
itttt ne |
|
mulne r11, r9, r11 |
|
|
|
R6+=W4*ROWr16[4] (a0) |
|
subne r2, r2, r11 |
|
R3-=W4*ROWr16[4] (a2) |
|
ldrsh r9, [r14, #96] |
|
|
|
R4+=W4*ROWr16[4] (a3) |
|
W6 alone is no longer needed, so keep W2*ROWr16[6] in its register instead |
|
teq r9, #0 |
|
|
|
R11=W6*ROWr16[6] |
|
addne r6, r6, r11 |
|
R10=W2*ROWr16[6] |
|
a0 += W6*row[6]; |
|
a3 -= W6*row[6]; |
|
a1 -= W2*row[6]; |
|
a2 += W2*row[6]; |
|
subne r4, r4, r11 |
|
|
|
R2-=W2*ROWr16[6] (a1) |
|
addne r3, r3, r10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
no optimization here |
|
R8=a0+b0 |
|
add r9, r2, r1 |
|
|
|
|
|
|
|
|
|
R8=a2+b2 |
|
add r9, r4, r7 |
|
|
|
|
|
|
|
|
|
R8=a3-b3 |
|
sub r9, r3, r5 |
|
|
|
|
|
|
|
|
|
R8=a1-b1 |
|
sub r9, r6, r0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
R0=block |
|
teq r0, r14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
R2=a0+W6*ROWr16[2] (a1) |
|
mul r11, r8, r4 |
|
R4=a0-W2*ROWr16[2] (a3) |
|
add r6, r6, r11 |
|
|
|
|
|
|