#include "libavutil/arm/asm.S"
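
@ Core register aliases: r0-r2 carry the incoming arguments and, once
@ ff_imdct_half_vfp has returned, are reused under the names I, P_SB2_UP and
@ OLDFPSCR.  The VC/VD accumulators deliberately share physical registers with
@ VA/VB; the two pairs are only live in separate passes.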
IMDCT          .req    r0
ORIG_P_SB      .req    r1
P_SB_OFF       .req    r2
I              .req    r0
P_SB2_UP       .req    r1
OLDFPSCR       .req    r2
P_SB2_DN       .req    r3
P_WIN_DN       .req    r4
P_OUT_DN       .req    r5
P_SB           .req    r6
J_WRAP         .req    r7
P_WIN_UP       .req    r12
P_OUT_UP       .req    r14

SCALE          .req    s0
SBUF_DAT_REV0  .req    s4
SBUF_DAT_REV1  .req    s5
SBUF_DAT_REV2  .req    s6
SBUF_DAT_REV3  .req    s7
VA0            .req    s8
VA3            .req    s11
VB0            .req    s12
VB3            .req    s15
VC0            .req    s8
VC3            .req    s11
VD0            .req    s12
VD3            .req    s15
SBUF_DAT0      .req    s16
SBUF_DAT1      .req    s17
SBUF_DAT2      .req    s18
SBUF_DAT3      .req    s19
SBUF_DAT_ALT0  .req    s20
SBUF_DAT_ALT1  .req    s21
SBUF_DAT_ALT2  .req    s22
SBUF_DAT_ALT3  .req    s23
WIN_DN_DAT0    .req    s24
WIN_UP_DAT0    .req    s28
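
@ inner_loop: one software-pipelined step, covering four floats of synth_buf.
@   half - "ab" accumulates into VA (down window, vmls) and VB (up window);
@          "cd" accumulates into VD (down window) and VC (up window)
@   tail - if non-empty, emit the down-window multiply-accumulate using the
@          reversed samples and window data loaded by the previous call
@   head - if non-empty, load the next four samples and both window halves,
@          reverse the samples ready for the following tail, and emit the
@          up-window multiply-accumulate
@ SBUF_DAT and SBUF_DAT_ALT are used on alternate calls so that loading this
@ call's samples is not held up by the previous call's use of its own.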
.macro inner_loop half, tail, head
.if (OFFSET & (64*4)) == 0 @ even numbered call
SBUF_DAT_THIS0 .req SBUF_DAT0
SBUF_DAT_THIS1 .req SBUF_DAT1
SBUF_DAT_THIS2 .req SBUF_DAT2
SBUF_DAT_THIS3 .req SBUF_DAT3
.ifnc "\head",""
vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT
vldr d9, [P_SB, #OFFSET+8]
.endif
.else
SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
.ifnc "\head",""
vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT
vldr d11, [P_SB, #OFFSET+8]
.endif
.endif
.ifnc "\tail",""
.ifc "\half","ab"
vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
.else
vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
.endif
.endif
.ifnc "\head",""
vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT
vldr d15, [P_WIN_UP, #OFFSET+8]
vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT
vldr d13, [P_WIN_DN, #OFFSET+8]
vmov SBUF_DAT_REV3, SBUF_DAT_THIS0
vmov SBUF_DAT_REV2, SBUF_DAT_THIS1
vmov SBUF_DAT_REV1, SBUF_DAT_THIS2
vmov SBUF_DAT_REV0, SBUF_DAT_THIS3
.ifc "\half","ab"
vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
.else
vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
.endif
teq J_WRAP, #J
bne 2f @ strongly predictable, so better than cond exec in this case
sub P_SB, P_SB, #512*4
2:
.set J, J - 64
.set OFFSET, OFFSET + 64*4
.endif
.unreq SBUF_DAT_THIS0
.unreq SBUF_DAT_THIS1
.unreq SBUF_DAT_THIS2
.unreq SBUF_DAT_THIS3
.endm
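
@ Arguments, as implied by the register and stack usage below:
@   IMDCT (r0)     = FFTContext *imdct (passed straight through to imdct_half)
@   ORIG_P_SB (r1) = float *synth_buf_ptr
@   P_SB_OFF (r2)  = int *synth_buf_offset
@   r3             = float synth_buf2[32]
@   caller's stack = const float window[512], float out[32], const float in[32],
@                    and (softfp only) float scale; in the hardfp ABI scale
@                    arrives in s0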
function ff_synth_filter_float_vfp, export=1
push {r3-r7,lr}
vpush {s16-s31}
ldr lr, [P_SB_OFF]
add a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half
mov P_SB, a2 @ and keep a copy for ourselves
bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop
sub lr, lr, #32
and lr, lr, #512-32
str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
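@ After the push/vpush above, sp points at the 16 saved VFP words, then the 6
@ saved core registers (the first of which is r3 = synth_buf2), then the
@ caller's stack arguments: window, out, in and, for softfp, scale.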
ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
bl X(ff_imdct_half_vfp)
VFP vmov SCALE, s16
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
ldr P_SB2_DN, [sp, #16*4]
ldr P_WIN_DN, [sp, #(16+6+0)*4]
ldr P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP vldr SCALE, [sp, #(16+6+3)*4]
#define IMM_OFF_SKEW 956
add P_SB, P_SB, #IMM_OFF_SKEW @ skew the pointer so negative offsets can be used, exploiting vldr's full +/-1020 immediate range
add P_SB2_UP, P_SB2_DN, #16*4
add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
add P_OUT_UP, P_OUT_DN, #16*4
add P_SB2_DN, P_SB2_DN, #16*4
add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
add P_OUT_DN, P_OUT_DN, #16*4
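
@ First pass: four outer iterations, each producing four "down" (VA) and four
@ "up" (VB) output samples.  VA/VB are seeded from synth_buf2, accumulate
@ eight window terms each through the pipelined inner_loop calls, and are
@ scaled by SCALE before being stored to out[].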
mov I, #4
1:
vldmia P_SB2_UP!, {VB0-VB3}
vldmdb P_SB2_DN!, {VA0-VA3}
.set J, 512 - 64
.set OFFSET, -IMM_OFF_SKEW
inner_loop ab,, head
.rept 7
inner_loop ab, tail, head
.endr
inner_loop ab, tail
add P_WIN_UP, P_WIN_UP, #4*4
sub P_WIN_DN, P_WIN_DN, #4*4
vmul.f VB0, VB0, SCALE @ SCALE treated as scalar
add P_SB, P_SB, #(512+4)*4
subs I, I, #1
vmul.f VA0, VA0, SCALE
vstmia P_OUT_UP!, {VB0-VB3}
vstmdb P_OUT_DN!, {VA0-VA3}
bne 1b
add P_SB2_DN, P_SB2_DN, #(16+28-12)*4
sub P_SB2_UP, P_SB2_UP, #(16+16)*4
add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
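
@ Second pass: same structure, but the VC/VD accumulators start from zero
@ (loaded from the "zero" literal below) and the results are written back
@ into synth_buf2 instead of out[], ready for the next call.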
mov I, #4
1:
vldr.d d4, zero @ d4 = VC0
vldr.d d5, zero
vldr.d d6, zero @ d6 = VD0
vldr.d d7, zero
.set J, 512 - 64
.set OFFSET, -IMM_OFF_SKEW
inner_loop cd,, head
.rept 7
inner_loop cd, tail, head
.endr
inner_loop cd, tail
add P_WIN_UP, P_WIN_UP, #4*4
sub P_WIN_DN, P_WIN_DN, #4*4
add P_SB, P_SB, #(512+4)*4
subs I, I, #1
vstmia P_SB2_UP!, {VC0-VC3}
vstmdb P_SB2_DN!, {VD0-VD3}
bne 1b
fmxr FPSCR, OLDFPSCR
vpop {s16-s31}
pop {r3-r7,pc}
endfunc
.unreq IMDCT
.unreq ORIG_P_SB
.unreq P_SB_OFF
.unreq I
.unreq P_SB2_UP
.unreq OLDFPSCR
.unreq P_SB2_DN
.unreq P_WIN_DN
.unreq P_OUT_DN
.unreq P_SB
.unreq J_WRAP
.unreq P_WIN_UP
.unreq P_OUT_UP
.unreq SCALE
.unreq SBUF_DAT_REV0
.unreq SBUF_DAT_REV1
.unreq SBUF_DAT_REV2
.unreq SBUF_DAT_REV3
.unreq VA0
.unreq VA3
.unreq VB0
.unreq VB3
.unreq VC0
.unreq VC3
.unreq VD0
.unreq VD3
.unreq SBUF_DAT0
.unreq SBUF_DAT1
.unreq SBUF_DAT2
.unreq SBUF_DAT3
.unreq SBUF_DAT_ALT0
.unreq SBUF_DAT_ALT1
.unreq SBUF_DAT_ALT2
.unreq SBUF_DAT_ALT3
.unreq WIN_DN_DAT0
.unreq WIN_UP_DAT0
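
@ 64-bit zero constant, loaded with vldr to clear the VC/VD accumulators.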
.align 3
zero: .word 0, 0