#include "asm-offsets.h"
#include "libavutil/aarch64/asm.S"
// inner_loop: one multiply-accumulate step, four floats per stream.
//
// Every pointer below is post-incremented by x15 bytes.
//   x8, x9, x10, x11   four data streams; the vectors read through x8 and
//                      x11 have their four lanes reversed before use
//   x4, x5, x6, x7     four coefficient streams
// Accumulators updated:
//   v4 -= [x4] * reverse([x8])
//   v5 += [x5] * [x9]
//   v6 += [x6] * [x10]
//   v7 += [x7] * reverse([x11])
// Clobbers v24-v31.
// None of the instructions used here writes the condition flags; the
// expansion sites in this file rely on that, placing the macro between a
// cmp/subs and the conditional branch that consumes the result.
// Loads and arithmetic are interleaved, presumably to hide load latency;
// keep the order unless the change is benchmarked.
.macro inner_loop
// Data vectors.
ld1 {v29.4s}, [x9], x15
ld1 {v28.4s}, [x8], x15
ld1 {v30.4s}, [x10], x15
ld1 {v31.4s}, [x11], x15
// rev64 swaps the two 32-bit lanes inside each 64-bit half; together with
// the ext #8 further down (which swaps the halves) v28 becomes lanes 3,2,1,0.
rev64 v28.4s, v28.4s
// Coefficient vectors.
ld1 {v24.4s}, [x4], x15
ld1 {v25.4s}, [x5], x15
// Same two-step lane reversal for v31.
rev64 v31.4s, v31.4s
ld1 {v26.4s}, [x6], x15
// v5 += v25 * v29
fmla v5.4s, v25.4s, v29.4s
ld1 {v27.4s}, [x7], x15
// Second half of the lane reversal: swap the 64-bit halves.
ext v28.16b, v28.16b, v28.16b, #8
ext v31.16b, v31.16b, v31.16b, #8
// v6 += v26 * v30
fmla v6.4s, v26.4s, v30.4s
// v4 -= v24 * reversed v28 (the only subtracting accumulation)
fmls v4.4s, v24.4s, v28.4s
// v7 += v27 * reversed v31
fmla v7.4s, v27.4s, v31.4s
.endm
// ff_synth_filter_float_neon
//
// Calls a transform through a function pointer stored in the context, then
// runs four passes of eight-way multiply-accumulate (inner_loop) over the
// transformed buffer and a coefficient table, producing 32 output floats
// and 32 updated state floats.
//
// Register arguments as used by the code below. The names in parentheses
// are a reviewer guess at the C-side parameter names - TODO confirm against
// the C declaration of this function:
//   x0  context pointer; a function pointer is loaded from
//       [x0 + IMDCT_HALF] and called with x0 passed through unchanged
//   x1  base of a float buffer (synth_buf_ptr?)
//   x2  pointer to a 32-bit offset into that buffer, counted in floats
//       (synth_buf_offset?); rewritten to (offset - 32) & 511
//   x3  float array, 32 entries read and then overwritten (synth_buf2?)
//   x4  coefficient table, only read (window?)
//   x5  float array, 32 entries written (out?)
//   x6  forwarded as third argument of the indirect call (in?)
//   s0  scale factor multiplied into the values stored through x5
//
// Stack frame (64 bytes, keeps sp 16-byte aligned):
//   [sp, #0]  x3      [sp, #8]  x4      [sp, #16] x5
//   [sp, #24] x1 + offset * 4           [sp, #32] offset & ~63
//   [sp, #40] x30 (return address)      [sp, #48] s0
//
// Only caller-saved registers are used: x0-x15, v0, v4-v7, v24-v31.
function ff_synth_filter_float_neon, export=1
// w7 = *x2, the current offset (32-bit).
ldr w7, [x2]
// x9 = function pointer stored in the context at byte offset IMDCT_HALF.
ldr x9, [x0, #IMDCT_HALF]
// Sign-extend the offset so it can be used in 64-bit address arithmetic.
sxtw x7, w7
// Allocate the frame and save x3/x4 in its first two slots.
stp x3, x4, [sp, #-64]!
// x1 = buffer base + offset floats (lsl #2 scales by sizeof(float)).
add x1, x1, x7, lsl #2
// w8 = offset - 32; wrapped into 0..511 two instructions below.
sub w8, w7, #32
// Save x5 and the adjusted buffer pointer.
stp x5, x1, [sp, #16]
// x7 = offset rounded down to a multiple of 64.
and x7, x7, #~63
and w8, w8, #511
// Save the rounded offset together with the link register, which the blr
// below overwrites.
stp x7, x30, [sp, #32]
// *x2 = (offset - 32) & 511, the offset to be used by the next call.
str w8, [x2]
// v0 is caller-saved, so keep the scale factor on the stack across the call.
str s0, [sp, #48]
// Indirect call with x0 = context (unchanged), x1 = adjusted buffer
// pointer, x2 = incoming x6.
mov x2, x6
blr x9
// Reload the saved state into the registers used by the filter loops:
//   x2  = incoming x3          x4 = incoming x4
//   x13 = incoming x5          x9 = adjusted buffer pointer
//   x0  = offset & ~63         x30 = return address
//   s0  = scale factor
ldp x2, x4, [sp]
ldp x13, x9, [sp, #16]
ldp x0, x30, [sp, #32]
ldr s0, [sp, #48]
// Second pointers, 16 floats past x2 and x13 respectively.
add x3, x2, #16*4
add x14, x13, #16*4
// x8 = buffer pointer + 12 floats; this stream is consumed lane-reversed
// by inner_loop and is stepped backwards between outer passes.
add x8, x9, #12*4
// x15 = post-increment stride for every inner_loop load: 64 floats.
mov x15, #64*4
// x1 = outer pass counter; each pass handles 4 floats per accumulator.
mov x1, #4
1:
// Per-pass setup: x10/x11 sit 16 floats above x9/x8, and x5/x6/x7 sit
// 16/32/48 floats above the coefficient pointer x4.
add x10, x9, #16*4
add x11, x8, #16*4
add x5, x4, #16*4
add x6, x4, #32*4
add x7, x4, #48*4
// v4/v5 start from the values currently stored at [x2]/[x3];
// v6/v7 start from zero.
ld1 {v4.4s}, [x2]
ld1 {v5.4s}, [x3]
movi v6.4s, #0
movi v7.4s, #0
// x12 counts down from 512 in steps of 64.
mov x12, #512
2:
// First stretch: repeat while x12 > x0 (signed). The cmp result survives
// the macro expansion because inner_loop does not write the flags.
sub x12, x12, #64
cmp x12, x0
inner_loop
b.gt 2b
// Move x8/x9 back by 512 floats. Presumably the buffer is a 512-float ring
// and this is the wrap-around point - TODO confirm against the C caller.
sub x8, x8, #512*4
sub x9, x9, #512*4
// x12 == 0 means the countdown is already finished: skip the second stretch.
cbz x12, 4f
// Otherwise move x10/x11 back as well and finish the countdown.
sub x10, x10, #512*4
sub x11, x11, #512*4
3:
// Second stretch: repeat until x12 reaches 0.
subs x12, x12, #64
inner_loop
b.gt 3b
4:
// Decrement the pass counter now; the fmul/st1 instructions below leave
// the flags alone, so b.le still tests this result.
subs x1, x1, #1
// Scale v4/v5 by lane 0 of v0 (the s0 argument).
fmul v4.4s, v4.4s, v0.s[0]
fmul v5.4s, v5.4s, v0.s[0]
// v6/v7 overwrite the locations v4/v5 were loaded from; the scaled v4/v5
// go to the x13/x14 output pointers. Each pointer advances by 4 floats.
st1 {v6.4s}, [x2], #16
st1 {v7.4s}, [x3], #16
st1 {v4.4s}, [x13], #16
st1 {v5.4s}, [x14], #16
b.le 10f
// Prepare the next pass. Assuming the usual 8 inner_loop runs per pass
// (holds when 0 <= offset < 512), x4 has advanced by 512 floats, so
// subtracting 508 leaves it 4 floats further on than in this pass.
sub x4, x4, #508*4
// Forward stream moves up by 4 floats, reversed stream moves down by 4.
add x9, x9, #4*4
sub x8, x8, #4*4
b 1b
10:
// Release the frame; x30 was already reloaded after the indirect call.
add sp, sp, #64
ret
endfunc