#include "libavutil/aarch64/asm.S"
#include "neon.S"
const itxfm4_coeffs, align=4
.short 11585, 0, 6270, 15137
iadst4_coeffs:
.short 5283, 15212, 9929, 13377
endconst
const iadst8_coeffs, align=4
.short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
.short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
.short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
.short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
.short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
const iadst16_coeffs, align=4
.short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
.short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst
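// Transpose a 4x4 matrix of 32 bit elements held in r0-r3, using r4-r7 as scratch.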
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4s, \r0\().4s, \r1\().4s
trn2 \r5\().4s, \r0\().4s, \r1\().4s
trn1 \r6\().4s, \r2\().4s, \r3\().4s
trn2 \r7\().4s, \r2\().4s, \r3\().4s
trn1 \r0\().2d, \r4\().2d, \r6\().2d
trn2 \r2\().2d, \r4\().2d, \r6\().2d
trn1 \r1\().2d, \r5\().2d, \r7\().2d
trn2 \r3\().2d, \r5\().2d, \r7\().2d
.endm
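// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers (r0/r1 = row 0, ..., r14/r15 = row 7); t0-t3 are scratch.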
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
transpose_4x4s \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
transpose_4x4s \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3
trn1 \t0\().4s, \r1\().4s, \r3\().4s
trn2 \t1\().4s, \r1\().4s, \r3\().4s
trn1 \t2\().4s, \r5\().4s, \r7\().4s
trn2 \t3\().4s, \r5\().4s, \r7\().4s
trn1 \r1\().4s, \r8\().4s, \r10\().4s
trn2 \r3\().4s, \r8\().4s, \r10\().4s
trn1 \r5\().4s, \r12\().4s, \r14\().4s
trn2 \r7\().4s, \r12\().4s, \r14\().4s
trn1 \r8\().2d, \t0\().2d, \t2\().2d
trn2 \r12\().2d, \t0\().2d, \t2\().2d
trn1 \r10\().2d, \t1\().2d, \t3\().2d
trn2 \r14\().2d, \t1\().2d, \t3\().2d
trn1 \t0\().2d, \r1\().2d, \r5\().2d
trn2 \r5\().2d, \r1\().2d, \r5\().2d
trn1 \t1\().2d, \r3\().2d, \r7\().2d
trn2 \r7\().2d, \r3\().2d, \r7\().2d
mov \r1\().16b, \t0\().16b
mov \r3\().16b, \t1\().16b
.endm
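// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; tmp5/tmp6 are optional extra scratch registers.
// With neg=1 the coefficient used for out1 is negated.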
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
neg \tmp4\().4s, v0.4s
.endif
add \tmp1\().4s, \in1\().4s, \in2\().4s
sub \tmp2\().4s, \in1\().4s, \in2\().4s
.if \neg > 0
smull \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
smull2 \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
smull \tmp3\().2d, \tmp1\().2s, v0.s[0]
smull2 \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
rshrn \out1\().2s, \tmp3\().2d, #14
rshrn2 \out1\().4s, \tmp4\().2d, #14
smull \tmp3\().2d, \tmp2\().2s, v0.s[0]
smull2 \tmp4\().2d, \tmp2\().4s, v0.s[0]
rshrn \out2\().2s, \tmp3\().2d, #14
rshrn2 \out2\().4s, \tmp4\().2d, #14
.else
smull \tmp5\().2d, \tmp2\().2s, v0.s[0]
smull2 \tmp6\().2d, \tmp2\().4s, v0.s[0]
rshrn \out1\().2s, \tmp3\().2d, #14
rshrn2 \out1\().4s, \tmp4\().2d, #14
rshrn \out2\().2s, \tmp5\().2d, #14
rshrn2 \out2\().4s, \tmp6\().2d, #14
.endif
.endm
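// Same as dmbutterfly0 above, but treating the input in in2 as zero and
// writing the same result to both out1 and out2.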
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
smull \tmp1\().2d, \in1\().2s, v0.s[0]
smull2 \tmp2\().2d, \in1\().4s, v0.s[0]
rshrn \out1\().2s, \tmp1\().2d, #14
rshrn2 \out1\().4s, \tmp2\().2d, #14
rshrn \out2\().2s, \tmp1\().2d, #14
rshrn2 \out2\().4s, \tmp2\().2d, #14
.endm
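// out1,out2 = in1 * coef1 - in2 * coef2 (64 bit intermediates, low/high half)
// out3,out4 = in1 * coef2 + in2 * coef1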
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
smull \out1\().2d, \in1\().2s, \coef1
smull2 \out2\().2d, \in1\().4s, \coef1
smull \out3\().2d, \in1\().2s, \coef2
smull2 \out4\().2d, \in1\().4s, \coef2
smlsl \out1\().2d, \in2\().2s, \coef2
smlsl2 \out2\().2d, \in2\().4s, \coef2
smlal \out3\().2d, \in2\().2s, \coef1
smlal2 \out4\().2d, \in2\().4s, \coef1
.endm
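// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// With neg=1 the second output is negated before narrowing.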
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
neg \tmp3\().2d, \tmp3\().2d
neg \tmp4\().2d, \tmp4\().2d
.endif
rshrn \inout1\().2s, \tmp1\().2d, #14
rshrn2 \inout1\().4s, \tmp2\().2d, #14
rshrn \inout2\().2s, \tmp3\().2d, #14
rshrn2 \inout2\().4s, \tmp4\().2d, #14
.endm
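// Versions of dmbutterfly that assume one of the two inputs is zero:
// dmbutterfly_h1 assumes inout2 == 0, dmbutterfly_h2 assumes inout1 == 0.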
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
smull \tmp1\().2d, \inout1\().2s, \coef1
smull2 \tmp2\().2d, \inout1\().4s, \coef1
smull \tmp3\().2d, \inout1\().2s, \coef2
smull2 \tmp4\().2d, \inout1\().4s, \coef2
rshrn \inout1\().2s, \tmp1\().2d, #14
rshrn2 \inout1\().4s, \tmp2\().2d, #14
rshrn \inout2\().2s, \tmp3\().2d, #14
rshrn2 \inout2\().4s, \tmp4\().2d, #14
.endm
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
smull \tmp1\().2d, \inout2\().2s, \coef2
smull2 \tmp2\().2d, \inout2\().4s, \coef2
smull \tmp3\().2d, \inout2\().2s, \coef1
smull2 \tmp4\().2d, \inout2\().4s, \coef1
neg \tmp1\().2d, \tmp1\().2d
neg \tmp2\().2d, \tmp2\().2d
rshrn \inout2\().2s, \tmp3\().2d, #14
rshrn2 \inout2\().4s, \tmp4\().2d, #14
rshrn \inout1\().2s, \tmp1\().2d, #14
rshrn2 \inout1\().4s, \tmp2\().2d, #14
.endm
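// dsmull_h: widening multiply of one .4s register by a scalar coefficient into
// two .2d halves; drshrn_h narrows such a pair back with a rounding shift.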
.macro dsmull_h out1, out2, in, coef
smull \out1\().2d, \in\().2s, \coef
smull2 \out2\().2d, \in\().4s, \coef
.endm
.macro drshrn_h out, in1, in2, shift
rshrn \out\().2s, \in1\().2d, \shift
rshrn2 \out\().4s, \in2\().2d, \shift
.endm
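// out1 = in1 + in2, out2 = in1 - in2 (the _r variant swaps the two outputs)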
.macro butterfly_4s out1, out2, in1, in2
add \out1\().4s, \in1\().4s, \in2\().4s
sub \out2\().4s, \in1\().4s, \in2\().4s
.endm
.macro butterfly_4s_r out1, out2, in1, in2
sub \out1\().4s, \in1\().4s, \in2\().4s
add \out2\().4s, \in1\().4s, \in2\().4s
.endm
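// out1 = ((in1,in2) + (in3,in4) + (1 << 13)) >> 14
// out2 = ((in1,in2) - (in3,in4) + (1 << 13)) >> 14
// where in1/in3 are the low and in2/in4 the high 64 bit halves.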
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
add \tmp1\().2d, \in1\().2d, \in3\().2d
add \tmp2\().2d, \in2\().2d, \in4\().2d
sub \tmp3\().2d, \in1\().2d, \in3\().2d
sub \tmp4\().2d, \in2\().2d, \in4\().2d
rshrn \out1\().2s, \tmp1\().2d, #14
rshrn2 \out1\().4s, \tmp2\().2d, #14
rshrn \out2\().2s, \tmp3\().2d, #14
rshrn2 \out2\().4s, \tmp4\().2d, #14
.endm
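// 4 point inverse Walsh-Hadamard transform (lossless mode), identical at
// 10 and 12 bpp.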
.macro iwht4_10 c0, c1, c2, c3
add \c0\().4s, \c0\().4s, \c1\().4s
sub v17.4s, \c2\().4s, \c3\().4s
sub v16.4s, \c0\().4s, v17.4s
sshr v16.4s, v16.4s, #1
sub \c2\().4s, v16.4s, \c1\().4s
sub \c1\().4s, v16.4s, \c3\().4s
add \c3\().4s, v17.4s, \c2\().4s
sub \c0\().4s, \c0\().4s, \c1\().4s
.endm
.macro iwht4_12 c0, c1, c2, c3
iwht4_10 \c0, \c1, \c2, \c3
.endm
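// 4 point idct. The _10 version keeps the multiplies within 32 bit, the _12
// version widens to 64 bit so the larger 12 bpp intermediates don't overflow.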
.macro idct4_10 c0, c1, c2, c3
mul v22.4s, \c1\().4s, v0.s[3]
mul v20.4s, \c1\().4s, v0.s[2]
add v16.4s, \c0\().4s, \c2\().4s
sub v17.4s, \c0\().4s, \c2\().4s
mla v22.4s, \c3\().4s, v0.s[2]
mul v18.4s, v16.4s, v0.s[0]
mul v24.4s, v17.4s, v0.s[0]
mls v20.4s, \c3\().4s, v0.s[3]
srshr v22.4s, v22.4s, #14
srshr v18.4s, v18.4s, #14
srshr v24.4s, v24.4s, #14
srshr v20.4s, v20.4s, #14
add \c0\().4s, v18.4s, v22.4s
sub \c3\().4s, v18.4s, v22.4s
add \c1\().4s, v24.4s, v20.4s
sub \c2\().4s, v24.4s, v20.4s
.endm
.macro idct4_12 c0, c1, c2, c3
smull v22.2d, \c1\().2s, v0.s[3]
smull2 v23.2d, \c1\().4s, v0.s[3]
smull v20.2d, \c1\().2s, v0.s[2]
smull2 v21.2d, \c1\().4s, v0.s[2]
add v16.4s, \c0\().4s, \c2\().4s
sub v17.4s, \c0\().4s, \c2\().4s
smlal v22.2d, \c3\().2s, v0.s[2]
smlal2 v23.2d, \c3\().4s, v0.s[2]
smull v18.2d, v16.2s, v0.s[0]
smull2 v19.2d, v16.4s, v0.s[0]
smull v24.2d, v17.2s, v0.s[0]
smull2 v25.2d, v17.4s, v0.s[0]
smlsl v20.2d, \c3\().2s, v0.s[3]
smlsl2 v21.2d, \c3\().4s, v0.s[3]
rshrn v22.2s, v22.2d, #14
rshrn2 v22.4s, v23.2d, #14
rshrn v18.2s, v18.2d, #14
rshrn2 v18.4s, v19.2d, #14
rshrn v24.2s, v24.2d, #14
rshrn2 v24.4s, v25.2d, #14
rshrn v20.2s, v20.2d, #14
rshrn2 v20.4s, v21.2d, #14
add \c0\().4s, v18.4s, v22.4s
sub \c3\().4s, v18.4s, v22.4s
add \c1\().4s, v24.4s, v20.4s
sub \c2\().4s, v24.4s, v20.4s
.endm
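// 4 point iadst, with the same 32 vs 64 bit split between _10 and _12 as
// the idct above.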
.macro iadst4_10 c0, c1, c2, c3
mul v16.4s, \c0\().4s, v1.s[0]
mla v16.4s, \c2\().4s, v1.s[1]
mla v16.4s, \c3\().4s, v1.s[2]
mul v18.4s, \c0\().4s, v1.s[2]
mls v18.4s, \c2\().4s, v1.s[0]
sub \c0\().4s, \c0\().4s, \c2\().4s
mls v18.4s, \c3\().4s, v1.s[1]
add \c0\().4s, \c0\().4s, \c3\().4s
mul v22.4s, \c1\().4s, v1.s[3]
mul v20.4s, \c0\().4s, v1.s[3]
add v24.4s, v16.4s, v22.4s
add v26.4s, v18.4s, v22.4s
srshr \c0\().4s, v24.4s, #14
add v16.4s, v16.4s, v18.4s
srshr \c1\().4s, v26.4s, #14
sub v16.4s, v16.4s, v22.4s
srshr \c2\().4s, v20.4s, #14
srshr \c3\().4s, v16.4s, #14
.endm
.macro iadst4_12 c0, c1, c2, c3
smull v16.2d, \c0\().2s, v1.s[0]
smull2 v17.2d, \c0\().4s, v1.s[0]
smlal v16.2d, \c2\().2s, v1.s[1]
smlal2 v17.2d, \c2\().4s, v1.s[1]
smlal v16.2d, \c3\().2s, v1.s[2]
smlal2 v17.2d, \c3\().4s, v1.s[2]
smull v18.2d, \c0\().2s, v1.s[2]
smull2 v19.2d, \c0\().4s, v1.s[2]
smlsl v18.2d, \c2\().2s, v1.s[0]
smlsl2 v19.2d, \c2\().4s, v1.s[0]
sub \c0\().4s, \c0\().4s, \c2\().4s
smlsl v18.2d, \c3\().2s, v1.s[1]
smlsl2 v19.2d, \c3\().4s, v1.s[1]
add \c0\().4s, \c0\().4s, \c3\().4s
smull v22.2d, \c1\().2s, v1.s[3]
smull2 v23.2d, \c1\().4s, v1.s[3]
smull v20.2d, \c0\().2s, v1.s[3]
smull2 v21.2d, \c0\().4s, v1.s[3]
add v24.2d, v16.2d, v22.2d
add v25.2d, v17.2d, v23.2d
add v26.2d, v18.2d, v22.2d
add v27.2d, v19.2d, v23.2d
rshrn \c0\().2s, v24.2d, #14
rshrn2 \c0\().4s, v25.2d, #14
add v16.2d, v16.2d, v18.2d
add v17.2d, v17.2d, v19.2d
rshrn \c1\().2s, v26.2d, #14
rshrn2 \c1\().4s, v27.2d, #14
sub v16.2d, v16.2d, v22.2d
sub v17.2d, v17.2d, v23.2d
rshrn \c2\().2s, v20.2d, #14
rshrn2 \c2\().4s, v21.2d, #14
rshrn \c3\().2s, v16.2d, #14
rshrn2 \c3\().4s, v17.2d, #14
.endm
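// Entry points: ff_vp9_<txfm1>_<txfm2>_4x4_add_<bpp>_neon(dst, stride, block, eob)
// with x0 = dst (16 bit pixels), x1 = stride in bytes, x2 = 32 bit coefficients,
// w3 = eob. The two 1-D transforms are applied with a transpose in between;
// for idct_idct, eob == 1 takes a DC-only shortcut. For iwht the input is
// pre-shifted down by 2 and the final rounding by 4 is skipped. The result is
// added to dst and clamped to the pixel maximum for the bit depth.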
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
movrel x4, itxfm4_coeffs
ld1 {v0.4h}, [x4]
sxtl v0.4s, v0.4h
.endif
.ifc \txfm1,iadst
movrel x4, iadst4_coeffs
ld1 {v0.d}[1], [x4]
sxtl2 v1.4s, v0.8h
.endif
.else
movrel x4, itxfm4_coeffs
ld1 {v0.8h}, [x4]
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
.endif
movi v30.4s, #0
movi v31.4s, #0
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.ne 1f
ld1 {v2.s}[0], [x2]
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
st1 {v31.s}[0], [x2]
dup v4.4s, v2.s[0]
mov v5.16b, v4.16b
mov v6.16b, v4.16b
mov v7.16b, v4.16b
b 2f
.endif
1:
ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2]
st1 {v30.4s,v31.4s}, [x2], #32
.ifc \txfm1,iwht
sshr v4.4s, v4.4s, #2
sshr v5.4s, v5.4s, #2
sshr v6.4s, v6.4s, #2
sshr v7.4s, v7.4s, #2
.endif
\txfm1\()4_\bpp v4, v5, v6, v7
st1 {v30.4s,v31.4s}, [x2], #32
transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19
\txfm2\()4_\bpp v4, v5, v6, v7
2:
mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 // v31 = pixel max, (1 << \bpp) - 1
ld1 {v0.4h}, [x0], x1
ld1 {v1.4h}, [x0], x1
.ifnc \txfm1,iwht
srshr v4.4s, v4.4s, #4
srshr v5.4s, v5.4s, #4
srshr v6.4s, v6.4s, #4
srshr v7.4s, v7.4s, #4
.endif
uaddw v4.4s, v4.4s, v0.4h
uaddw v5.4s, v5.4s, v1.4h
ld1 {v2.4h}, [x0], x1
ld1 {v3.4h}, [x0], x1
sqxtun v0.4h, v4.4s
sqxtun2 v0.8h, v5.4s
sub x0, x0, x1, lsl #2
uaddw v6.4s, v6.4s, v2.4h
umin v0.8h, v0.8h, v31.8h
uaddw v7.4s, v7.4s, v3.4h
st1 {v0.4h}, [x0], x1
sqxtun v2.4h, v6.4s
sqxtun2 v2.8h, v7.4s
umin v2.8h, v2.8h, v31.8h
st1 {v0.d}[1], [x0], x1
st1 {v2.4h}, [x0], x1
st1 {v2.d}[1], [x0], x1
ret
endfunc
.endm
.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct, idct, \bpp
itxfm_func4x4 iadst, idct, \bpp
itxfm_func4x4 idct, iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht, iwht, \bpp
.endm
itxfm_funcs4x4 10
itxfm_funcs4x4 12
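// 8x8 idct for a block where only the DC coefficient is set: compute the single
// output value, round it by 5 and add it to all 64 pixels, clamping to the
// pixel max passed in w5.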
function idct8x8_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
movi v1.4h, #0
sxtl v0.4s, v0.4h
ld1 {v2.s}[0], [x2]
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
st1 {v1.s}[0], [x2]
dup v2.4s, v2.s[0]
srshr v2.4s, v2.4s, #5
mov x4, #8
mov x3, x0
dup v31.8h, w5
1:
subs x4, x4, #2
ld1 {v3.8h}, [x0], x1
ld1 {v4.8h}, [x0], x1
uaddw v16.4s, v2.4s, v3.4h
uaddw2 v17.4s, v2.4s, v3.8h
uaddw v18.4s, v2.4s, v4.4h
uaddw2 v19.4s, v2.4s, v4.8h
sqxtun v3.4h, v16.4s
sqxtun2 v3.8h, v17.4s
sqxtun v4.4h, v18.4s
sqxtun2 v4.8h, v19.4s
umin v3.8h, v3.8h, v31.8h
umin v4.8h, v4.8h, v31.8h
st1 {v3.8h}, [x3], x1
st1 {v4.8h}, [x3], x1
b.ne 1b
ret
endfunc
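// 8 point idct of the values in r0-r7 (one .4s vector each), t0-t5 scratch.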
.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5
dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3
dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3
dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3
butterfly_4s \t0, \t1, \r0, \r6
butterfly_4s \t2, \r5, \r1, \r5
butterfly_4s \t3, \r6, \r7, \r3
butterfly_4s \r7, \r4, \r4, \r2
dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5
butterfly_4s \r1, \r6, \r7, \r6
butterfly_4s \r0, \r7, \t0, \t3
butterfly_4s \r2, \r5, \r4, \r5
butterfly_4s \r3, \r4, \t1, \t2
.endm
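// 8 point iadst; the odd-numbered outputs are produced negated and fixed up
// with the neg instructions below.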
.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]
dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]
dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1
dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5
dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]
dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]
dbutterfly_n \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3
dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5
butterfly_4s \r7, \r4, \r4, \r0
neg \r7\().4s, \r7\().4s
butterfly_4s \r0, \r1, \r3, \r1
dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]
dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]
dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3
dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2
neg \r3\().4s, \r3\().4s
dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1
neg \r1\().4s, \r1\().4s
dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5
neg \r5\().4s, \r5\().4s
.endm
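// 8x8 add function. For idct_idct, eob == 1 branches to idct8x8_dc_add_neon.
// Each row of 8 32 bit coefficients spans two registers, so every pass runs the
// 8 point transform twice (once per 4-column half) with an 8x8 transpose in
// between the two passes. The iadst variants clobber v8/v9 and save d8/d9.
// w5 holds the pixel max, set by the 10/12 bit wrappers below.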
.macro itxfm_func8x8 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.eq idct8x8_dc_add_neon
.endif
.ifc \txfm1\()_\txfm2,idct_idct
movrel x4, idct_coeffs
.else
movrel x4, iadst8_coeffs
ld1 {v1.8h}, [x4], #16
stp d8, d9, [sp, #-0x10]!
sxtl2 v3.4s, v1.8h
sxtl v2.4s, v1.4h
.endif
ld1 {v0.8h}, [x4]
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
movi v4.4s, #0
movi v5.4s, #0
movi v6.4s, #0
movi v7.4s, #0
1:
ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64
ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
sub x2, x2, #256
st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
.ifc \txfm1\()_\txfm2,idct_idct
idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
.else
\txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
\txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
.endif
transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
.ifc \txfm1\()_\txfm2,idct_idct
idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
.else
\txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
\txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
.endif
2:
mov x3, x0
ld1 {v0.8h}, [x0], x1
srshr v16.4s, v16.4s, #5
srshr v17.4s, v17.4s, #5
ld1 {v1.8h}, [x0], x1
srshr v18.4s, v18.4s, #5
srshr v19.4s, v19.4s, #5
ld1 {v2.8h}, [x0], x1
srshr v20.4s, v20.4s, #5
srshr v21.4s, v21.4s, #5
uaddw v16.4s, v16.4s, v0.4h
uaddw2 v17.4s, v17.4s, v0.8h
ld1 {v3.8h}, [x0], x1
srshr v22.4s, v22.4s, #5
srshr v23.4s, v23.4s, #5
uaddw v18.4s, v18.4s, v1.4h
uaddw2 v19.4s, v19.4s, v1.8h
ld1 {v4.8h}, [x0], x1
srshr v24.4s, v24.4s, #5
srshr v25.4s, v25.4s, #5
uaddw v20.4s, v20.4s, v2.4h
uaddw2 v21.4s, v21.4s, v2.8h
sqxtun v0.4h, v16.4s
sqxtun2 v0.8h, v17.4s
dup v16.8h, w5
ld1 {v5.8h}, [x0], x1
srshr v26.4s, v26.4s, #5
srshr v27.4s, v27.4s, #5
uaddw v22.4s, v22.4s, v3.4h
uaddw2 v23.4s, v23.4s, v3.8h
sqxtun v1.4h, v18.4s
sqxtun2 v1.8h, v19.4s
umin v0.8h, v0.8h, v16.8h
ld1 {v6.8h}, [x0], x1
srshr v28.4s, v28.4s, #5
srshr v29.4s, v29.4s, #5
uaddw v24.4s, v24.4s, v4.4h
uaddw2 v25.4s, v25.4s, v4.8h
sqxtun v2.4h, v20.4s
sqxtun2 v2.8h, v21.4s
umin v1.8h, v1.8h, v16.8h
ld1 {v7.8h}, [x0], x1
srshr v30.4s, v30.4s, #5
srshr v31.4s, v31.4s, #5
uaddw v26.4s, v26.4s, v5.4h
uaddw2 v27.4s, v27.4s, v5.8h
sqxtun v3.4h, v22.4s
sqxtun2 v3.8h, v23.4s
umin v2.8h, v2.8h, v16.8h
st1 {v0.8h}, [x3], x1
uaddw v28.4s, v28.4s, v6.4h
uaddw2 v29.4s, v29.4s, v6.8h
st1 {v1.8h}, [x3], x1
sqxtun v4.4h, v24.4s
sqxtun2 v4.8h, v25.4s
umin v3.8h, v3.8h, v16.8h
st1 {v2.8h}, [x3], x1
uaddw v30.4s, v30.4s, v7.4h
uaddw2 v31.4s, v31.4s, v7.8h
st1 {v3.8h}, [x3], x1
sqxtun v5.4h, v26.4s
sqxtun2 v5.8h, v27.4s
umin v4.8h, v4.8h, v16.8h
st1 {v4.8h}, [x3], x1
sqxtun v6.4h, v28.4s
sqxtun2 v6.8h, v29.4s
umin v5.8h, v5.8h, v16.8h
st1 {v5.8h}, [x3], x1
sqxtun v7.4h, v30.4s
sqxtun2 v7.8h, v31.4s
umin v6.8h, v6.8h, v16.8h
st1 {v6.8h}, [x3], x1
umin v7.8h, v7.8h, v16.8h
st1 {v7.8h}, [x3], x1
.ifnc \txfm1\()_\txfm2,idct_idct
ldp d8, d9, [sp], 0x10
.endif
ret
endfunc
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
mov x5, #0x03ff // pixel max for 10 bit
b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
mov x5, #0x0fff // pixel max for 12 bit
b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc
.endm
itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst
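// 16x16 idct for a DC-only block; w13 holds the pixel max.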
function idct16x16_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
sxtl v0.4s, v0.4h
movi v1.4h, #0
ld1 {v2.s}[0], [x2]
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
st1 {v1.s}[0], [x2]
dup v2.4s, v2.s[0]
srshr v0.4s, v2.4s, #6
mov x3, x0
mov x4, #16
dup v31.8h, w13
1:
subs x4, x4, #2
ld1 {v1.8h,v2.8h}, [x0], x1
uaddw v16.4s, v0.4s, v1.4h
uaddw2 v17.4s, v0.4s, v1.8h
ld1 {v3.8h,v4.8h}, [x0], x1
uaddw v18.4s, v0.4s, v2.4h
uaddw2 v19.4s, v0.4s, v2.8h
uaddw v20.4s, v0.4s, v3.4h
uaddw2 v21.4s, v0.4s, v3.8h
uaddw v22.4s, v0.4s, v4.4h
uaddw2 v23.4s, v0.4s, v4.8h
sqxtun v1.4h, v16.4s
sqxtun2 v1.8h, v17.4s
sqxtun v2.4h, v18.4s
sqxtun2 v2.8h, v19.4s
sqxtun v3.4h, v20.4s
sqxtun2 v3.8h, v21.4s
sqxtun v4.4h, v22.4s
sqxtun2 v4.8h, v23.4s
umin v1.8h, v1.8h, v31.8h
umin v2.8h, v2.8h, v31.8h
st1 {v1.8h,v2.8h}, [x3], x1
umin v3.8h, v3.8h, v31.8h
umin v4.8h, v4.8h, v31.8h
st1 {v3.8h,v4.8h}, [x3], x1
b.ne 1b
ret
endfunc
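// Shared tail of the 16 point idct variants below, finishing the butterflies
// and leaving out[0]..out[15] in v16-v31.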
.macro idct16_end
butterfly_4s v18, v7, v4, v7
butterfly_4s v19, v22, v5, v22
butterfly_4s v4, v26, v20, v26
butterfly_4s v5, v6, v28, v6
butterfly_4s v20, v28, v16, v24
butterfly_4s v24, v21, v23, v21
butterfly_4s v23, v27, v25, v27
butterfly_4s v25, v29, v29, v17
dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31
dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31
butterfly_4s v16, v31, v18, v25
butterfly_4s v17, v30, v19, v23
butterfly_4s_r v25, v22, v22, v24
butterfly_4s v23, v24, v7, v20
butterfly_4s v18, v29, v4, v8
butterfly_4s v19, v28, v5, v28
butterfly_4s v20, v27, v6, v27
butterfly_4s v21, v26, v26, v9
ret
.endm
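// Full 16 point idct of v16-v31 (4 columns per call), with the coefficients
// preloaded in v0-v3 and v4-v9 used as scratch.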
function idct16
dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9
dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7
dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7
dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7
dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7
dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7
dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7
dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7
butterfly_4s v4, v28, v16, v28
butterfly_4s v5, v20, v24, v20
butterfly_4s v6, v26, v18, v26
butterfly_4s v7, v22, v30, v22
butterfly_4s v16, v25, v17, v25
butterfly_4s v24, v21, v29, v21
butterfly_4s v17, v27, v19, v27
butterfly_4s v29, v23, v31, v23
dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31
dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31
dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1
idct16_end
endfunc
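// Same as idct16, but assuming only the first 8 input coefficients (v16-v23)
// are nonzero.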
function idct16_half
dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9
dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7
dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7
dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7
dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7
dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7
dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7
dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7
butterfly_4s v4, v28, v16, v28
butterfly_4s v5, v20, v24, v20
butterfly_4s v6, v26, v18, v26
butterfly_4s v7, v22, v30, v22
butterfly_4s v16, v25, v17, v25
butterfly_4s v24, v21, v29, v21
butterfly_4s v17, v27, v19, v27
butterfly_4s v29, v23, v31, v23
dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31
dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31
dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1
idct16_end
endfunc
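// Same as idct16, but assuming only the first 4 input coefficients (v16-v19)
// are nonzero.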
function idct16_quarter
dsmull_h v24, v25, v19, v3.s[3]
dsmull_h v4, v5, v17, v2.s[0]
dsmull_h v7, v6, v18, v1.s[1]
dsmull_h v30, v31, v18, v1.s[0]
neg v24.2d, v24.2d
neg v25.2d, v25.2d
dsmull_h v29, v28, v17, v2.s[1]
dsmull_h v26, v27, v19, v3.s[2]
dsmull_h v22, v23, v16, v0.s[0]
drshrn_h v24, v24, v25, #14
drshrn_h v16, v4, v5, #14
drshrn_h v7, v7, v6, #14
drshrn_h v6, v30, v31, #14
drshrn_h v29, v29, v28, #14
drshrn_h v17, v26, v27, #14
drshrn_h v28, v22, v23, #14
dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
neg v22.2d, v22.2d
neg v23.2d, v23.2d
drshrn_h v27, v20, v21, #14
drshrn_h v21, v22, v23, #14
drshrn_h v23, v18, v19, #14
drshrn_h v25, v30, v31, #14
mov v4.16b, v28.16b
mov v5.16b, v28.16b
dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
mov v20.16b, v28.16b
idct16_end
endfunc
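// 16 point iadst of v16-v31; reloads the iadst16 coefficients from x11 and a
// subset of the idct coefficients from x10, clobbering v0-v15 as scratch.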
function iadst16
ld1 {v0.8h,v1.8h}, [x11]
sxtl v2.4s, v1.4h
sxtl2 v3.4s, v1.8h
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0]
dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0]
dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11
dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]
dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9
dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2]
dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7
dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0]
dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5
dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]
dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15
dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2]
dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13
dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2]
ld1 {v0.8h}, [x10]
dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]
dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9
dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0]
dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5
dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3]
butterfly_4s_r v4, v27, v16, v27
dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7
dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]
butterfly_4s_r v5, v20, v31, v20
dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13
dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15
butterfly_4s_r v6, v25, v18, v25
butterfly_4s_r v7, v22, v29, v22
dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3]
dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]
dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13
dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15
neg v29.4s, v29.4s
dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3]
dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2]
butterfly_4s v2, v6, v27, v25
butterfly_4s v3, v7, v23, v21
dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9
neg v19.4s, v19.4s
dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11
butterfly_4s v5, v8, v20, v22
butterfly_4s v4, v9, v24, v26
dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1
dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1
dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15
dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15
neg v31.4s, v5.4s
neg v17.4s, v3.4s
mov v16.16b, v2.16b
mov v30.16b, v4.16b
ret
endfunc
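// Helpers for the 16x16 and 32x32 functions: load or store one .4s register
// with a post-increment; load_clear additionally zeroes the source as it is
// read (v4 must hold zero).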
.macro load i, src, inc
ld1 {v\i\().4s}, [\src], \inc
.endm
.macro store i, dst, inc
st1 {v\i\().4s}, [\dst], \inc
.endm
.macro movi_v i, size, imm
movi v\i\()\size, \imm
.endm
.macro load_clear i, src, inc
ld1 {v\i\().4s}, [\src]
st1 {v4.4s}, [\src], \inc
.endm
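// Round eight .4s coefficient registers by 6, add them to eight rows of four
// pixels read alternately through x0 and x3, clamp against the pixel max in v8
// and store the result back.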
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
srshr \coef0, \coef0, #6
ld1 {v4.4h}, [x0], x1
srshr \coef1, \coef1, #6
ld1 {v4.d}[1], [x3], x1
srshr \coef2, \coef2, #6
ld1 {v5.4h}, [x0], x1
srshr \coef3, \coef3, #6
uaddw \coef0, \coef0, v4.4h
ld1 {v5.d}[1], [x3], x1
srshr \coef4, \coef4, #6
uaddw2 \coef1, \coef1, v4.8h
ld1 {v6.4h}, [x0], x1
srshr \coef5, \coef5, #6
uaddw \coef2, \coef2, v5.4h
ld1 {v6.d}[1], [x3], x1
sqxtun v4.4h, \coef0
srshr \coef6, \coef6, #6
uaddw2 \coef3, \coef3, v5.8h
ld1 {v7.4h}, [x0], x1
sqxtun2 v4.8h, \coef1
srshr \coef7, \coef7, #6
uaddw \coef4, \coef4, v6.4h
ld1 {v7.d}[1], [x3], x1
umin v4.8h, v4.8h, v8.8h
sub x0, x0, x1, lsl #2
sub x3, x3, x1, lsl #2
sqxtun v5.4h, \coef2
uaddw2 \coef5, \coef5, v6.8h
st1 {v4.4h}, [x0], x1
sqxtun2 v5.8h, \coef3
uaddw \coef6, \coef6, v7.4h
st1 {v4.d}[1], [x3], x1
umin v5.8h, v5.8h, v8.8h
sqxtun v6.4h, \coef4
uaddw2 \coef7, \coef7, v7.8h
st1 {v5.4h}, [x0], x1
sqxtun2 v6.8h, \coef5
st1 {v5.d}[1], [x3], x1
umin v6.8h, v6.8h, v8.8h
sqxtun v7.4h, \coef6
st1 {v6.4h}, [x0], x1
sqxtun2 v7.8h, \coef7
st1 {v6.d}[1], [x3], x1
umin v7.8h, v7.8h, v8.8h
st1 {v7.4h}, [x0], x1
st1 {v7.d}[1], [x3], x1
.endm
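// 1-D 16 point transform working on a 4-column slice at a time.
// Pass 1 reads 32 bit coefficients from the block (zeroing them as it goes)
// and stores a transposed 4x16 slice to the temp buffer; pass 2 reads the temp
// buffer, transforms again and adds the rounded result into the destination.
// x2 = input, x9 = input stride; in pass 1 x0 points into the temp buffer and
// x1 is the slice index, in pass 2 x0/x1 are dst/stride and x3 the slice index.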
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_4x16_pass1_neon
mov x14, x30
movi v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
bl \txfm\()16
transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
cmp x1, #12
b.eq 1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
store \i, x0, #16
.endr
br x14
1:
add x0, x0, #16
st1 {v20.4s}, [x0], #16
st1 {v24.4s}, [x0], #16
st1 {v28.4s}, [x0], #16
add x0, x0, #16
st1 {v21.4s}, [x0], #16
st1 {v25.4s}, [x0], #16
st1 {v29.4s}, [x0], #16
add x0, x0, #16
st1 {v22.4s}, [x0], #16
st1 {v26.4s}, [x0], #16
st1 {v30.4s}, [x0], #16
add x0, x0, #16
st1 {v23.4s}, [x0], #16
st1 {v27.4s}, [x0], #16
st1 {v31.4s}, [x0], #16
mov v28.16b, v16.16b
mov v29.16b, v17.16b
mov v30.16b, v18.16b
mov v31.16b, v19.16b
br x14
endfunc
function \txfm\()16_1d_4x16_pass2_neon
mov x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
load \i, x2, x9
.endr
cbz x3, 1f
.irp i, 28, 29, 30, 31
load \i, x2, x9
.endr
1:
add x3, x0, x1
lsl x1, x1, #1
bl \txfm\()16
dup v8.8h, w13
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
br x14
endfunc
.endm
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
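// Per-slice eob thresholds: if eob is at or below the entry for a 4-column
// slice, that slice and the ones after it are known to be all zero, and pass 1
// only zero-fills the corresponding part of the temp buffer.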
const min_eob_idct_idct_16, align=4
.short 0, 10, 38, 89
endconst
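// 16x16 add function: pass 1 results go through a 1024 byte temp buffer on the
// stack. For idct_idct, eob == 1 goes to the DC routine and small eob values
// (<= 10 / <= 38) use the _quarter/_half variants; empty trailing slices are
// skipped with the min_eob table above. x13 carries the pixel max from the
// 10/12 bit wrappers. iadst needs v8-v15 and saves d8-d15.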
.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.eq idct16x16_dc_add_neon
.endif
mov x15, x30
.ifnc \txfm1\()_\txfm2,idct_idct
stp d14, d15, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
.endif
stp d8, d9, [sp, #-0x10]!
sub sp, sp, #1024 // temp buffer for the 16x16 intermediate (16*16 32 bit values)
mov x4, x0
mov x5, x1
mov x6, x2
movrel x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
movrel x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
ld1 {v0.8h,v1.8h}, [x10]
sxtl v2.4s, v1.4h
sxtl2 v3.4s, v1.8h
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
.endif
mov x9, #64 // input row stride: 16 coefficients * 4 bytes
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #10
b.le idct16x16_quarter_add_16_neon
cmp w3, #38
b.le idct16x16_half_add_16_neon
movrel x12, min_eob_idct_idct_16, 2
.endif
.irp i, 0, 4, 8, 12
add x0, sp, #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
ldrh w1, [x12], #2
cmp w3, w1
mov x1, #(16 - \i)/4
b.le 1f
.endif
.endif
mov x1, #\i
add x2, x6, #(\i*4)
bl \txfm1\()16_1d_4x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
ld1 {v0.8h,v1.8h}, [x10]
sxtl v2.4s, v1.4h
sxtl2 v3.4s, v1.8h
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
.endif
.ifc \txfm1\()_\txfm2,idct_idct
b 3f
1:
movi v28.4s, #0
movi v29.4s, #0
movi v30.4s, #0
movi v31.4s, #0
2:
subs x1, x1, #1
.rept 4
st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
.endr
b.ne 2b
3:
.endif
.irp i, 0, 4, 8, 12
add x0, x4, #(\i*2)
mov x1, x5
add x2, sp, #(\i*4)
mov x3, #\i
bl \txfm2\()16_1d_4x16_pass2_neon
.endr
add sp, sp, #1024
ldp d8, d9, [sp], 0x10
.ifnc \txfm1\()_\txfm2,idct_idct
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
.endif
br x15
endfunc
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
mov x13, #0x03ff // pixel max for 10 bit
b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
mov x13, #0x0fff // pixel max for 12 bit
b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm
itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
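// Pass 1/2 of the 16x16 idct for the _quarter and _half cases, reading only
// the 4 or 8 nonzero input rows.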
function idct16_1d_4x16_pass1_quarter_neon
mov x14, x30
movi v4.4s, #0
.irp i, 16, 17, 18, 19
load_clear \i, x2, x9
.endr
bl idct16_quarter
transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
add x0, x0, #16
st1 {v20.4s}, [x0], #16
st1 {v24.4s}, [x0], #16
st1 {v28.4s}, [x0], #16
add x0, x0, #16
st1 {v21.4s}, [x0], #16
st1 {v25.4s}, [x0], #16
st1 {v29.4s}, [x0], #16
add x0, x0, #16
st1 {v22.4s}, [x0], #16
st1 {v26.4s}, [x0], #16
st1 {v30.4s}, [x0], #16
add x0, x0, #16
st1 {v23.4s}, [x0], #16
st1 {v27.4s}, [x0], #16
st1 {v31.4s}, [x0], #16
br x14
endfunc
function idct16_1d_4x16_pass2_quarter_neon
mov x14, x30
cbz x3, 1f
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
1:
add x3, x0, x1
lsl x1, x1, #1
bl idct16_quarter
dup v8.8h, w13
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
br x14
endfunc
function idct16_1d_4x16_pass1_half_neon
mov x14, x30
movi v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load_clear \i, x2, x9
.endr
bl idct16_half
transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
cmp x1, #4
b.eq 1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
store \i, x0, #16
.endr
br x14
1:
add x0, x0, #16
st1 {v20.4s}, [x0], #16
st1 {v24.4s}, [x0], #16
st1 {v28.4s}, [x0], #16
add x0, x0, #16
st1 {v21.4s}, [x0], #16
st1 {v25.4s}, [x0], #16
st1 {v29.4s}, [x0], #16
add x0, x0, #16
st1 {v22.4s}, [x0], #16
st1 {v26.4s}, [x0], #16
st1 {v30.4s}, [x0], #16
add x0, x0, #16
st1 {v23.4s}, [x0], #16
st1 {v27.4s}, [x0], #16
st1 {v31.4s}, [x0], #16
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v22.16b, v18.16b
mov v23.16b, v19.16b
br x14
endfunc
function idct16_1d_4x16_pass2_half_neon
mov x14, x30
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
cbz x3, 1f
.irp i, 20, 21, 22, 23
load \i, x2, x9
.endr
1:
add x3, x0, x1
lsl x1, x1, #1
bl idct16_half
dup v8.8h, w13
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
br x14
endfunc
.macro idct16_partial size
function idct16x16_\size\()_add_16_neon
add x0, sp, #(0*64)
mov x1, #0
add x2, x6, #(0*4)
bl idct16_1d_4x16_pass1_\size\()_neon
.ifc \size,half
add x0, sp, #(4*64)
mov x1, #4
add x2, x6, #(4*4)
bl idct16_1d_4x16_pass1_\size\()_neon
.endif
.irp i, 0, 4, 8, 12
add x0, x4, #(\i*2)
mov x1, x5
add x2, sp, #(\i*4)
mov x3, #\i
bl idct16_1d_4x16_pass2_\size\()_neon
.endr
add sp, sp, #1024
ldp d8, d9, [sp], 0x10
br x15
endfunc
.endm
idct16_partial quarter
idct16_partial half
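// 32x32 idct for a DC-only block; w13 holds the pixel max.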
function idct32x32_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
sxtl v0.4s, v0.4h
movi v1.4h, #0
ld1 {v2.s}[0], [x2]
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
smull v2.2d, v2.2s, v0.s[0]
rshrn v2.2s, v2.2d, #14
st1 {v1.s}[0], [x2]
dup v2.4s, v2.s[0]
srshr v0.4s, v2.4s, #6
mov x3, x0
mov x4, #32
sub x1, x1, #32
dup v31.8h, w13
1:
subs x4, x4, #1
ld1 {v1.8h,v2.8h}, [x0], #32
uaddw v16.4s, v0.4s, v1.4h
uaddw2 v17.4s, v0.4s, v1.8h
ld1 {v3.8h,v4.8h}, [x0], x1
uaddw v18.4s, v0.4s, v2.4h
uaddw2 v19.4s, v0.4s, v2.8h
uaddw v20.4s, v0.4s, v3.4h
uaddw2 v21.4s, v0.4s, v3.8h
uaddw v22.4s, v0.4s, v4.4h
uaddw2 v23.4s, v0.4s, v4.8h
sqxtun v1.4h, v16.4s
sqxtun2 v1.8h, v17.4s
sqxtun v2.4h, v18.4s
sqxtun2 v2.8h, v19.4s
sqxtun v3.4h, v20.4s
sqxtun2 v3.8h, v21.4s
sqxtun v4.4h, v22.4s
sqxtun2 v4.8h, v23.4s
umin v1.8h, v1.8h, v31.8h
umin v2.8h, v2.8h, v31.8h
st1 {v1.8h,v2.8h}, [x3], #32
umin v3.8h, v3.8h, v31.8h
umin v4.8h, v4.8h, v31.8h
st1 {v3.8h,v4.8h}, [x3], x1
b.ne 1b
ret
endfunc
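// Odd half of the 32 point idct: idct32_odd (and its _quarter/_half forms)
// transforms the 16 odd input rows held in v16-v31, using the coefficients
// preloaded in v10-v13; idct32_end is the shared tail.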
.macro idct32_end
butterfly_4s v16, v5, v4, v5
butterfly_4s v17, v20, v23, v20
butterfly_4s v18, v6, v7, v6
butterfly_4s v19, v21, v22, v21
butterfly_4s v4, v28, v28, v30
butterfly_4s v23, v26, v25, v26
butterfly_4s v7, v8, v29, v31
butterfly_4s v22, v27, v24, v27
dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31
dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31
dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1
dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1
butterfly_4s v31, v24, v7, v4
butterfly_4s v30, v25, v22, v23
butterfly_4s_r v23, v16, v16, v18
butterfly_4s_r v22, v17, v17, v19
butterfly_4s v18, v21, v27, v21
butterfly_4s_r v27, v28, v5, v28
butterfly_4s v29, v26, v20, v26
butterfly_4s v19, v20, v8, v6
dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9
dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9
dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9
dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9
ret
.endm
function idct32_odd
dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7
dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7
dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7
dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7
dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7
dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7
dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7
dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7
butterfly_4s v4, v24, v16, v24
butterfly_4s v5, v20, v28, v20
butterfly_4s v6, v26, v18, v26
butterfly_4s v7, v22, v30, v22
butterfly_4s v28, v25, v17, v25
butterfly_4s v30, v21, v29, v21
butterfly_4s v29, v23, v31, v23
butterfly_4s v31, v27, v19, v27
dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19
dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1
dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19
dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1
idct32_end
endfunc
function idct32_odd_half
dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7
dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7
dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7
dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7
dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7
dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7
dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7
dmbutterfly_h2 v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7
butterfly_4s v4, v24, v16, v24
butterfly_4s v5, v20, v28, v20
butterfly_4s v6, v26, v18, v26
butterfly_4s v7, v22, v30, v22
butterfly_4s v28, v25, v17, v25
butterfly_4s v30, v21, v29, v21
butterfly_4s v29, v23, v31, v23
butterfly_4s v31, v27, v19, v27
dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19
dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1
dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19
dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1
idct32_end
endfunc
function idct32_odd_quarter
dsmull_h v4, v5, v16, v10.s[0]
dsmull_h v28, v29, v19, v11.s[3]
dsmull_h v30, v31, v16, v10.s[1]
dsmull_h v22, v23, v17, v13.s[2]
dsmull_h v7, v6, v17, v13.s[3]
dsmull_h v26, v27, v19, v11.s[2]
dsmull_h v20, v21, v18, v12.s[0]
dsmull_h v24, v25, v18, v12.s[1]
neg v28.2d, v28.2d
neg v29.2d, v29.2d
neg v7.2d, v7.2d
neg v6.2d, v6.2d
drshrn_h v4, v4, v5, #14
drshrn_h v5, v28, v29, #14
drshrn_h v29, v30, v31, #14
drshrn_h v28, v22, v23, #14
drshrn_h v7, v7, v6, #14
drshrn_h v31, v26, v27, #14
drshrn_h v6, v20, v21, #14
drshrn_h v30, v24, v25, #14
dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1]
dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1]
drshrn_h v23, v16, v17, #14
drshrn_h v24, v18, v19, #14
neg v20.2d, v20.2d
neg v21.2d, v21.2d
drshrn_h v27, v27, v26, #14
drshrn_h v20, v20, v21, #14
dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3]
drshrn_h v21, v16, v17, #14
drshrn_h v26, v18, v19, #14
dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3]
drshrn_h v25, v16, v17, #14
neg v18.2d, v18.2d
neg v19.2d, v19.2d
drshrn_h v22, v18, v19, #14
idct32_end
endfunc
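// 1-D pass functions for the 32x32 idct, 4 columns at a time. The even input
// rows go through idct16 and the odd rows through idct32_odd; pass 1 combines
// the two halves while storing to the temp buffer, pass 2 while adding into
// the destination. The _quarter/_half variants read only the first 4/8 rows
// of each half.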
.macro idct32_funcs suffix
function idct32_1d_4x32_pass1\suffix\()_neon
mov x14, x30
movi v4.4s, #0
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load_clear \i, x2, x9
.endr
.endif
bl idct16\suffix
transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
.macro store_rev a, b, c, d
rev64 v7.4s, \d
st1 {\a}, [x0], #16
ext v7.16b, v7.16b, v7.16b, #8
st1 {\b}, [x0], #16
rev64 v6.4s, \c
st1 {\c}, [x0], #16
ext v6.16b, v6.16b, v6.16b, #8
st1 {\d}, [x0], #16
rev64 v5.4s, \b
st1 {v7.4s}, [x0], #16
ext v5.16b, v5.16b, v5.16b, #8
st1 {v6.4s}, [x0], #16
rev64 v4.4s, \a
st1 {v5.4s}, [x0], #16
ext v4.16b, v4.16b, v4.16b, #8
st1 {v4.4s}, [x0], #16
.endm
store_rev v16.4s, v20.4s, v24.4s, v28.4s
store_rev v17.4s, v21.4s, v25.4s, v29.4s
store_rev v18.4s, v22.4s, v26.4s, v30.4s
store_rev v19.4s, v23.4s, v27.4s, v31.4s
sub x0, x0, #512
.purgem store_rev
.ifb \suffix
sub x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
sub x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
sub x2, x2, x9, lsl #3
.endif
add x2, x2, #128
movi v4.4s, #0
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load_clear \i, x2, x9
.endr
.endif
bl idct32_odd\suffix
transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7
transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7
transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7
transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7
.macro store_rev a, b, c, d, a16b, b16b
ld1 {v4.4s}, [x0]
rev64 v9.4s, \d
add v4.4s, v4.4s, \a
st1 {v4.4s}, [x0], #16
rev64 v8.4s, \c
ld1 {v4.4s}, [x0]
ext v9.16b, v9.16b, v9.16b, #8
add v4.4s, v4.4s, \b
st1 {v4.4s}, [x0], #16
ext v8.16b, v8.16b, v8.16b, #8
ld1 {v4.4s}, [x0]
rev64 \b, \b
add v4.4s, v4.4s, \c
st1 {v4.4s}, [x0], #16
rev64 \a, \a
ld1 {v4.4s}, [x0]
ext \b16b, \b16b, \b16b, #8
add v4.4s, v4.4s, \d
st1 {v4.4s}, [x0], #16
ext \a16b, \a16b, \a16b, #8
ld1 {v4.4s}, [x0]
sub v4.4s, v4.4s, v9.4s
st1 {v4.4s}, [x0], #16
ld1 {v4.4s}, [x0]
sub v4.4s, v4.4s, v8.4s
st1 {v4.4s}, [x0], #16
ld1 {v4.4s}, [x0]
sub v4.4s, v4.4s, \b
st1 {v4.4s}, [x0], #16
ld1 {v4.4s}, [x0]
sub v4.4s, v4.4s, \a
st1 {v4.4s}, [x0], #16
.endm
store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
.purgem store_rev
br x14
endfunc
function idct32_1d_4x32_pass2\suffix\()_neon
mov x14, x30
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
.endif
bl idct16\suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
store \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
add x2, x2, #128
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
.endif
sub x2, x2, #128
bl idct32_odd\suffix
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
ld1 {v4.4s}, [x2], x9
ld1 {v5.4s}, [x2], x9
add v4.4s, v4.4s, \a
ld1 {v6.4s}, [x2], x9
add v5.4s, v5.4s, \b
ld1 {v7.4s}, [x2], x9
add v6.4s, v6.4s, \c
add v7.4s, v7.4s, \d
.else
ld1 {v4.4s}, [x2], x7
ld1 {v5.4s}, [x2], x7
sub v4.4s, v4.4s, \a
ld1 {v6.4s}, [x2], x7
sub v5.4s, v5.4s, \b
ld1 {v7.4s}, [x2], x7
sub v6.4s, v6.4s, \c
sub v7.4s, v7.4s, \d
.endif
ld1 {v8.4h}, [x0], x1
ld1 {v8.d}[1], [x0], x1
srshr v4.4s, v4.4s, #6
ld1 {v9.4h}, [x0], x1
srshr v5.4s, v5.4s, #6
uaddw v4.4s, v4.4s, v8.4h
ld1 {v9.d}[1], [x0], x1
srshr v6.4s, v6.4s, #6
uaddw2 v5.4s, v5.4s, v8.8h
srshr v7.4s, v7.4s, #6
sub x0, x0, x1, lsl #2
uaddw v6.4s, v6.4s, v9.4h
sqxtun v4.4h, v4.4s
uaddw2 v7.4s, v7.4s, v9.8h
sqxtun2 v4.8h, v5.4s
umin v4.8h, v4.8h, v15.8h
st1 {v4.4h}, [x0], x1
sqxtun v5.4h, v6.4s
st1 {v4.d}[1], [x0], x1
sqxtun2 v5.8h, v7.4s
umin v5.8h, v5.8h, v15.8h
st1 {v5.4h}, [x0], x1
st1 {v5.d}[1], [x0], x1
.endm
load_acc_store v31.4s, v30.4s, v29.4s, v28.4s
load_acc_store v27.4s, v26.4s, v25.4s, v24.4s
load_acc_store v23.4s, v22.4s, v21.4s, v20.4s
load_acc_store v19.4s, v18.4s, v17.4s, v16.4s
sub x2, x2, x9
load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1
load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1
load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1
load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1
.purgem load_acc_store
br x14
endfunc
.endm
idct32_funcs
idct32_funcs _quarter
idct32_funcs _half
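// Per-slice eob thresholds for the eight 4-column slices of the 32x32 input,
// used like min_eob_idct_idct_16 above.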
const min_eob_idct_idct_32, align=4
.short 0, 9, 34, 70, 135, 240, 336, 448
endconst
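// 32x32 idct_idct add function (the only transform pair at this size), using a
// 4096 byte temp buffer. v0-v3 and v10-v13 hold the full coefficient table and
// v15 the pixel max from x13; small eob values branch to the _quarter/_half
// versions below.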
function vp9_idct_idct_32x32_add_16_neon
cmp w3, #1
b.eq idct32x32_dc_add_neon
movrel x10, idct_coeffs
mov x15, x30
stp d8, d9, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d14, d15, [sp, #-0x10]!
sub sp, sp, #4096 // temp buffer for the 32x32 intermediate (32*32 32 bit values)
mov x4, x0
mov x5, x1
mov x6, x2
mov x9, #256 // stride of two input rows (2 * 32 coefficients * 4 bytes)
neg x7, x9 // negated stride, used to step backwards in pass 2
ld1 {v0.8h,v1.8h}, [x10], #32
sxtl v2.4s, v1.4h
sxtl2 v3.4s, v1.8h
sxtl2 v1.4s, v0.8h
sxtl v0.4s, v0.4h
ld1 {v10.8h,v11.8h}, [x10]
sxtl v12.4s, v11.4h
sxtl2 v13.4s, v11.8h
sxtl2 v11.4s, v10.8h
sxtl v10.4s, v10.4h
dup v15.8h, w13
cmp w3, #34
b.le idct32x32_quarter_add_16_neon
cmp w3, #135
b.le idct32x32_half_add_16_neon
movrel x12, min_eob_idct_idct_32, 2
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add x0, sp, #(\i*128)
.if \i > 0
ldrh w1, [x12], #2
cmp w3, w1
mov x1, #(32 - \i)/4
b.le 1f
.endif
add x2, x6, #(\i*4)
bl idct32_1d_4x32_pass1_neon
.endr
b 3f
1:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
2:
subs x1, x1, #1
.rept 4
st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
.endr
b.ne 2b
3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add x0, x4, #(\i*2)
mov x1, x5
add x2, sp, #(\i*4)
bl idct32_1d_4x32_pass2_neon
.endr
add sp, sp, #4096
ldp d14, d15, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d8, d9, [sp], 0x10
br x15
endfunc
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
mov x13, #0x03ff // pixel max for 10 bit
b vp9_idct_idct_32x32_add_16_neon
endfunc
function ff_vp9_idct_idct_32x32_add_12_neon, export=1
mov x13, #0x0fff // pixel max for 12 bit
b vp9_idct_idct_32x32_add_16_neon
endfunc
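// Reduced 32x32 versions for small eob: quarter only transforms the top-left
// 8x8 block of coefficients, half the top-left 16x16.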
.macro idct32_partial size
function idct32x32_\size\()_add_16_neon
.irp i, 0, 4
add x0, sp, #(\i*128)
.ifc \size,quarter
.if \i == 4
cmp w3, #9
b.le 1f
.endif
.endif
add x2, x6, #(\i*4)
bl idct32_1d_4x32_pass1_\size\()_neon
.endr
.ifc \size,half
.irp i, 8, 12
add x0, sp, #(\i*128)
.if \i == 12
cmp w3, #70
b.le 1f
.endif
add x2, x6, #(\i*4)
bl idct32_1d_4x32_pass1_\size\()_neon
.endr
.endif
b 3f
1:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
.rept 4
st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
.endr
3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
add x0, x4, #(\i*2)
mov x1, x5
add x2, sp, #(\i*4)
bl idct32_1d_4x32_pass2_\size\()_neon
.endr
add sp, sp, #4096
ldp d14, d15, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d8, d9, [sp], 0x10
br x15
endfunc
.endm
idct32_partial quarter
idct32_partial half