#include "libavutil/aarch64/asm.S"
#include "neon.S"
// Multipliers for the 4x4 transforms. Products formed with them are narrowed
// with a rounding shift right by 14 (see idct4 / iadst4), i.e. they act as
// Q14 fixed-point factors.
// idct4 reads the first row through v0.h[0], v0.h[2] and v0.h[3]; iadst4
// reads the row at iadst4_coeffs through v0.h[4]..v0.h[7].
const itxfm4_coeffs, align=4
.short 11585, 0, 6270, 15137
iadst4_coeffs:
.short 5283, 15212, 9929, 13377
endconst
// iadst8_coeffs: eight multipliers that the non idct_idct 8x8 functions load
// into v1 (used by iadst8 as v1.h[0]..v1.h[7]).
// idct_coeffs directly follows, so the 8x8 code can load it with the same
// post-incremented pointer. Its first 8 values are loaded into v0 by the 8x8
// code, the first 16 into v0/v1 by the 16x16 code, and the 32x32 code
// additionally loads the last 16 into v8/v9.
// All values are used with a rounding shift right by 14.
const iadst8_coeffs, align=4
.short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
.short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
.short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
.short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
.short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
// Sixteen multipliers loaded into v0/v1 at the start of iadst16 (through the
// pointer kept in x11 by the 16x16 entry points).
const iadst16_coeffs, align=4
.short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
.short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst
// out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14
// in/out are .8h registers; the sum and difference are formed in 16 bits,
// the products in 32 bits.
// tmp1-tmp4 are always clobbered. tmp5/tmp6 are optional: when they are
// omitted, tmp3/tmp4 are reused for the second product, which forces the
// first narrowing to finish before the second multiply can start.
// With neg > 0 the multiplier used for out1 (only) is -v0.h[0].
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
// Negated copy of the low four coefficients; only lane 0 is used below.
neg \tmp4\().4h, v0.4h
.endif
add \tmp1\().8h, \in1\().8h, \in2\().8h
sub \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
// tmp4 is both the coefficient source and the destination here.
smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
// Four-temporary variant.
rshrn \out1\().4h, \tmp3\().4s, #14
rshrn2 \out1\().8h, \tmp4\().4s, #14
smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
rshrn \out2\().4h, \tmp3\().4s, #14
rshrn2 \out2\().8h, \tmp4\().4s, #14
.else
// Six-temporary variant: both products are issued before any narrowing.
smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
rshrn \out1\().4h, \tmp3\().4s, #14
rshrn2 \out1\().8h, \tmp4\().4s, #14
rshrn \out2\().4h, \tmp5\().4s, #14
rshrn2 \out2\().8h, \tmp6\().4s, #14
.endif
.endm
// Reduced form of dmbutterfly0 that never reads in2 (it is treated as zero):
// out1 = out2 = (in1 * v0.h[0] + (1 << 13)) >> 14
// Only tmp1 and tmp2 are used; in2 and tmp3-tmp6 are accepted so the
// argument list matches dmbutterfly0.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
smull \tmp1\().4s, \in1\().4h, v0.h[0]
smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
rshrn \out1\().4h, \tmp1\().4s, #14
rshrn2 \out1\().8h, \tmp2\().4s, #14
rshrn \out2\().4h, \tmp1\().4s, #14
rshrn2 \out2\().8h, \tmp2\().4s, #14
.endm
// Widening rotation without rounding or narrowing:
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// in1/in2 are .8h registers; each result is a pair of .4s registers holding
// the low four and the high four lanes respectively.
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
smull \out1\().4s, \in1\().4h, \coef1
smull2 \out2\().4s, \in1\().8h, \coef1
smull \out3\().4s, \in1\().4h, \coef2
smull2 \out4\().4s, \in1\().8h, \coef2
smlsl \out1\().4s, \in2\().4h, \coef2
smlsl2 \out2\().4s, \in2\().8h, \coef2
smlal \out3\().4s, \in2\().4h, \coef1
smlal2 \out4\().4s, \in2\().8h, \coef1
.endm
// In-place rotation with rounding:
// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// (both right-hand sides use the incoming values of inout1/inout2).
// With neg > 0 the 32-bit value for inout2 is negated before the rounding
// shift. tmp1-tmp4 are clobbered.
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
neg \tmp3\().4s, \tmp3\().4s
neg \tmp4\().4s, \tmp4\().4s
.endif
rshrn \inout1\().4h, \tmp1\().4s, #14
rshrn2 \inout1\().8h, \tmp2\().4s, #14
rshrn \inout2\().4h, \tmp3\().4s, #14
rshrn2 \inout2\().8h, \tmp4\().4s, #14
.endm
// dmbutterfly for the case where the incoming inout2 is not read (treated as
// zero):
// inout1 = (inout1 * coef1 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + (1 << 13)) >> 14
// tmp1-tmp4 are clobbered.
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
smull \tmp1\().4s, \inout1\().4h, \coef1
smull2 \tmp2\().4s, \inout1\().8h, \coef1
smull \tmp3\().4s, \inout1\().4h, \coef2
smull2 \tmp4\().4s, \inout1\().8h, \coef2
rshrn \inout1\().4h, \tmp1\().4s, #14
rshrn2 \inout1\().8h, \tmp2\().4s, #14
rshrn \inout2\().4h, \tmp3\().4s, #14
rshrn2 \inout2\().8h, \tmp4\().4s, #14
.endm
// dmbutterfly for the case where the incoming inout1 is not read (treated as
// zero):
// inout1 = (-inout2 * coef2 + (1 << 13)) >> 14
// inout2 = ( inout2 * coef1 + (1 << 13)) >> 14
// inout2 is narrowed first only after both products that read it have been
// formed. tmp1-tmp4 are clobbered.
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
smull \tmp1\().4s, \inout2\().4h, \coef2
smull2 \tmp2\().4s, \inout2\().8h, \coef2
smull \tmp3\().4s, \inout2\().4h, \coef1
smull2 \tmp4\().4s, \inout2\().8h, \coef1
neg \tmp1\().4s, \tmp1\().4s
neg \tmp2\().4s, \tmp2\().4s
rshrn \inout2\().4h, \tmp3\().4s, #14
rshrn2 \inout2\().8h, \tmp4\().4s, #14
rshrn \inout1\().4h, \tmp1\().4s, #14
rshrn2 \inout1\().8h, \tmp2\().4s, #14
.endm
// out1,out2 = in * coef, widened to 32 bits (out1 = low four lanes,
// out2 = high four lanes of the .8h input).
.macro dsmull_h out1, out2, in, coef
smull \out1\().4s, \in\().4h, \coef
smull2 \out2\().4s, \in\().8h, \coef
.endm
// Narrow the 32-bit pair in1 (low lanes) / in2 (high lanes) into one .8h
// register with a rounding shift right by the given immediate.
.macro drshrn_h out, in1, in2, shift
rshrn \out\().4h, \in1\().4s, \shift
rshrn2 \out\().8h, \in2\().4s, \shift
.endm
// out1 = in1 + in2
// out2 = in1 - in2
// All operands are .8h. out1 is written first, so it must not alias an input
// that the subtraction still needs.
.macro butterfly_8h out1, out2, in1, in2
add \out1\().8h, \in1\().8h, \in2\().8h
sub \out2\().8h, \in1\().8h, \in2\().8h
.endm
// out1 = in1 - in2
// out2 = in1 + in2
// Same as butterfly_8h with the two results swapped; the difference is
// written first.
.macro butterfly_8h_r out1, out2, in1, in2
sub \out1\().8h, \in1\().8h, \in2\().8h
add \out2\().8h, \in1\().8h, \in2\().8h
.endm
// Butterfly on 32-bit pairs followed by rounding and narrowing:
// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// in1/in3 hold the low four lanes, in2/in4 the high four lanes (.4s each);
// out1/out2 are .8h. tmp1-tmp4 are clobbered. Callers in this file pass
// in3/in4 again as tmp3/tmp4; that is safe because both sums are formed
// before the first of those registers is overwritten.
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
add \tmp1\().4s, \in1\().4s, \in3\().4s
add \tmp2\().4s, \in2\().4s, \in4\().4s
sub \tmp3\().4s, \in1\().4s, \in3\().4s
sub \tmp4\().4s, \in2\().4s, \in4\().4s
rshrn \out1\().4h, \tmp1\().4s, #14
rshrn2 \out1\().8h, \tmp2\().4s, #14
rshrn \out2\().4h, \tmp3\().4s, #14
rshrn2 \out2\().8h, \tmp4\().4s, #14
.endm
// One-dimensional 4-point pass of the iwht transform, in place on four .4h
// registers (one register per input index, four independent lanes each).
// Uses only adds, subtracts and an arithmetic shift right by one; no
// coefficient table is needed. Clobbers v16 and v17.
.macro iwht4 c0, c1, c2, c3
add \c0\().4h, \c0\().4h, \c1\().4h
sub v17.4h, \c2\().4h, \c3\().4h
sub v16.4h, \c0\().4h, v17.4h
sshr v16.4h, v16.4h, #1
sub \c2\().4h, v16.4h, \c1\().4h
sub \c1\().4h, v16.4h, \c3\().4h
add \c3\().4h, v17.4h, \c2\().4h
sub \c0\().4h, \c0\().4h, \c1\().4h
.endm
// One-dimensional 4-point idct pass, in place on four .4h registers.
// Expects v0.h[0], v0.h[2] and v0.h[3] to hold the first row of
// itxfm4_coeffs. With e = (c0 + c2) * v0.h[0], o = (c0 - c2) * v0.h[0],
// p = c1 * v0.h[3] + c3 * v0.h[2], q = c1 * v0.h[2] - c3 * v0.h[3]
// (each rounded and shifted right by 14) the results are
// c0 = e + p, c1 = o + q, c2 = o - q, c3 = e - p.
// Clobbers v16-v20 and v22.
.macro idct4 c0, c1, c2, c3
smull v22.4s, \c1\().4h, v0.h[3]
smull v20.4s, \c1\().4h, v0.h[2]
add v16.4h, \c0\().4h, \c2\().4h
sub v17.4h, \c0\().4h, \c2\().4h
smlal v22.4s, \c3\().4h, v0.h[2]
smull v18.4s, v16.4h, v0.h[0]
smull v19.4s, v17.4h, v0.h[0]
smlsl v20.4s, \c3\().4h, v0.h[3]
rshrn v22.4h, v22.4s, #14
rshrn v18.4h, v18.4s, #14
rshrn v19.4h, v19.4s, #14
rshrn v20.4h, v20.4s, #14
add \c0\().4h, v18.4h, v22.4h
sub \c3\().4h, v18.4h, v22.4h
add \c1\().4h, v19.4h, v20.4h
sub \c2\().4h, v19.4h, v20.4h
.endm
// One-dimensional 4-point iadst pass, in place on four .4h registers.
// Expects v0.h[4]..v0.h[7] to hold the iadst4_coeffs row. All sums are kept
// in 32 bits and only the four final values are rounded and shifted right by
// 14. Note that c0 is overwritten with c0 - c2 + c3 part-way through, after
// its last use as a plain input. Clobbers v16-v21.
.macro iadst4 c0, c1, c2, c3
smull v16.4s, \c0\().4h, v0.h[4]
smlal v16.4s, \c2\().4h, v0.h[5]
smlal v16.4s, \c3\().4h, v0.h[6]
smull v17.4s, \c0\().4h, v0.h[6]
smlsl v17.4s, \c2\().4h, v0.h[4]
sub \c0\().4h, \c0\().4h, \c2\().4h
smlsl v17.4s, \c3\().4h, v0.h[5]
add \c0\().4h, \c0\().4h, \c3\().4h
smull v19.4s, \c1\().4h, v0.h[7]
smull v18.4s, \c0\().4h, v0.h[7]
add v20.4s, v16.4s, v19.4s
add v21.4s, v17.4s, v19.4s
rshrn \c0\().4h, v20.4s, #14
add v16.4s, v16.4s, v17.4s
rshrn \c1\().4h, v21.4s, #14
sub v16.4s, v16.4s, v19.4s
rshrn \c2\().4h, v18.4s, #14
rshrn \c3\().4h, v16.4s, #14
.endm
// Emits ff_vp9_<txfm1>_<txfm2>_4x4_add_neon: applies the txfm1 4-point pass,
// transposes, applies the txfm2 4-point pass and adds the result to a 4x4
// block of 8-bit pixels with unsigned saturation.
// Register usage as visible in the body:
//   x0 = destination pixels (4 bytes read and written per row)
//   x1 = destination stride in bytes
//   x2 = 4x4 block of 16-bit coefficients; it is overwritten with zeros
//   w3 = only inspected by the idct_idct variant, where w3 == 1 selects the
//        DC-only shortcut (presumably the eob / coefficient count of the
//        block - confirm against the C prototype)
// Clobbers x4, v0-v7, v16-v22 and v31.
.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
// Load only the coefficient half that the chosen transforms read:
// the low half of v0 for idct, the high half for iadst, both when mixed.
// The iwht/iwht variant loads nothing.
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
movrel x4, itxfm4_coeffs
ld1 {v0.4h}, [x4]
.endif
.ifc \txfm1,iadst
movrel x4, iadst4_coeffs
ld1 {v0.d}[1], [x4]
.endif
.else
movrel x4, itxfm4_coeffs
ld1 {v0.8h}, [x4]
.endif
// v31 is the zero source used to clear the coefficient block.
movi v31.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.ne 1f
// DC-only shortcut: scale the first coefficient by v0.h[0] twice (once per
// pass), clear it in memory and broadcast the result to all four rows.
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
st1 {v31.h}[0], [x2]
dup v4.4h, v2.h[0]
mov v5.16b, v4.16b
mov v6.16b, v4.16b
mov v7.16b, v4.16b
b 2f
.endif
1:
// Load all 16 coefficients; the two 16-byte zero stores (here and after the
// first pass) clear the whole 32-byte block.
ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
st1 {v31.8h}, [x2], #16
.ifc \txfm1,iwht
// The iwht variant scales its input down by 4 and skips the final rounding
// shift below.
sshr v4.4h, v4.4h, #2
sshr v5.4h, v5.4h, #2
sshr v6.4h, v6.4h, #2
sshr v7.4h, v7.4h, #2
.endif
\txfm1\()4 v4, v5, v6, v7
st1 {v31.8h}, [x2], #16
transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19
\txfm2\()4 v4, v5, v6, v7
2:
// Add to the destination: rounding shift right by 4 (not for iwht), widen
// and add the pixels, saturate back to 8 bits.
ld1 {v0.s}[0], [x0], x1
ld1 {v1.s}[0], [x0], x1
.ifnc \txfm1,iwht
srshr v4.4h, v4.4h, #4
srshr v5.4h, v5.4h, #4
srshr v6.4h, v6.4h, #4
srshr v7.4h, v7.4h, #4
.endif
uaddw v4.8h, v4.8h, v0.8b
uaddw v5.8h, v5.8h, v1.8b
ld1 {v2.s}[0], [x0], x1
ld1 {v3.s}[0], [x0], x1
sqxtun v0.8b, v4.8h
sqxtun v1.8b, v5.8h
// Rewind x0 by the four rows just read so the stores hit the same rows.
sub x0, x0, x1, lsl #2
uaddw v6.8h, v6.8h, v2.8b
uaddw v7.8h, v7.8h, v3.8b
st1 {v0.s}[0], [x0], x1
sqxtun v2.8b, v6.8h
sqxtun v3.8b, v7.8h
st1 {v1.s}[0], [x0], x1
st1 {v2.s}[0], [x0], x1
st1 {v3.s}[0], [x0], x1
ret
endfunc
.endm
// Instantiate every 4x4 combination provided by this file.
itxfm_func4x4 idct, idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct, iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht, iwht
// One-dimensional 8-point idct pass, in place on v16-v23 (one .8h register
// per input index, eight independent lanes each).
// Expects v0.h[0]..v0.h[7] to hold the first row of idct_coeffs.
// Clobbers v2-v7 and v24-v31.
.macro idct8
dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7
dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5
dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5
dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5
butterfly_8h v24, v25, v16, v22
butterfly_8h v28, v29, v17, v21
butterfly_8h v30, v31, v23, v19
butterfly_8h v26, v27, v20, v18
dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7
// Final butterflies write the eight results back into v16-v23.
butterfly_8h v16, v23, v24, v30
butterfly_8h v17, v22, v26, v31
butterfly_8h v18, v21, v27, v29
butterfly_8h v19, v20, v25, v28
.endm
// One-dimensional 8-point iadst pass, in place on v16-v23.
// Expects v1.h[0]..v1.h[7] to hold iadst8_coeffs and v0.h[0], v0.h[2] and
// v0.h[3] to hold values from the first row of idct_coeffs.
// Several results are first produced with the opposite sign and then
// negated (v23, v19, v17, v21). Clobbers v2-v7 and v24-v31.
.macro iadst8
dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]
dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]
dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4]
// From here on v16, v18, v21 and v23 are reused as 32-bit temporaries.
dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]
dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27
dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27
dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27
dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27
butterfly_8h v16, v6, v4, v24
butterfly_8h v23, v7, v2, v30
neg v23.8h, v23.8h
dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30
neg v19.8h, v19.8h
dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3]
dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2]
dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25
dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25
neg v17.8h, v17.8h
dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7
neg v21.8h, v21.8h
.endm
// Emits ff_vp9_<txfm1>_<txfm2>_8x8_add_neon: applies the txfm1 8-point pass,
// transposes, applies the txfm2 8-point pass and adds the result to an 8x8
// block of 8-bit pixels with unsigned saturation.
// Register usage as visible in the body:
//   x0 = destination pixels (8 bytes read and written per row)
//   x1 = destination stride in bytes
//   x2 = 8x8 block of 16-bit coefficients; it is overwritten with zeros
//   w3 = only inspected by the idct_idct variant, where w3 == 1 selects the
//        DC-only shortcut (presumably the eob / coefficient count of the
//        block - confirm against the C prototype)
// Clobbers x3, x4, v0-v7 and v16-v31.
.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
// v0 always receives the first row of idct_coeffs (iadst8 reads a few of
// those lanes too); v1 receives iadst8_coeffs unless both passes are idct.
.ifc \txfm1\()_\txfm2,idct_idct
movrel x4, idct_coeffs
.else
movrel x4, iadst8_coeffs
ld1 {v1.8h}, [x4], #16
.endif
ld1 {v0.8h}, [x4]
// v2-v5 are the zero source used to clear the coefficient block.
movi v2.8h, #0
movi v3.8h, #0
movi v4.8h, #0
movi v5.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.ne 1f
// DC-only shortcut: scale the first coefficient by v0.h[0] twice, clear it
// in memory and broadcast the result to all eight rows.
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
st1 {v3.h}[0], [x2]
dup v16.8h, v2.h[0]
mov v17.16b, v16.16b
mov v18.16b, v16.16b
mov v19.16b, v16.16b
mov v20.16b, v16.16b
mov v21.16b, v16.16b
mov v22.16b, v16.16b
mov v23.16b, v16.16b
b 2f
.endif
1:
// Load the 128-byte coefficient block into v16-v23, then rewind x2 and
// overwrite the block with zeros.
ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
sub x2, x2, #128
st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
\txfm1\()8
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
\txfm2\()8
2:
// Add to the destination. x0 walks the rows for the loads while x3 (a copy
// of the original x0) walks them for the stores. Each row gets a rounding
// shift right by 5, is added to the widened pixels and saturated to 8 bits.
mov x3, x0
ld1 {v0.8b}, [x0], x1
srshr v16.8h, v16.8h, #5
ld1 {v1.8b}, [x0], x1
srshr v17.8h, v17.8h, #5
ld1 {v2.8b}, [x0], x1
srshr v18.8h, v18.8h, #5
uaddw v16.8h, v16.8h, v0.8b
ld1 {v3.8b}, [x0], x1
srshr v19.8h, v19.8h, #5
uaddw v17.8h, v17.8h, v1.8b
ld1 {v4.8b}, [x0], x1
srshr v20.8h, v20.8h, #5
uaddw v18.8h, v18.8h, v2.8b
sqxtun v0.8b, v16.8h
ld1 {v5.8b}, [x0], x1
srshr v21.8h, v21.8h, #5
uaddw v19.8h, v19.8h, v3.8b
sqxtun v1.8b, v17.8h
ld1 {v6.8b}, [x0], x1
srshr v22.8h, v22.8h, #5
uaddw v20.8h, v20.8h, v4.8b
sqxtun v2.8b, v18.8h
ld1 {v7.8b}, [x0], x1
srshr v23.8h, v23.8h, #5
uaddw v21.8h, v21.8h, v5.8b
sqxtun v3.8b, v19.8h
st1 {v0.8b}, [x3], x1
uaddw v22.8h, v22.8h, v6.8b
st1 {v1.8b}, [x3], x1
sqxtun v4.8b, v20.8h
st1 {v2.8b}, [x3], x1
uaddw v23.8h, v23.8h, v7.8b
st1 {v3.8b}, [x3], x1
sqxtun v5.8b, v21.8h
st1 {v4.8b}, [x3], x1
sqxtun v6.8b, v22.8h
st1 {v5.8b}, [x3], x1
sqxtun v7.8b, v23.8h
st1 {v6.8b}, [x3], x1
st1 {v7.8b}, [x3], x1
ret
endfunc
.endm
// Instantiate every 8x8 combination provided by this file.
itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst
// DC-only 16x16 idct + add, reached by a tail branch from the 16x16
// idct_idct entry point when w3 == 1.
//   x0 = destination pixels, x1 = stride in bytes,
//   x2 = coefficient block (only the first 16-bit value is read; it is then
//        overwritten with zero)
// The value is scaled by v0.h[0] twice with rounding shifts by 14, then
// rounded and shifted right by 6 and added to all 16x16 pixels with unsigned
// saturation. Clobbers x3, x4, v0-v4 and v16-v19.
function idct16x16_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
movi v1.4h, #0
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
dup v2.8h, v2.h[0]
st1 {v1.h}[0], [x2]
srshr v2.8h, v2.8h, #6
// x0 is the read pointer, x3 the write pointer; x4 counts remaining rows.
mov x3, x0
mov x4, #16
1:
// Two 16-pixel rows per iteration. The flags set by subs survive until the
// b.ne because none of the NEON or load/store instructions modify them.
subs x4, x4, #2
ld1 {v3.16b}, [x0], x1
ld1 {v4.16b}, [x0], x1
uaddw v16.8h, v2.8h, v3.8b
uaddw2 v17.8h, v2.8h, v3.16b
uaddw v18.8h, v2.8h, v4.8b
uaddw2 v19.8h, v2.8h, v4.16b
sqxtun v3.8b, v16.8h
sqxtun2 v3.16b, v17.8h
sqxtun v4.8b, v18.8h
sqxtun2 v4.16b, v19.8h
st1 {v3.16b}, [x3], x1
st1 {v4.16b}, [x3], x1
b.ne 1b
ret
endfunc
// Shared tail of idct16, idct16_half and idct16_quarter. Takes the
// intermediate values those functions leave in v4-v7, v16, v17 and v20-v29,
// finishes the transform so that the 16 results end up in v16-v31, and
// returns to the caller (the macro ends in ret).
// Uses v0.h[0] through dmbutterfly0; v2 and v3 are used as temporaries.
.macro idct16_end
butterfly_8h v18, v7, v4, v7
butterfly_8h v19, v22, v5, v22
butterfly_8h v4, v26, v20, v26
butterfly_8h v5, v6, v28, v6
butterfly_8h v20, v28, v16, v24
butterfly_8h v24, v21, v23, v21
butterfly_8h v23, v27, v25, v27
butterfly_8h v25, v29, v29, v17
// v16, v17, v30 and v31 are free at this point and serve as temporaries.
dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31
dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31
butterfly_8h v16, v31, v18, v25
butterfly_8h v17, v30, v19, v23
butterfly_8h_r v25, v22, v22, v24
butterfly_8h v23, v24, v7, v20
butterfly_8h v18, v29, v4, v2
butterfly_8h v19, v28, v5, v28
butterfly_8h v20, v27, v6, v27
butterfly_8h v21, v26, v26, v3
ret
.endm
// One-dimensional 16-point idct pass on v16-v31 (one .8h register per input
// index, eight independent lanes each); results are returned in v16-v31.
// Expects v0/v1 to hold the first 16 values of idct_coeffs.
// Internal helper called with bl; returns through the ret in idct16_end.
// Clobbers v2-v7.
function idct16
dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7
dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5
dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5
dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5
dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5
dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5
dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5
dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5
butterfly_8h v4, v28, v16, v28
butterfly_8h v5, v20, v24, v20
butterfly_8h v6, v26, v18, v26
butterfly_8h v7, v22, v30, v22
butterfly_8h v16, v25, v17, v25
butterfly_8h v24, v21, v29, v21
butterfly_8h v17, v27, v19, v27
butterfly_8h v29, v23, v31, v23
// v18, v19, v30 and v31 are free here and serve as temporaries.
dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31
dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31
dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1
idct16_end
endfunc
// Variant of idct16 whose first stage only reads v16-v23: the _h/_h1/_h2
// macro forms are used, which treat the partner input (v24-v31) as zero.
// From the first butterfly_8h onwards the code is identical to idct16.
// Results are returned in v16-v31. Expects v0/v1 to hold the first 16 values
// of idct_coeffs. Clobbers v2-v7.
function idct16_half
dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7
dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5
dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5
dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5
dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5
dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5
dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5
dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5
butterfly_8h v4, v28, v16, v28
butterfly_8h v5, v20, v24, v20
butterfly_8h v6, v26, v18, v26
butterfly_8h v7, v22, v30, v22
butterfly_8h v16, v25, v17, v25
butterfly_8h v24, v21, v29, v21
butterfly_8h v17, v27, v19, v27
butterfly_8h v29, v23, v31, v23
dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31
dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31
dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1
idct16_end
endfunc
// Variant of idct16 that only reads v16-v19 as input. Single products
// (dsmull_h + drshrn_h) replace the first-stage butterflies, and the
// intermediate registers expected by idct16_end are filled in directly,
// partly by plain copies of v28. Results are returned in v16-v31.
// Expects v0/v1 to hold the first 16 values of idct_coeffs.
// Clobbers v4-v7.
function idct16_quarter
dsmull_h v24, v25, v19, v1.h[7]
dsmull_h v4, v5, v17, v1.h[0]
dsmull_h v7, v6, v18, v0.h[5]
dsmull_h v30, v31, v18, v0.h[4]
neg v24.4s, v24.4s
neg v25.4s, v25.4s
dsmull_h v29, v28, v17, v1.h[1]
dsmull_h v26, v27, v19, v1.h[6]
dsmull_h v22, v23, v16, v0.h[0]
drshrn_h v24, v24, v25, #14
drshrn_h v16, v4, v5, #14
drshrn_h v7, v7, v6, #14
drshrn_h v6, v30, v31, #14
drshrn_h v29, v29, v28, #14
drshrn_h v17, v26, v27, #14
drshrn_h v28, v22, v23, #14
dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
neg v22.4s, v22.4s
neg v23.4s, v23.4s
drshrn_h v27, v20, v21, #14
drshrn_h v21, v22, v23, #14
drshrn_h v23, v18, v19, #14
drshrn_h v25, v30, v31, #14
// v28 = rounded v16 * v0.h[0]; idct16_end takes it in v4, v5, v20 and v28.
mov v4.16b, v28.16b
mov v5.16b, v28.16b
dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
mov v20.16b, v28.16b
idct16_end
endfunc
// One-dimensional 16-point iadst pass on v16-v31; results are returned in
// v16-v31.
// Expects x11 to point at iadst16_coeffs and x10 at idct_coeffs (both are
// set up by the 16x16 entry points). v0/v1 are loaded from x11 on entry and
// v0 is reloaded from x10 part-way through, so the caller's v0/v1 are lost.
// Clobbers v0-v15; v8-v15 are callee-saved in the AArch64 procedure call
// standard (low 64 bits), which is why the exported 16x16 wrappers save
// d8-d15 for the variants that use this routine.
function iadst16
ld1 {v0.8h,v1.8h}, [x11]
dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0]
dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4]
dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11
dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]
dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9
dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6]
dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7
dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0]
dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5
dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]
dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15
dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2]
dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13
dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6]
// The iadst16 table is fully consumed; switch v0 to the idct_coeffs row.
ld1 {v0.8h}, [x10]
dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11
dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]
dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9
dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4]
dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5
dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7]
butterfly_8h_r v4, v27, v16, v27
dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7
dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]
butterfly_8h_r v5, v20, v31, v20
dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13
dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15
butterfly_8h_r v6, v25, v18, v25
butterfly_8h_r v7, v22, v29, v22
dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3]
dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]
dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13
dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15
neg v29.8h, v29.8h
dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3]
dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2]
butterfly_8h v2, v6, v27, v25
butterfly_8h v3, v7, v23, v21
dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9
neg v19.8h, v19.8h
dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11
butterfly_8h v5, v8, v20, v22
butterfly_8h v4, v9, v24, v26
// The trailing 1 selects the neg variant of dmbutterfly0.
dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1
dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1
dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15
dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15
// Move the results parked in v2-v5 into their final registers, negating two.
neg v31.8h, v5.8h
neg v17.8h, v3.8h
mov v16.16b, v2.16b
mov v30.16b, v4.16b
ret
endfunc
// Load eight 16-bit values into register v<i> from [src] and post-increment
// src by inc (a register or an immediate). i is a bare register number so
// the macro can be driven by .irp.
.macro load i, src, inc
ld1 {v\i\().8h}, [\src], \inc
.endm
// Store the eight 16-bit values of register v<i> to [dst] and post-increment
// dst by inc (a register or an immediate).
.macro store i, dst, inc
st1 {v\i\().8h}, [\dst], \inc
.endm
// movi on register v<i> with the given arrangement suffix (for example
// .16b) and immediate; lets .irp loops iterate over register numbers.
.macro movi_v i, size, imm
movi v\i\()\size, \imm
.endm
// Load eight 16-bit values into register v<i> from [src], then overwrite the
// same 16 bytes with the contents of v2 and post-increment src by inc.
// Callers set v2 to zero beforehand, so the source is cleared as it is read.
.macro load_clear i, src, inc
ld1 {v\i\().8h}, [\src]
st1 {v2.8h}, [\src], \inc
.endm
// Add eight rows of eight 16-bit residuals to 8-bit pixels.
// coef0-coef7 are .8h operands (clobbered). Each gets a rounding shift right
// by 6, is added to eight widened pixels and saturated back to 8 bits.
// Rows alternate between two pointers: coef0/2/4/6 use x0 and coef1/3/5/7
// use x3, both stepping by x1. Callers set x3 = x0 + stride and x1 = 2 *
// stride, which makes this eight consecutive rows. Both pointers are rewound
// by four steps before storing, so they end up four steps past their
// starting values.
// tmp1/tmp2 are .8b operands for the last two pixel rows. Callers pass the
// low halves of coef0/coef1, which is safe because those coefficients have
// already been narrowed into v2/v3 before tmp1/tmp2 are loaded.
// Clobbers v2-v7.
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
srshr \coef0, \coef0, #6
ld1 {v2.8b}, [x0], x1
srshr \coef1, \coef1, #6
ld1 {v3.8b}, [x3], x1
srshr \coef2, \coef2, #6
ld1 {v4.8b}, [x0], x1
srshr \coef3, \coef3, #6
uaddw \coef0, \coef0, v2.8b
ld1 {v5.8b}, [x3], x1
uaddw \coef1, \coef1, v3.8b
srshr \coef4, \coef4, #6
ld1 {v6.8b}, [x0], x1
srshr \coef5, \coef5, #6
ld1 {v7.8b}, [x3], x1
sqxtun v2.8b, \coef0
srshr \coef6, \coef6, #6
sqxtun v3.8b, \coef1
srshr \coef7, \coef7, #6
uaddw \coef2, \coef2, v4.8b
ld1 {\tmp1}, [x0], x1
uaddw \coef3, \coef3, v5.8b
ld1 {\tmp2}, [x3], x1
sqxtun v4.8b, \coef2
// All eight pixel rows are loaded; rewind both pointers for the stores.
sub x0, x0, x1, lsl #2
sub x3, x3, x1, lsl #2
sqxtun v5.8b, \coef3
uaddw \coef4, \coef4, v6.8b
st1 {v2.8b}, [x0], x1
uaddw \coef5, \coef5, v7.8b
st1 {v3.8b}, [x3], x1
sqxtun v6.8b, \coef4
st1 {v4.8b}, [x0], x1
sqxtun v7.8b, \coef5
st1 {v5.8b}, [x3], x1
uaddw \coef6, \coef6, \tmp1
st1 {v6.8b}, [x0], x1
uaddw \coef7, \coef7, \tmp2
st1 {v7.8b}, [x3], x1
sqxtun \tmp1, \coef6
sqxtun \tmp2, \coef7
st1 {\tmp1}, [x0], x1
st1 {\tmp2}, [x3], x1
.endm
// Emits the two per-slice helpers of the 16x16 transforms for one transform
// type (idct or iadst). Both are internal routines called with bl; they keep
// their return address in x14 because they call the 1-D transform with bl
// themselves, and they return with "ret x14" so the return is predicted as a
// function return and stays valid under Branch Target Identification.
.macro itxfm16_1d_funcs txfm
// Pass 1: transform an 8-column slice of the input, transpose it and write
// it to the temporary buffer.
//   x0 = output position in the temp buffer
//   x1 = slice offset (0 or 8); 8 selects the keep-in-registers special case
//   x2 = input coefficients for this slice (cleared while being read)
//   x9 = input stride in bytes between consecutive rows (32)
// Clobbers x14, v2, v3 and whatever the 1-D transform clobbers.
function \txfm\()16_1d_8x16_pass1_neon
mov x14, x30
// v2 is the zero source for load_clear.
movi v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
bl \txfm\()16
// Two 8x8 transposes: v16-v23 and v24-v31 each become one transposed block.
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
cmp x1, #8
b.eq 1f
// Regular slice: interleave the two blocks so every 32-byte buffer row
// receives one register from each block.
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
store \i, x0, #16
.endr
ret x14
1:
// Slice with x1 == 8: only the second block (v24-v31) is written, to the
// second half of each buffer row. The first block is not stored; it is
// moved into v24-v31, where the first pass-2 call (x3 == 0) picks it up
// without reloading.
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
add x0, x0, #16
store \i, x0, #16
.endr
mov v24.16b, v16.16b
mov v25.16b, v17.16b
mov v26.16b, v18.16b
mov v27.16b, v19.16b
mov v28.16b, v20.16b
mov v29.16b, v21.16b
mov v30.16b, v22.16b
mov v31.16b, v23.16b
ret x14
endfunc
// Pass 2: transform an 8-column slice of the temp buffer and add it to the
// destination.
//   x0 = destination pixels, x1 = destination stride in bytes
//   x2 = slice position in the temp buffer, x9 = temp buffer stride (32)
//   x3 = slice offset; when it is 0, v24-v31 are taken from the registers
//        left behind by pass 1 instead of being loaded
// Clobbers x1, x3 and x14 (x1 is doubled, x3 becomes x0 + stride).
function \txfm\()16_1d_8x16_pass2_neon
mov x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
cbz x3, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
1:
// Set up the two row pointers and the doubled stride for load_add_store.
add x3, x0, x1
lsl x1, x1, #1
bl \txfm\()16
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
ret x14
endfunc
.endm
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
// Emits ff_vp9_<txfm1>_<txfm2>_16x16_add_neon: a two-pass 16x16 inverse
// transform through a 512-byte temporary buffer on the stack, added to 8-bit
// pixels.
// Register usage as visible in the body:
//   x0 = destination pixels, x1 = destination stride in bytes
//   x2 = 16x16 block of 16-bit coefficients (cleared by pass 1)
//   w3 = only inspected by the idct_idct variant: 1 selects the DC-only
//        routine, <= 10 the quarter routine, <= 38 the half routine
//        (presumably the eob of the block - confirm against the C prototype)
// The return address is kept in x15 across the bl calls, and the function
// returns with "ret x15" so the return is predicted as a function return
// and stays valid under Branch Target Identification.
// x4/x5/x6 keep copies of x0/x1/x2, x9 = 32 is the row stride in bytes,
// x10 and x11 point at the coefficient tables.
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.eq idct16x16_dc_add_neon
.endif
mov x15, x30
// Every variant except idct_idct runs iadst16, which clobbers v8-v15; save
// their callee-saved low halves.
.ifnc \txfm1\()_\txfm2,idct_idct
stp d14, d15, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
.endif
// 16 x 16 x 2 bytes of temporary storage.
sub sp, sp, #512
mov x4, x0
mov x5, x1
mov x6, x2
movrel x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
movrel x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
ld1 {v0.8h,v1.8h}, [x10]
.endif
mov x9, #32
.ifc \txfm1\()_\txfm2,idct_idct
// The partial routines are entered with plain branches; they release the
// stack buffer and return through x15 themselves.
cmp w3, #10
b.le idct16x16_quarter_add_neon
cmp w3, #38
b.le idct16x16_half_add_neon
.endif
// Pass 1, one call per 8-column slice.
.irp i, 0, 8
add x0, sp, #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
// NOTE(review): w3 > 38 is already guaranteed here by the dispatch above and
// w3 is not modified in between, so this branch looks unreachable - confirm
// before removing it together with the zero-fill block at label 1.
cmp w3, #38
b.le 1f
.endif
.endif
mov x1, #\i
add x2, x6, #(\i*2)
bl \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
// iadst16 overwrote v0/v1; reload the idct coefficients for pass 2.
ld1 {v0.8h,v1.8h}, [x10]
.endif
.ifc \txfm1\()_\txfm2,idct_idct
b 3f
1:
// Skipped second slice: zero v24-v31 and store zeros to the second half of
// eight buffer rows (x9 == 32 is the increment).
add x0, x0, #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
movi_v \i, .16b, #0
st1 {v24.8h}, [x0], x9
.endr
3:
.endif
// Pass 2, one call per 8-column slice; x3 tells the helper which slice it is.
.irp i, 0, 8
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
mov x3, #\i
bl \txfm2\()16_1d_8x16_pass2_neon
.endr
add sp, sp, #512
.ifnc \txfm1\()_\txfm2,idct_idct
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
.endif
ret x15
endfunc
.endm
// Instantiate every 16x16 combination provided by this file.
itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
// Pass 1 of the quarter 16x16 idct: only four input rows are read (and
// cleared) before idct16_quarter runs.
//   x0 = temp buffer, x2 = input coefficients, x9 = row stride in bytes
// After the transposes only v24-v27 are stored, to the second half of the
// first four buffer rows. v16-v19 are not stored; they stay in registers
// for the first pass-2 call, which skips its loads when x3 == 0.
// Internal helper called with bl: the return address is kept in x14 across
// the nested bl and the function returns with "ret x14" so the return is
// predicted as a function return and stays valid under BTI.
function idct16_1d_8x16_pass1_quarter_neon
mov x14, x30
// v2 is the zero source for load_clear.
movi v2.8h, #0
.irp i, 16, 17, 18, 19
load_clear \i, x2, x9
.endr
bl idct16_quarter
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
add x0, x0, #16
.irp i, 24, 25, 26, 27
store \i, x0, x9
.endr
ret x14
endfunc
// Pass 2 of the quarter 16x16 idct for one 8-column slice.
//   x0 = destination pixels, x1 = destination stride in bytes
//   x2 = slice position in the temp buffer, x9 = temp buffer stride
//   x3 = slice offset; when it is 0 the four input rows are taken from
//        v16-v19 as left by pass 1 instead of being loaded
// Clobbers x1, x3 and x14 (x1 is doubled, x3 becomes x0 + stride).
// Returns with "ret x14" so the return is predicted as a function return
// and stays valid under BTI.
function idct16_1d_8x16_pass2_quarter_neon
mov x14, x30
cbz x3, 1f
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
1:
// Set up the two row pointers and the doubled stride for load_add_store.
add x3, x0, x1
lsl x1, x1, #1
bl idct16_quarter
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
ret x14
endfunc
// Pass 1 of the half 16x16 idct: only eight input rows are read (and
// cleared) before idct16_half runs.
//   x0 = temp buffer, x2 = input coefficients, x9 = row stride in bytes
// After the transposes only v24-v31 are stored, to the second half of the
// first eight buffer rows. v16-v23 are not stored; they stay in registers
// for the first pass-2 call, which skips its loads when x3 == 0.
// Internal helper called with bl: the return address is kept in x14 across
// the nested bl and the function returns with "ret x14" so the return is
// predicted as a function return and stays valid under BTI.
function idct16_1d_8x16_pass1_half_neon
mov x14, x30
// v2 is the zero source for load_clear.
movi v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load_clear \i, x2, x9
.endr
bl idct16_half
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
add x0, x0, #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
store \i, x0, x9
.endr
ret x14
endfunc
// Pass 2 of the half 16x16 idct for one 8-column slice.
//   x0 = destination pixels, x1 = destination stride in bytes
//   x2 = slice position in the temp buffer, x9 = temp buffer stride
//   x3 = slice offset; when it is 0 the eight input rows are taken from
//        v16-v23 as left by pass 1 instead of being loaded
// Clobbers x1, x3 and x14 (x1 is doubled, x3 becomes x0 + stride).
// Returns with "ret x14" so the return is predicted as a function return
// and stays valid under BTI.
function idct16_1d_8x16_pass2_half_neon
mov x14, x30
cbz x3, 1f
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
1:
// Set up the two row pointers and the doubled stride for load_add_store.
add x3, x0, x1
lsl x1, x1, #1
bl idct16_half
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
ret x14
endfunc
// Emits idct16x16_<size>_add_neon, the continuation of the 16x16 idct_idct
// entry point for sparse blocks. It is entered with a branch (not a call)
// after the entry point has allocated the 512-byte stack buffer and set up
// x4/x5/x6 (copies of dst, stride, coefficients), x9, x10, v0/v1 and the
// return address in x15.
// A single pass-1 call is made, followed by both pass-2 slices. The stack
// buffer is then released and control returns to the original caller with
// "ret x15", so the return is predicted as a function return and stays valid
// under BTI. No d8-d15 restore is needed because the idct_idct entry point
// does not save them.
.macro idct16_partial size
function idct16x16_\size\()_add_neon
add x0, sp, #(0*32)
add x2, x6, #(0*2)
bl idct16_1d_8x16_pass1_\size\()_neon
.irp i, 0, 8
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
mov x3, #\i
bl idct16_1d_8x16_pass2_\size\()_neon
.endr
add sp, sp, #512
ret x15
endfunc
.endm
idct16_partial quarter
idct16_partial half
// DC-only 32x32 idct + add, reached by a tail branch from the 32x32 entry
// point when w3 == 1.
//   x0 = destination pixels, x1 = stride in bytes,
//   x2 = coefficient block (only the first 16-bit value is read; it is then
//        overwritten with zero)
// The value is scaled by v0.h[0] twice with rounding shifts by 14, then
// rounded and shifted right by 6 into v0 and added to all 32x32 pixels with
// unsigned saturation. Clobbers x3, x4, v0-v4 and v16-v23.
function idct32x32_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
movi v1.4h, #0
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
dup v2.8h, v2.h[0]
st1 {v1.h}[0], [x2]
// The constant moves to v0 because v1-v4 are needed for pixel data below.
srshr v0.8h, v2.8h, #6
// x0 is the read pointer, x3 the write pointer; x4 counts remaining rows.
mov x3, x0
mov x4, #32
1:
// Two 32-pixel rows per iteration. The flags set by subs survive until the
// b.ne because none of the NEON or load/store instructions modify them.
subs x4, x4, #2
ld1 {v1.16b,v2.16b}, [x0], x1
uaddw v16.8h, v0.8h, v1.8b
uaddw2 v17.8h, v0.8h, v1.16b
ld1 {v3.16b,v4.16b}, [x0], x1
uaddw v18.8h, v0.8h, v2.8b
uaddw2 v19.8h, v0.8h, v2.16b
uaddw v20.8h, v0.8h, v3.8b
uaddw2 v21.8h, v0.8h, v3.16b
uaddw v22.8h, v0.8h, v4.8b
uaddw2 v23.8h, v0.8h, v4.16b
sqxtun v1.8b, v16.8h
sqxtun2 v1.16b, v17.8h
sqxtun v2.8b, v18.8h
sqxtun2 v2.16b, v19.8h
sqxtun v3.8b, v20.8h
sqxtun2 v3.16b, v21.8h
st1 {v1.16b,v2.16b}, [x3], x1
sqxtun v4.8b, v22.8h
sqxtun2 v4.16b, v23.8h
st1 {v3.16b,v4.16b}, [x3], x1
b.ne 1b
ret
endfunc
// Shared tail of idct32_odd, idct32_odd_half and idct32_odd_quarter. Takes
// the intermediate values those functions leave in v4-v7 and v20-v31,
// finishes the odd half of the 32-point idct so that the 16 results end up
// in v16-v31, and returns to the caller (the macro ends in ret).
// Uses v0.h[0], v0.h[2] and v0.h[3]; v2-v7 are used as temporaries.
.macro idct32_end
butterfly_8h v16, v5, v4, v5
butterfly_8h v17, v20, v23, v20
butterfly_8h v18, v6, v7, v6
butterfly_8h v19, v21, v22, v21
butterfly_8h v4, v28, v28, v30
butterfly_8h v23, v26, v25, v26
butterfly_8h v7, v3, v29, v31
butterfly_8h v22, v27, v24, v27
// v24, v25, v30 and v31 are free here and serve as 32-bit temporaries.
dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31
dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31
dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1
dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1
butterfly_8h v31, v24, v7, v4
butterfly_8h v30, v25, v22, v23
butterfly_8h_r v23, v16, v16, v18
butterfly_8h_r v22, v17, v17, v19
butterfly_8h v18, v21, v27, v21
butterfly_8h_r v27, v28, v5, v28
butterfly_8h v29, v26, v20, v26
butterfly_8h v19, v20, v3, v6
dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7
dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7
dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7
dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7
ret
.endm
// Odd half of the one-dimensional 32-point idct. Input is v16-v31 (the
// callers load every other row, starting at the second one, into these
// registers); results are returned in v16-v31.
// Expects v8/v9 to hold the last 16 values of idct_coeffs and v0 its first
// row. Internal helper called with bl; returns through the ret in
// idct32_end. Clobbers v2-v7.
function idct32_odd
dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7
dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7
dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7
dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7
dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7
dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7
dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7
dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7
butterfly_8h v4, v24, v16, v24
butterfly_8h v5, v20, v28, v20
butterfly_8h v6, v26, v18, v26
butterfly_8h v7, v22, v30, v22
butterfly_8h v28, v25, v17, v25
butterfly_8h v30, v21, v29, v21
butterfly_8h v29, v23, v31, v23
butterfly_8h v31, v27, v19, v27
// v16-v19 are free here and serve as 32-bit temporaries.
dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19
dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1
dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19
dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1
idct32_end
endfunc
// Variant of idct32_odd whose first stage only reads v16-v23: the _h1/_h2
// macro forms are used, which treat the partner input (v24-v31) as zero.
// From the first butterfly_8h onwards the code is identical to idct32_odd.
// Results are returned in v16-v31. Expects v8/v9 and v0 as in idct32_odd.
// Clobbers v2-v7.
function idct32_odd_half
dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7
dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7
dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7
dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7
dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7
dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7
dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7
dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7
butterfly_8h v4, v24, v16, v24
butterfly_8h v5, v20, v28, v20
butterfly_8h v6, v26, v18, v26
butterfly_8h v7, v22, v30, v22
butterfly_8h v28, v25, v17, v25
butterfly_8h v30, v21, v29, v21
butterfly_8h v29, v23, v31, v23
butterfly_8h v31, v27, v19, v27
dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19
dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1
dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19
dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1
idct32_end
endfunc
// Variant of idct32_odd that only reads v16-v19 as input. Single products
// (dsmull_h + drshrn_h) replace the first-stage butterflies, and the
// intermediate registers expected by idct32_end are produced directly.
// Results are returned in v16-v31. Expects v8/v9 and v0 as in idct32_odd.
// Clobbers v4-v7.
function idct32_odd_quarter
dsmull_h v4, v5, v16, v8.h[0]
dsmull_h v28, v29, v19, v8.h[7]
dsmull_h v30, v31, v16, v8.h[1]
dsmull_h v22, v23, v17, v9.h[6]
dsmull_h v7, v6, v17, v9.h[7]
dsmull_h v26, v27, v19, v8.h[6]
dsmull_h v20, v21, v18, v9.h[0]
dsmull_h v24, v25, v18, v9.h[1]
// Two of the products are needed with the opposite sign.
neg v28.4s, v28.4s
neg v29.4s, v29.4s
neg v7.4s, v7.4s
neg v6.4s, v6.4s
drshrn_h v4, v4, v5, #14
drshrn_h v5, v28, v29, #14
drshrn_h v29, v30, v31, #14
drshrn_h v28, v22, v23, #14
drshrn_h v7, v7, v6, #14
drshrn_h v31, v26, v27, #14
drshrn_h v6, v20, v21, #14
drshrn_h v30, v24, v25, #14
// The inputs v16-v19 are fully consumed; reuse them as 32-bit temporaries.
dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
drshrn_h v23, v16, v17, #14
drshrn_h v24, v18, v19, #14
neg v20.4s, v20.4s
neg v21.4s, v21.4s
drshrn_h v27, v27, v26, #14
drshrn_h v20, v20, v21, #14
dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
drshrn_h v21, v16, v17, #14
drshrn_h v26, v18, v19, #14
dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
drshrn_h v25, v16, v17, #14
neg v18.4s, v18.4s
neg v19.4s, v19.4s
drshrn_h v22, v18, v19, #14
idct32_end
endfunc
// Emits the two per-slice helpers of the 32x32 idct. The suffix is empty,
// _quarter or _half and selects how many input rows are read (16, 4 or 8 per
// half) and which 1-D routines are called.
// Both helpers are internal routines called with bl; they keep their return
// address in x14 across the nested bl calls and return with "ret x14" so the
// return is predicted as a function return and stays valid under Branch
// Target Identification.
// Expected on entry (set up by the 32x32 entry point): v0/v1 and v8/v9 hold
// idct_coeffs, x9 = 128 (two rows of 64 bytes) and, for pass 2, x7 = -x9.
.macro idct32_funcs suffix
// Pass 1: transform an 8-column slice of the input and write the transposed
// result to the temp buffer.
//   x0 = output position in the temp buffer (unchanged net of the first half,
//        advanced by 512 after the second half)
//   x2 = input coefficients for this slice (cleared while being read)
function idct32_1d_8x32_pass1\suffix\()_neon
mov x14, x30
// v2 is the zero source for load_clear.
movi v2.8h, #0
// Even half: every other input row, starting with the first one.
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load_clear \i, x2, x9
.endr
.endif
bl idct16\suffix
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
// Store a and b, followed by b and a with their eight elements reversed
// (rev64 reverses each 64-bit half, ext by 8 bytes swaps the halves).
.macro store_rev a, b
rev64 v3.8h, \b
st1 {\a}, [x0], #16
rev64 v2.8h, \a
ext v3.16b, v3.16b, v3.16b, #8
st1 {\b}, [x0], #16
ext v2.16b, v2.16b, v2.16b, #8
st1 {v3.8h}, [x0], #16
st1 {v2.8h}, [x0], #16
.endm
store_rev v16.8h, v24.8h
store_rev v17.8h, v25.8h
store_rev v18.8h, v26.8h
store_rev v19.8h, v27.8h
store_rev v20.8h, v28.8h
store_rev v21.8h, v29.8h
store_rev v22.8h, v30.8h
store_rev v23.8h, v31.8h
// Rewind x0 over the 8 x 64 bytes just written.
sub x0, x0, #512
.purgem store_rev
// Rewind x2 over the rows read above, then step one row (64 bytes) forward
// to the first odd row.
.ifb \suffix
sub x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
sub x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
sub x2, x2, x9, lsl #3
.endif
add x2, x2, #64
// store_rev used v2 as scratch; make it zero again for load_clear.
movi v2.8h, #0
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load_clear \i, x2, x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load_clear \i, x2, x9
.endr
.endif
bl idct32_odd\suffix
// The odd results are transposed with the register order reversed.
transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
// Combine with the even half already in the buffer: a and b are added to the
// first two 16-byte groups of a row, the reversed b and a are subtracted
// from the last two.
.macro store_rev a, b
ld1 {v4.8h}, [x0]
rev64 v3.8h, \b
add v4.8h, v4.8h, \a
rev64 v2.8h, \a
st1 {v4.8h}, [x0], #16
ext v3.16b, v3.16b, v3.16b, #8
ld1 {v5.8h}, [x0]
ext v2.16b, v2.16b, v2.16b, #8
add v5.8h, v5.8h, \b
st1 {v5.8h}, [x0], #16
ld1 {v6.8h}, [x0]
sub v6.8h, v6.8h, v3.8h
st1 {v6.8h}, [x0], #16
ld1 {v7.8h}, [x0]
sub v7.8h, v7.8h, v2.8h
st1 {v7.8h}, [x0], #16
.endm
store_rev v31.8h, v23.8h
store_rev v30.8h, v22.8h
store_rev v29.8h, v21.8h
store_rev v28.8h, v20.8h
store_rev v27.8h, v19.8h
store_rev v26.8h, v18.8h
store_rev v25.8h, v17.8h
store_rev v24.8h, v16.8h
.purgem store_rev
ret x14
endfunc
// Pass 2: transform an 8-column slice of the temp buffer and add it to the
// destination.
//   x0 = destination pixels, x1 = destination stride in bytes
//   x2 = slice position in the temp buffer
// Clobbers v10/v11 in addition to the low registers; the 32x32 entry point
// saves d8-d11 for that reason.
function idct32_1d_8x32_pass2\suffix\()_neon
mov x14, x30
// Even half: load every other buffer row, then rewind x2.
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
.endif
bl idct16\suffix
// Park the 16 even-half results in the buffer rows they were read from.
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
store \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
add x2, x2, #64
// Odd half: same pattern, starting one row (64 bytes) further down.
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
.endif
// Back to the first parked even-half row.
sub x2, x2, #64
bl idct32_odd\suffix
// Combine four parked even-half rows with four odd-half registers (added
// while walking forwards with x9, or subtracted while walking backwards with
// x7 when neg is set), round and shift right by 6, add to four rows of eight
// pixels and store them with unsigned saturation.
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
ld1 {v4.8h}, [x2], x9
ld1 {v5.8h}, [x2], x9
add v4.8h, v4.8h, \a
ld1 {v6.8h}, [x2], x9
add v5.8h, v5.8h, \b
ld1 {v7.8h}, [x2], x9
add v6.8h, v6.8h, \c
add v7.8h, v7.8h, \d
.else
ld1 {v4.8h}, [x2], x7
ld1 {v5.8h}, [x2], x7
sub v4.8h, v4.8h, \a
ld1 {v6.8h}, [x2], x7
sub v5.8h, v5.8h, \b
ld1 {v7.8h}, [x2], x7
sub v6.8h, v6.8h, \c
sub v7.8h, v7.8h, \d
.endif
ld1 {v10.8b}, [x0], x1
ld1 {v11.8b}, [x0], x1
srshr v4.8h, v4.8h, #6
ld1 {v2.8b}, [x0], x1
srshr v5.8h, v5.8h, #6
uaddw v4.8h, v4.8h, v10.8b
ld1 {v3.8b}, [x0], x1
srshr v6.8h, v6.8h, #6
uaddw v5.8h, v5.8h, v11.8b
srshr v7.8h, v7.8h, #6
// Rewind x0 over the four pixel rows just read.
sub x0, x0, x1, lsl #2
uaddw v6.8h, v6.8h, v2.8b
sqxtun v4.8b, v4.8h
uaddw v7.8h, v7.8h, v3.8b
sqxtun v5.8b, v5.8h
st1 {v4.8b}, [x0], x1
sqxtun v6.8b, v6.8h
st1 {v5.8b}, [x0], x1
sqxtun v7.8b, v7.8h
st1 {v6.8b}, [x0], x1
st1 {v7.8b}, [x0], x1
.endm
// First 16 output rows: parked rows in forward order plus the odd results in
// descending register order.
load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
// Step back onto the last parked row and walk the buffer backwards for the
// remaining 16 output rows.
sub x2, x2, x9
load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
ret x14
endfunc
.endm
idct32_funcs
idct32_funcs _quarter
idct32_funcs _half
// Thresholds compared against w3 before each pass-1 slice after the first in
// ff_vp9_idct_idct_32x32_add_neon: when w3 is not greater than the entry,
// that slice and all following ones are replaced by zero fill.
// The entry point addresses the table at an offset of 2 bytes, so the
// leading 0 is never read.
const min_eob_idct_idct_32, align=4
.short 0, 34, 135, 336
endconst
// 32x32 idct in both directions, added to 8-bit pixels, using a 2048-byte
// temporary buffer on the stack.
// Register usage as visible in the body:
//   x0 = destination pixels, x1 = destination stride in bytes
//   x2 = 32x32 block of 16-bit coefficients (cleared by pass 1)
//   w3 = 1 selects the DC-only routine, <= 34 the quarter routine, <= 135
//        the half routine; larger values are compared against
//        min_eob_idct_idct_32 to skip trailing pass-1 slices (presumably the
//        eob of the block - confirm against the C prototype)
// The return address is kept in x15 across the bl calls, and the function
// returns with "ret x15" so the return is predicted as a function return
// and stays valid under Branch Target Identification.
// x4/x5/x6 keep copies of x0/x1/x2; x9 = 128 and x7 = -128 are the forward
// and backward double-row strides; x10 and x12 walk the constant tables.
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp w3, #1
b.eq idct32x32_dc_add_neon
movrel x10, idct_coeffs
mov x15, x30
// v8/v9 hold coefficients and v10/v11 are scratch in pass 2; save their
// callee-saved low halves.
stp d10, d11, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
// 32 x 32 x 2 bytes of temporary storage.
sub sp, sp, #2048
mov x4, x0
mov x5, x1
mov x6, x2
mov x9, #128
neg x7, x9
ld1 {v0.8h,v1.8h}, [x10], #32
ld1 {v8.8h,v9.8h}, [x10]
// The partial routines are entered with plain branches; they release the
// stack buffer, restore d8-d11 and return through x15 themselves.
cmp w3, #34
b.le idct32x32_quarter_add_neon
cmp w3, #135
b.le idct32x32_half_add_neon
movrel x12, min_eob_idct_idct_32, 2
// Pass 1, one call per 8-column slice. Before every slice but the first,
// w3 is checked against the next threshold. x1 is preloaded with the number
// of zero-fill iterations needed for the remaining slices; the mov sits
// between cmp and b.le, which is fine because mov leaves the flags alone.
.irp i, 0, 8, 16, 24
add x0, sp, #(\i*64)
.if \i > 0
ldrh w1, [x12], #2
cmp w3, w1
mov x1, #(32 - \i)/4
b.le 1f
.endif
add x2, x6, #(\i*2)
bl idct32_1d_8x32_pass1_neon
.endr
b 3f
1:
// Zero the rest of the temp buffer, 256 bytes per iteration.
movi v16.8h, #0
movi v17.8h, #0
movi v18.8h, #0
movi v19.8h, #0
2:
subs x1, x1, #1
.rept 4
st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
.endr
b.ne 2b
3:
// Pass 2, one call per 8-column slice.
.irp i, 0, 8, 16, 24
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
bl idct32_1d_8x32_pass2_neon
.endr
add sp, sp, #2048
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ret x15
endfunc
// Emits idct32x32_<size>_add_neon, the continuation of
// ff_vp9_idct_idct_32x32_add_neon for sparse blocks. It is entered with a
// branch (not a call) after the entry point has saved d8-d11, allocated the
// 2048-byte stack buffer and set up x4/x5/x6, x7, x9, v0/v1, v8/v9 and the
// return address in x15.
// The quarter variant runs one pass-1 slice, the half variant two; all four
// pass-2 slices are always run. The epilogue mirrors the entry point, and
// the final "ret x15" keeps the return predicted as a function return and
// valid under Branch Target Identification.
.macro idct32_partial size
function idct32x32_\size\()_add_neon
add x0, sp, #(0*64)
add x2, x6, #(0*2)
bl idct32_1d_8x32_pass1_\size\()_neon
.ifc \size,half
add x0, sp, #(8*64)
add x2, x6, #(8*2)
bl idct32_1d_8x32_pass1_\size\()_neon
.endif
.irp i, 0, 8, 16, 24
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
bl idct32_1d_8x32_pass2_\size\()_neon
.endr
add sp, sp, #2048
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ret x15
endfunc
.endm
idct32_partial quarter
idct32_partial half