#include "libavutil/aarch64/asm.S"

// Load the 32-bit y_offset and y_coeff stack arguments into w9 and w10.
// Apple's arm64 ABI packs sub-8-byte stack arguments, so the two ints share
// one 8-byte slot and a single ldp at \yoff suffices; standard AAPCS64 gives
// each argument its own slot, hence the two separate loads.
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
    ldp         w9, w10, [sp, #\yoff]
#else
    ldr         w9,  [sp, #\yoff]
    ldr         w10, [sp, #\ycoeff]
#endif
.endm

// nv12/nv21: x6 holds the interleaved chroma plane, w7 its linesize.
.macro load_args_nv12
    ldr         x8,  [sp]                          // table
    load_yoff_ycoeff 8, 16                         // y_offset, y_coeff
    ld1         {v1.1D}, [x8]                      // coefficients: v2r, u2g, v2g, u2b
    dup         v0.8H, w10                         // y_coeff
    dup         v3.8H, w9                          // y_offset
    sub         w3, w3, w0, lsl #2                 // w3  = linesize  - width * 4 (padding)
    sub         w5, w5, w0                         // w5  = linesizeY - width     (paddingY)
    sub         w7, w7, w0                         // w7  = linesizeC - width     (paddingC)
    neg         w11, w0                            // w11 = -width, rewinds srcC when a chroma line is reused
.endm

.macro load_args_nv21
    load_args_nv12                                 // same argument layout as nv12
.endm

// Planar 4:2:0: x6/w7 hold srcU/linesizeU; srcV, linesizeV and the table are
// passed on the stack.
.macro load_args_yuv420p
    ldr         x13, [sp]                          // srcV
    ldr         w14, [sp, #8]                      // linesizeV
    ldr         x8,  [sp, #16]                     // table
    load_yoff_ycoeff 24, 32                        // y_offset, y_coeff
    ld1         {v1.1D}, [x8]                      // coefficients: v2r, u2g, v2g, u2b
    dup         v0.8H, w10                         // y_coeff
    dup         v3.8H, w9                          // y_offset
    sub         w3, w3, w0, lsl #2                 // w3  = linesize  - width * 4 (padding)
    sub         w5, w5, w0                         // w5  = linesizeY - width     (paddingY)
    sub         w7, w7, w0, lsr #1                 // w7  = linesizeU - width / 2 (paddingU)
    sub         w14, w14, w0, lsr #1               // w14 = linesizeV - width / 2 (paddingV)
    lsr         w11, w0, #1
    neg         w11, w11                           // w11 = -(width / 2), rewinds srcU/srcV when a chroma line is reused
.endm

// Same as yuv420p, but 4:2:2 chroma is not vertically subsampled, so no
// rewind offset (w11) is needed.
.macro load_args_yuv422p
    ldr         x13, [sp]                          // srcV
    ldr         w14, [sp, #8]                      // linesizeV
    ldr         x8,  [sp, #16]                     // table
    load_yoff_ycoeff 24, 32                        // y_offset, y_coeff
    ld1         {v1.1D}, [x8]                      // coefficients: v2r, u2g, v2g, u2b
    dup         v0.8H, w10                         // y_coeff
    dup         v3.8H, w9                          // y_offset
    sub         w3, w3, w0, lsl #2                 // w3  = linesize  - width * 4 (padding)
    sub         w5, w5, w0                         // w5  = linesizeY - width     (paddingY)
    sub         w7, w7, w0, lsr #1                 // w7  = linesizeU - width / 2 (paddingU)
    sub         w14, w14, w0, lsr #1               // w14 = linesizeV - width / 2 (paddingV)
.endm

// load_chroma_*: fetch 8 U and 8 V samples (covering 16 pixels) and widen
// them to 16 bits scaled by 1 << 3: v18 = U * 8, v19 = V * 8.
.macro load_chroma_nv12
    ld2         {v16.8B, v17.8B}, [x6], #16        // deinterleave: v16 = U, v17 = V
    ushll       v18.8H, v16.8B, #3                 // U * (1 << 3)
    ushll       v19.8H, v17.8B, #3                 // V * (1 << 3)
.endm

.macro load_chroma_nv21
    ld2         {v16.8B, v17.8B}, [x6], #16        // deinterleave: v16 = V, v17 = U
    ushll       v19.8H, v16.8B, #3                 // V * (1 << 3)
    ushll       v18.8H, v17.8B, #3                 // U * (1 << 3)
.endm

.macro load_chroma_yuv420p
    ld1         {v16.8B}, [x6],  #8                // U plane
    ld1         {v17.8B}, [x13], #8                // V plane
    ushll       v18.8H, v16.8B, #3                 // U * (1 << 3)
    ushll       v19.8H, v17.8B, #3                 // V * (1 << 3)
.endm

.macro load_chroma_yuv422p
    load_chroma_yuv420p
.endm
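
// increment_*: step the chroma pointer(s) at the end of a row. The 4:2:0
// inputs (nv12, nv21, yuv420p) share each chroma line between two luma rows,
// so the pointer alternates between advancing to the next chroma line and
// rewinding to the start of the current one; yuv422p has one chroma line per
// luma row and always advances.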
.macro increment_nv12
    ands        w15, w1, #1                        // test the remaining row count (w1 not yet decremented)
    csel        w16, w7, w11, ne                   // odd:  paddingC (next chroma line); even: -width (rewind)
    add         x6, x6, w16, SXTW
.endm

.macro increment_nv21
    increment_nv12
.endm

.macro increment_yuv420p
    ands        w15, w1, #1                        // as for nv12, but for both half-width chroma planes
    csel        w16, w7,  w11, ne
    csel        w17, w14, w11, ne
    add         x6,  x6,  w16, SXTW
    add         x13, x13, w17, SXTW
.endm

.macro increment_yuv422p
    add         x6,  x6,  w7,  UXTW                // srcU += paddingU
    add         x13, x13, w14, UXTW                // srcV += paddingV
.endm
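
// compute_rgba: add the luma term (v26 = Y1, v27 = Y2) to the per-channel
// chroma terms already sitting in v20-v25, then narrow with rounding (>> 1)
// and unsigned saturation back to 8 bits. The macro arguments select which
// output registers receive R, G, B and A, which is what fixes the byte order
// of the packed pixel stored by st4 below.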
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
    add         v20.8H, v26.8H, v20.8H             // Y1 + R1
    add         v21.8H, v27.8H, v21.8H             // Y2 + R2
    add         v22.8H, v26.8H, v22.8H             // Y1 + G1
    add         v23.8H, v27.8H, v23.8H             // Y2 + G2
    add         v24.8H, v26.8H, v24.8H             // Y1 + B1
    add         v25.8H, v27.8H, v25.8H             // Y2 + B2
    sqrshrun    \r1, v20.8H, #1                    // clip_u8((Y1 + R1) >> 1)
    sqrshrun    \r2, v21.8H, #1                    // clip_u8((Y2 + R2) >> 1)
    sqrshrun    \g1, v22.8H, #1                    // clip_u8((Y1 + G1) >> 1)
    sqrshrun    \g2, v23.8H, #1                    // clip_u8((Y2 + G2) >> 1)
    sqrshrun    \b1, v24.8H, #1                    // clip_u8((Y1 + B1) >> 1)
    sqrshrun    \b2, v25.8H, #1                    // clip_u8((Y2 + B2) >> 1)
    movi        \a1, #255                          // opaque alpha
    movi        \a2, #255
.endm
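
// declare_func: emit ff_<ifmt>_to_<ofmt>_neon. The inner loop (label 2)
// converts 16 pixels per iteration: 8 U/V pairs are widened, bias-corrected
// and multiplied by the table coefficients, each chroma term is duplicated
// with zip1/zip2 so it covers the two luma samples that share it, then 16
// luma samples are offset/scaled and combined in compute_rgba. The outer
// loop (label 1) advances all pointers to the next row.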
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
1:
    mov         w8, w0                             // w8 = width (pixels left in this row)
2:
    movi        v5.8H, #4, lsl #8                  // 128 * (1 << 3), the chroma bias
    load_chroma_\ifmt
    sub         v18.8H, v18.8H, v5.8H              // U * (1 << 3) - 128 * (1 << 3)
    sub         v19.8H, v19.8H, v5.8H              // V * (1 << 3) - 128 * (1 << 3)
    sqdmulh     v20.8H, v19.8H, v1.H[0]            // V * v2r            (R)
    sqdmulh     v22.8H, v18.8H, v1.H[1]            // U * u2g
    sqdmulh     v19.8H, v19.8H, v1.H[2]            //           V * v2g
    add         v22.8H, v22.8H, v19.8H             // U * u2g + V * v2g  (G)
    sqdmulh     v24.8H, v18.8H, v1.H[3]            // U * u2b            (B)
    zip2        v21.8H, v20.8H, v20.8H             // R2 (pixels 8-15)
    zip1        v20.8H, v20.8H, v20.8H             // R1 (pixels 0-7)
    zip2        v23.8H, v22.8H, v22.8H             // G2
    zip1        v22.8H, v22.8H, v22.8H             // G1
    zip2        v25.8H, v24.8H, v24.8H             // B2
    zip1        v24.8H, v24.8H, v24.8H             // B1
    ld1         {v2.16B}, [x4], #16                // load 16 luma samples
    ushll       v26.8H, v2.8B,  #3                 // Y1 * (1 << 3)
    ushll2      v27.8H, v2.16B, #3                 // Y2 * (1 << 3)
    sub         v26.8H, v26.8H, v3.8H              // Y1 * (1 << 3) - y_offset
    sub         v27.8H, v27.8H, v3.8H              // Y2 * (1 << 3) - y_offset
    sqdmulh     v26.8H, v26.8H, v0.8H              // ((Y1 * (1 << 3) - y_offset) * y_coeff) >> 15
    sqdmulh     v27.8H, v27.8H, v0.8H              // ((Y2 * (1 << 3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // byte order A R G B
    compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
.endif
.ifc \ofmt,rgba // byte order R G B A
    compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
.endif
.ifc \ofmt,abgr // byte order A B G R
    compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
.endif
.ifc \ofmt,bgra // byte order B G R A
    compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
.endif
    st4         { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32    // store pixels 0-7
    st4         {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32    // store pixels 8-15
    subs        w8, w8, #16                        // width -= 16
    b.gt        2b
    add         x2, x2, w3, UXTW                   // dst  += padding
    add         x4, x4, w5, UXTW                   // srcY += paddingY
    increment_\ifmt
    subs        w1, w1, #1                         // height -= 1
    b.gt        1b
    ret
endfunc
.endm

.macro declare_rgb_funcs ifmt
    declare_func \ifmt, argb
    declare_func \ifmt, rgba
    declare_func \ifmt, abgr
    declare_func \ifmt, bgra
.endm

declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p