/* root/src/lib/asm_scale.S */

/* [<][>][^][v][top][bottom][index][help] */
#include <config.h>
#include "asm.h"

#ifdef DO_MMX_ASM

/*\ 
|*| MMX assembly scaling routine for Imlib2
|*| Written by Willem Monsuwe <willem@stack.nl>
\*/

.text
        .align 8
FN_(imlib_Scale_mmx_AARGBA)

/*\ Prototype: __imlib_Scale_mmx_AARGBA(ImlibScaleInfo *isi, DATA32 *dest,
|*|     int dxx, int dyy, int dx, int dy, int dw, int dh, int dow, int sow)
\*/

#define isi     8(%ebp)
#define dest    12(%ebp)
#define dxx     16(%ebp)
#define dyy     20(%ebp)
#define dx      24(%ebp)
#define dy      28(%ebp)
#define dw      32(%ebp)
#define dh      36(%ebp)
#define dow     40(%ebp)
#define sow     44(%ebp)

/*\ Local variables that didn't fit in registers \*/
#define y       -4(%ebp)
#define yp      -8(%ebp)
#define yap     -12(%ebp)
#define xp      -16(%ebp)
#define xap     -20(%ebp)
#define Cx      -24(%ebp)
#define Mx      -28(%ebp)
#define Cy      -32(%ebp)
#define My      -36(%ebp)
#define sow_4   -40(%ebp)

/*\ When %edx points to ImlibScaleInfo, these are the members \*/
#define xpoints         (%edx)
#define ypoints         4(%edx)
#define xapoints        8(%edx)
#define yapoints        12(%edx)
#define xup_yup         16(%edx)

PR_(imlib_Scale_mmx_AARGBA):
        pushl %ebp
        movl %esp, %ebp
        subl $40, %esp
        pushl %ebx
        pushl %ecx
        pushl %edx
        pushl %edi
        pushl %esi
        movl isi, %edx

        /*\ Check (dw > 0) && (dh > 0) \*/
        cmpl $0, dw
        jle .scale_leave
        cmpl $0, dh
        jle .scale_leave

        /*\ X-based array pointers point to the end; we're looping up to 0 \*/
        /*\ %edi = dest + dow * dy + dx + dw \*/
        movl dow, %eax
        imull dy, %eax
        addl dx, %eax
        addl dw, %eax
        movl dest, %edi
        leal (%edi, %eax, 4), %edi
        /*\ xp = xpoints + dxx + dw \*/
        movl dxx, %ebx
        addl dw, %ebx
        movl xpoints, %eax
        leal (%eax, %ebx, 4), %eax
        movl %eax, xp
        /*\ xap = xapoints + dxx + dw \*/
        movl xapoints, %eax
        leal (%eax, %ebx, 4), %eax
        movl %eax, xap
        /*\ y = dh \*/
        movl dh, %eax
        movl %eax, y
        /*\ yp = ypoints + dyy \*/
        movl dyy, %ebx
        movl ypoints, %eax
        leal (%eax, %ebx, 4), %eax
        movl %eax, yp
        /*\ yap = yapoints + dyy \*/
        movl yapoints, %eax
        leal (%eax, %ebx, 4), %eax
        movl %eax, yap

        pxor %mm7, %mm7

        /*\ Test xup bit \*/
        movl xup_yup, %eax
        sarl $1, %eax
        jnc .scale_x_down

.scale_x_up:
        /*\ Test yup bit \*/
        sarl $1, %eax
        jnc .scale_x_up_y_down


/*\ Scaling up both ways \*/

.scale_x_up_y_up:
        movl sow, %ebx

.up_up_loop_y:

        /*\ x = -dw \*/
        movl dw, %ecx
        negl %ecx

        /*\ %eax = *yap << 4 \*/
        movl yap, %eax
        movl (%eax), %eax
        sall $4, %eax
        jz .up_up_yap_0
        movd %eax, %mm1
        punpcklwd %mm1, %mm1
        punpckldq %mm1, %mm1

.up_up_loop1_x:
        /*\ %esi = *yp + xp[x] \*/
        movl yp, %eax
        movl (%eax), %esi
        movl xp, %eax
        movl (%eax, %ecx, 4), %eax
        leal (%esi, %eax, 4), %esi

        /*\ %eax = xap[x] << 4 \*/
        movl xap, %eax
        movl (%eax, %ecx, 4), %eax
        sall $4, %eax
        jz .up_up_xap_0

        /*\ %mm0 = xap[x] << 4 \*/
        movd %eax, %mm0
        punpcklwd %mm0, %mm0
        punpckldq %mm0, %mm0

        /*\ Load and unpack four pixels in parralel
        |*| %mm2 = ptr[0],   %mm3 = ptr[1]
        |*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
        \*/
        movq (%esi), %mm2
        movq (%esi, %ebx, 4), %mm4
        movq %mm2, %mm3
        movq %mm4, %mm5
        punpcklbw %mm7, %mm2
        punpcklbw %mm7, %mm4
        punpckhbw %mm7, %mm3
        punpckhbw %mm7, %mm5

        /*\ X interpolation: r = l + (r - l) * xap \*/
        psubw %mm2, %mm3
        psubw %mm4, %mm5
        psllw $4, %mm3
        psllw $4, %mm5
        pmulhw %mm0, %mm3
        pmulhw %mm0, %mm5
        paddw %mm2, %mm3
        paddw %mm4, %mm5
        /*\ Now %mm3 = I(ptr[0], ptr[1]), %mm5 = I(ptr[sow], ptr[sow + 1]) \*/
        jmp .up_up_common
.up_up_xap_0:
        /*\ Load and unpack two pixels
        |*| %mm3 = ptr[0], %mm5 = ptr[sow]
        \*/
        movd (%esi), %mm3
        movd (%esi, %ebx, 4), %mm5
        punpcklbw %mm7, %mm3
        punpcklbw %mm7, %mm5
.up_up_common:
        /*\ Y interpolation: d = u + (d - u) * yap \*/
        psubw %mm3, %mm5
        psllw $4, %mm5
        pmulhw %mm1, %mm5
        paddw %mm3, %mm5
        packuswb %mm5, %mm5
        movd %mm5, (%edi, %ecx, 4)

        /*\ while (++x) \*/
        incl %ecx
        jnz .up_up_loop1_x
        jmp .up_up_yap_end
.up_up_yap_0:

.up_up_loop2_x:
        /*\ %esi = *yp + xp[x] \*/
        movl yp, %eax
        movl (%eax), %esi
        movl xp, %eax
        movl (%eax, %ecx, 4), %eax
        leal (%esi, %eax, 4), %esi

        /*\ %eax = xap[x] << 4 \*/
        movl xap, %eax
        movl (%eax, %ecx, 4), %eax
        sall $4, %eax
        jz .up_up_0

        /*\ %mm0 = xap[x] << 4 \*/
        movd %eax, %mm0
        punpcklwd %mm0, %mm0
        punpckldq %mm0, %mm0

        /*\ Load and unpack two pixels in parralel
        |*| %mm2 = ptr[0], %mm3 = ptr[1]
        \*/
        movq (%esi), %mm2
        movq %mm2, %mm3
        punpcklbw %mm7, %mm2
        punpckhbw %mm7, %mm3

        /*\ X interpolation: r = l + (r - l) * xap \*/
        psubw %mm2, %mm3
        psllw $4, %mm3
        pmulhw %mm0, %mm3
        paddw %mm2, %mm3
        packuswb %mm3, %mm3
        movd %mm3, (%edi, %ecx, 4)
        jmp .up_up_1
.up_up_0:
        /*\ dptr[x] = *sptr \*/
        movl (%esi), %eax
        movl %eax, (%edi, %ecx, 4)
.up_up_1:
        incl %ecx
        jnz .up_up_loop2_x

.up_up_yap_end:
        /*\ dptr += dow \*/
        movl dow, %eax
        leal (%edi, %eax, 4), %edi
        /*\ yap++; yp++ \*/
        addl $4, yap
        addl $4, yp
        /*\ while (y--) \*/
        decl y
        jnz .up_up_loop_y

        jmp .scale_leave


/*\ Scaling down vertically \*/

.scale_x_up_y_down:
        /*\ sow_4 = sow * 4 \*/
        movl sow, %eax
        sall $2, %eax
        movl %eax, sow_4

.up_down_loop_y:

        /*\ Setup My and Cy \*/
        movl yap, %eax
        movzwl (%eax), %ebx
        movl %ebx, My
        movzwl 2(%eax), %eax
        movl %eax, Cy

        /*\ mm4 = Cy \*/
        movd %eax, %mm4
        punpcklwd %mm4, %mm4
        punpckldq %mm4, %mm4
        /*\ mm5 = My \*/
        movd %ebx, %mm5
        punpcklwd %mm5, %mm5
        punpckldq %mm5, %mm5

        /*\ x = -dw \*/
        movl dw, %ecx
        negl %ecx
.up_down_loop_x:
        /*\ %esi = *yp + xp[x] \*/
        movl yp, %eax
        movl (%eax), %esi
        movl xp, %eax
        movl (%eax, %ecx, 4), %eax
        leal (%esi, %eax, 4), %esi

        movl %esi, %eax
        /*\ v = (*p * My) >> 10 \*/
        movd (%eax), %mm0
        punpcklbw %mm7, %mm0
        psllw $6, %mm0
        pmulhw %mm5, %mm0
        
        /*\ i = 0x4000 - My \*/
        movl $0x4000, %ebx
        subl My, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ p += sow; v += (*p * Cy) >> 10 \*/
        addl sow_4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm4, %mm1
        paddw %mm1, %mm0
        
        /*\ i -= Cy; while (i > Cy) \*/
        subl Cy, %ebx
2:
        cmpl Cy, %ebx
        jg 1b
        
        /*\ mm6 = i \*/
        movd %ebx, %mm6
        punpcklwd %mm6, %mm6
        punpckldq %mm6, %mm6
        
        /*\ p += sow; v += (*p * i) >> 10 \*/
        addl sow_4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm6, %mm1
        paddw %mm1, %mm0
5:
        /*\ %eax = xap[x] << 5 \*/
        movl xap, %eax
        movl (%eax, %ecx, 4), %eax
        sall $5, %eax
        jz 6f
        /*\ mm3 = xap[x] << 5 \*/
        movd %eax, %mm3
        punpcklwd %mm3, %mm3
        punpckldq %mm3, %mm3
        
        /*\ p + 1 \*/
        movl %esi, %eax
        addl $4, %eax
        /*\ vv = (*p * My) >> 10 \*/
        movd (%eax), %mm2
        punpcklbw %mm7, %mm2
        psllw $6, %mm2
        pmulhw %mm5, %mm2
        
        /*\ i = 0x4000 - My \*/
        movl $0x4000, %ebx
        subl My, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ p += sow; vv += (*p * Cy) >> 10 \*/
        addl sow_4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm4, %mm1
        paddw %mm1, %mm2
        
        /*\ i -= Cy; while (i > Cy) \*/
        subl Cy, %ebx
2:
        cmpl Cy, %ebx
        jg 1b
        
        /*\ p += sow; v += (*p * i) >> 10 \*/
        addl sow_4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm6, %mm1
        paddw %mm1, %mm2
5:
        /*\ v = v + (vv - v) * xap \*/
        psubw %mm0, %mm2
        psllw $3, %mm2
        pmulhw %mm3, %mm2
        paddw %mm2, %mm0
6:
        /*\ dest[x] = v >> 4 \*/
        psrlw $4, %mm0
        packuswb %mm0, %mm0
        movd %mm0, (%edi, %ecx, 4)

        /*\ while (++x) \*/
        incl %ecx
        jnz .up_down_loop_x

        /*\ dptr += dow \*/
        movl dow, %eax
        leal (%edi, %eax, 4), %edi
        /*\ yap++; yp++ \*/
        addl $4, yap
        addl $4, yp
        /*\ while (y--) \*/
        decl y
        jnz .up_down_loop_y

        jmp .scale_leave

.scale_x_down:
        /*\ Test yup bit \*/
        sarl $1, %eax
        jnc .scale_x_down_y_down


/*\ Scaling down horizontally \*/

.scale_x_down_y_up:
        /*\ sow_4 = sow * 4 \*/
        movl sow, %eax
        sall $2, %eax
        movl %eax, sow_4

.down_up_loop_y:

        /*\ %eax = *yap << 5 \*/
        movl yap, %eax
        movl (%eax), %eax
        sall $5, %eax
        /*\ mm3 = *yap << 5 \*/
        movd %eax, %mm3
        punpcklwd %mm3, %mm3
        punpckldq %mm3, %mm3
        
        /*\ x = -dw \*/
        movl dw, %ecx
        negl %ecx
.down_up_loop_x:
        /*\ %esi = *yp + xp[x] \*/
        movl yp, %eax
        movl (%eax), %esi
        movl xp, %eax
        movl (%eax, %ecx, 4), %eax
        leal (%esi, %eax, 4), %esi

        /*\ Setup Mx and Cx \*/
        movl xap, %eax
        movzwl (%eax, %ecx, 4), %ebx
        movl %ebx, Mx
        movzwl 2(%eax, %ecx, 4), %eax
        movl %eax, Cx

        /*\ mm4 = Cx \*/
        movd %eax, %mm4
        punpcklwd %mm4, %mm4
        punpckldq %mm4, %mm4
        /*\ mm5 = Mx \*/
        movd %ebx, %mm5
        punpcklwd %mm5, %mm5
        punpckldq %mm5, %mm5

        movl %esi, %eax
        /*\ v = (*p * Mx) >> 10 \*/
        movd (%eax), %mm0
        punpcklbw %mm7, %mm0
        psllw $6, %mm0
        pmulhw %mm5, %mm0
        
        /*\ i = 0x4000 - Mx \*/
        movl $0x4000, %ebx
        subl Mx, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ p += sow; v += (*p * Cx) >> 10 \*/
        addl $4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm4, %mm1
        paddw %mm1, %mm0
        
        /*\ i -= Cx; while (i > Cx) \*/
        subl Cx, %ebx
2:
        cmpl Cx, %ebx
        jg 1b
        
        /*\ mm6 = i \*/
        movd %ebx, %mm6
        punpcklwd %mm6, %mm6
        punpckldq %mm6, %mm6
        
        /*\ p += sow; v += (*p * i) >> 10 \*/
        addl $4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm6, %mm1
        paddw %mm1, %mm0
5:
        movd %mm3, %eax
        testl %eax, %eax
        jz 6f
        /*\ p + sow \*/
        movl %esi, %eax
        addl sow_4, %eax
        /*\ vv = (*p * Mx) >> 10 \*/
        movd (%eax), %mm2
        punpcklbw %mm7, %mm2
        psllw $6, %mm2
        pmulhw %mm5, %mm2
        
        /*\ i = 0x4000 - Mx \*/
        movl $0x4000, %ebx
        subl Mx, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ p += sow; vv += (*p * Cx) >> 10 \*/
        addl $4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm4, %mm1
        paddw %mm1, %mm2
        
        /*\ i -= Cx; while (i > Cx) \*/
        subl Cx, %ebx
2:
        cmpl Cx, %ebx
        jg 1b
        
        /*\ p += sow; v += (*p * i) >> 10 \*/
        addl $4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $6, %mm1
        pmulhw %mm6, %mm1
        paddw %mm1, %mm2
5:
        /*\ v = v + (vv - v) * yap \*/
        psubw %mm0, %mm2
        psllw $3, %mm2
        pmulhw %mm3, %mm2
        paddw %mm2, %mm0
6:
        /*\ dest[x] = v >> 4 \*/
        psrlw $4, %mm0
        packuswb %mm0, %mm0
        movd %mm0, (%edi, %ecx, 4)

        /*\ while (++x) \*/
        incl %ecx
        jnz .down_up_loop_x

        /*\ dptr += dow \*/
        movl dow, %eax
        leal (%edi, %eax, 4), %edi
        /*\ yap++; yp++ \*/
        addl $4, yap
        addl $4, yp
        /*\ while (y--) \*/
        decl y
        jnz .down_up_loop_y

        jmp .scale_leave


/*\ Scaling down both ways \*/

.scale_x_down_y_down:
        /*\ sow_4 = sow * 4 \*/
        movl sow, %eax
        sall $2, %eax
        movl %eax, sow_4

.down_down_loop_y:

        /*\ Setup My and Cy \*/
        movl yap, %eax
        movzwl (%eax), %ebx
        movl %ebx, My
        movzwl 2(%eax), %eax
        movl %eax, Cy

        /*\ x = -dw \*/
        movl dw, %ecx
        negl %ecx
.down_down_loop_x:
        /*\ %esi = *yp + xp[x] \*/
        movl yp, %eax
        movl (%eax), %esi
        movl xp, %eax
        movl (%eax, %ecx, 4), %eax
        leal (%esi, %eax, 4), %esi

        /*\ Setup Mx and Cx \*/
        movl xap, %eax
        movzwl (%eax, %ecx, 4), %ebx
        movl %ebx, Mx
        movzwl 2(%eax, %ecx, 4), %eax
        movl %eax, Cx

        /*\ mm3 = Cx \*/
        movd %eax, %mm3
        punpcklwd %mm3, %mm3
        punpckldq %mm3, %mm3
        /*\ mm5 = Mx \*/
        movd %ebx, %mm5
        punpcklwd %mm5, %mm5
        punpckldq %mm5, %mm5
        
        /*\ p = sptr; v = (*p * Mx) >> 9 \*/
        movl %esi, %eax
        movd (%eax), %mm0
        punpcklbw %mm7, %mm0
        psllw $7, %mm0
        pmulhw %mm5, %mm0
        
        /*\ i = 0x4000 - Mx \*/
        movl $0x4000, %ebx
        subl Mx, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ v += (*++p * Cx) >> 9 \*/
        addl $4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $7, %mm1
        pmulhw %mm3, %mm1
        paddw %mm1, %mm0
        
        /*\ i -= Cx; while (i > Cx) \*/
        subl Cx, %ebx
2:
        cmpl Cx, %ebx
        jg 1b
        
        /*\ mm6 = i \*/
        movd %ebx, %mm6
        punpcklwd %mm6, %mm6
        punpckldq %mm6, %mm6
        
        /*\ v += (*++p * i) >> 9 \*/
        addl $4, %eax
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $7, %mm1
        pmulhw %mm6, %mm1
        paddw %mm1, %mm0
5:
        /*\ v *= My \*/
        movd My, %mm4
        punpcklwd %mm4, %mm4
        punpckldq %mm4, %mm4
        psllw $2, %mm0
        pmulhw %mm4, %mm0
        
        /*\ j = 0x4000 - My \*/
        movl $0x4000, %edx
        subl My, %edx
        jbe 6f
        jmp 4f
3:
        /*\ sptr += sow; p = sptr \*/
        addl sow_4, %esi
        movl %esi, %eax
        /*\ vx = (*p * Mx) >> 9 \*/
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $7, %mm1
        pmulhw %mm5, %mm1
        
        /*\ i = 0x4000 - Mx \*/
        movl $0x4000, %ebx
        subl Mx, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ vx += (*++p * Cx) >> 9 \*/
        addl $4, %eax
        movd (%eax), %mm2
        punpcklbw %mm7, %mm2
        psllw $7, %mm2
        pmulhw %mm3, %mm2
        paddw %mm2, %mm1
        
        /*\ i -= Cx; while (i > Cx) \*/
        subl Cx, %ebx
2:
        cmpl Cx, %ebx
        jg 1b
        
        /*\ vx += (*++p * i) >> 9 \*/
        addl $4, %eax
        movd (%eax), %mm2
        punpcklbw %mm7, %mm2
        psllw $7, %mm2
        pmulhw %mm6, %mm2
        paddw %mm2, %mm1
5:
        /*\ v += (vx * Cy) >> 14 \*/
        movd Cy, %mm4
        punpcklwd %mm4, %mm4
        punpckldq %mm4, %mm4
        psllw $2, %mm1
        pmulhw %mm4, %mm1
        paddw %mm1, %mm0
        
        /*\ j -= Cy; while (j > Cy) \*/
        subl Cy, %edx
4:
        cmpl Cy, %edx
        jg 3b
        
        /*\ sptr += sow; p = sptr \*/
        addl sow_4, %esi
        movl %esi, %eax
        /*\ vx = (*p * Mx) >> 9 \*/
        movd (%eax), %mm1
        punpcklbw %mm7, %mm1
        psllw $7, %mm1
        pmulhw %mm5, %mm1
        
        /*\ i = 0x4000 - Mx \*/
        movl $0x4000, %ebx
        subl Mx, %ebx
        jbe 5f
        jmp 2f
1:
        /*\ vx += (*++p * Cx) >> 9 \*/
        addl $4, %eax
        movd (%eax), %mm2
        punpcklbw %mm7, %mm2
        psllw $7, %mm2
        pmulhw %mm3, %mm2
        paddw %mm2, %mm1
        
        /*\ i -= Cx; while (i > Cx) \*/
        subl Cx, %ebx
2:
        cmpl Cx, %ebx
        jg 1b
        
        /*\ vx += (*++p * i) >> 9 \*/
        addl $4, %eax
        movd (%eax), %mm2
        punpcklbw %mm7, %mm2
        psllw $7, %mm2
        pmulhw %mm6, %mm2
        paddw %mm2, %mm1
5:
        /*\ v += (vx * j) >> 14 \*/
        movd %edx, %mm4
        punpcklwd %mm4, %mm4
        punpckldq %mm4, %mm4
        psllw $2, %mm1
        pmulhw %mm4, %mm1
        paddw %mm1, %mm0
6:
        /*\ dptr[x] = mm0 >> 5 \*/
        psrlw $5, %mm0
        packuswb %mm0, %mm0
        movd %mm0, (%edi, %ecx, 4)

        /*\ while (++x) \*/
        incl %ecx
        jnz .down_down_loop_x

        /*\ dptr += dow \*/
        movl dow, %eax
        leal (%edi, %eax, 4), %edi
        /*\ yap++; yp++ \*/
        addl $4, yap
        addl $4, yp
        /*\ while (y--) \*/
        decl y
        jnz .down_down_loop_y

        jmp .scale_leave

.scale_leave:
        emms
        popl %esi
        popl %edi
        popl %edx
        popl %ecx
        popl %ebx
        movl %ebp, %esp
        popl %ebp
        ret

SIZE(imlib_Scale_mmx_AARGBA)

#endif

#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif

/* [<][>][^][v][top][bottom][index][help] */