root/src/pkg/runtime/asm_amd64p32.s

/* [<][>][^][v][top][bottom][index][help] */
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "zasm_GOOS_GOARCH.h"
#include "funcdata.h"
#include "../../cmd/ld/textflag.h"

TEXT _rt0_go(SB),NOSPLIT,$0
        // copy arguments forward on an even stack
        MOVL    argc+0(FP), AX
        MOVL    argv+4(FP), BX
        MOVL    SP, CX
        SUBL    $128, SP                // plenty of scratch
        ANDL    $~15, CX
        MOVL    CX, SP

        MOVL    AX, 16(SP)
        MOVL    BX, 24(SP)
        
        // create istack out of the given (operating system) stack.
        MOVL    $runtime·g0(SB), DI
        LEAL    (-64*1024+104)(SP), DI
        MOVL    BX, g_stackguard(DI)
        MOVL    BX, g_stackguard0(DI)
        MOVL    SP, g_stackbase(DI)

        // find out information about the processor we're on
        MOVQ    $0, AX
        CPUID
        CMPQ    AX, $0
        JE      nocpuinfo
        MOVQ    $1, AX
        CPUID
        MOVL    CX, runtime·cpuid_ecx(SB)
        MOVL    DX, runtime·cpuid_edx(SB)
nocpuinfo:      
        
needtls:
        LEAL    runtime·tls0(SB), DI
        CALL    runtime·settls(SB)

        // store through it, to make sure it works
        get_tls(BX)
        MOVQ    $0x123, g(BX)
        MOVQ    runtime·tls0(SB), AX
        CMPQ    AX, $0x123
        JEQ 2(PC)
        MOVL    AX, 0   // abort
ok:
        // set the per-goroutine and per-mach "registers"
        get_tls(BX)
        LEAL    runtime·g0(SB), CX
        MOVL    CX, g(BX)
        LEAL    runtime·m0(SB), AX
        MOVL    AX, m(BX)

        // save m->g0 = g0
        MOVL    CX, m_g0(AX)

        CLD                             // convention is D is always left cleared
        CALL    runtime·check(SB)

        MOVL    16(SP), AX              // copy argc
        MOVL    AX, 0(SP)
        MOVL    24(SP), AX              // copy argv
        MOVL    AX, 4(SP)
        CALL    runtime·args(SB)
        CALL    runtime·osinit(SB)
        CALL    runtime·hashinit(SB)
        CALL    runtime·schedinit(SB)

        // create a new goroutine to start program
        MOVL    $runtime·main·f(SB), AX       // entry
        MOVL    $0, 0(SP)
        MOVL    AX, 4(SP)
        ARGSIZE(8)
        CALL    runtime·newproc(SB)
        ARGSIZE(-1)

        // start this M
        CALL    runtime·mstart(SB)

        MOVL    $0xf1, 0xf1  // crash
        RET

DATA    runtime·main·f+0(SB)/4,$runtime·main(SB)
GLOBL   runtime·main·f(SB),RODATA,$4

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
        INT $3
        RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
        // No per-thread init.
        RET

/*
 *  go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-4
        MOVL    b+0(FP), AX     // gobuf
        LEAL    b+0(FP), BX     // caller's SP
        MOVL    BX, gobuf_sp(AX)
        MOVL    0(SP), BX               // caller's PC
        MOVL    BX, gobuf_pc(AX)
        MOVL    $0, gobuf_ctxt(AX)
        MOVQ    $0, gobuf_ret(AX)
        get_tls(CX)
        MOVL    g(CX), BX
        MOVL    BX, gobuf_g(AX)
        RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-4
        MOVL    b+0(FP), BX             // gobuf
        MOVL    gobuf_g(BX), DX
        MOVL    0(DX), CX               // make sure g != nil
        get_tls(CX)
        MOVL    DX, g(CX)
        MOVL    gobuf_sp(BX), SP        // restore SP
        MOVL    gobuf_ctxt(BX), DX
        MOVQ    gobuf_ret(BX), AX
        MOVL    $0, gobuf_sp(BX)        // clear to help garbage collector
        MOVQ    $0, gobuf_ret(BX)
        MOVL    $0, gobuf_ctxt(BX)
        MOVL    gobuf_pc(BX), BX
        JMP     BX

// void mcall(void (*fn)(G*))
// Switch to m->g0's stack, call fn(g).
// Fn must never return.  It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-4
        MOVL    fn+0(FP), DI
        
        get_tls(CX)
        MOVL    g(CX), AX       // save state in g->sched
        MOVL    0(SP), BX       // caller's PC
        MOVL    BX, (g_sched+gobuf_pc)(AX)
        LEAL    fn+0(FP), BX    // caller's SP
        MOVL    BX, (g_sched+gobuf_sp)(AX)
        MOVL    AX, (g_sched+gobuf_g)(AX)

        // switch to m->g0 & its stack, call fn
        MOVL    m(CX), BX
        MOVL    m_g0(BX), SI
        CMPL    SI, AX  // if g == m->g0 call badmcall
        JNE     3(PC)
        MOVL    $runtime·badmcall(SB), AX
        JMP     AX
        MOVL    SI, g(CX)       // g = m->g0
        MOVL    (g_sched+gobuf_sp)(SI), SP      // sp = m->g0->sched.sp
        PUSHQ   AX
        ARGSIZE(8)
        CALL    DI
        POPQ    AX
        MOVL    $runtime·badmcall2(SB), AX
        JMP     AX
        RET

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
// Caller has already done get_tls(CX); MOVQ m(CX), BX.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
        // Cannot grow scheduler stack (m->g0).
        MOVL    m_g0(BX), SI
        CMPL    g(CX), SI
        JNE     2(PC)
        MOVL    0, AX

        // Called from f.
        // Set m->morebuf to f's caller.
        MOVL    8(SP), AX       // f's caller's PC
        MOVL    AX, (m_morebuf+gobuf_pc)(BX)
        LEAL    16(SP), AX      // f's caller's SP
        MOVL    AX, (m_morebuf+gobuf_sp)(BX)
        MOVL    AX, m_moreargp(BX)
        get_tls(CX)
        MOVL    g(CX), SI
        MOVL    SI, (m_morebuf+gobuf_g)(BX)

        // Set g->sched to context in f.
        MOVL    0(SP), AX // f's PC
        MOVL    AX, (g_sched+gobuf_pc)(SI)
        MOVL    SI, (g_sched+gobuf_g)(SI)
        LEAL    8(SP), AX // f's SP
        MOVL    AX, (g_sched+gobuf_sp)(SI)
        MOVL    DX, (g_sched+gobuf_ctxt)(SI)

        // Call newstack on m->g0's stack.
        MOVL    m_g0(BX), BX
        MOVL    BX, g(CX)
        MOVL    (g_sched+gobuf_sp)(BX), SP
        CALL    runtime·newstack(SB)
        MOVL    $0, 0x1003      // crash if newstack returns
        RET

// Called from panic.  Mimics morestack,
// reuses stack growth code to create a frame
// with the desired args running the desired function.
//
// func call(fn *byte, arg *byte, argsize uint32).
TEXT runtime·newstackcall(SB), NOSPLIT, $0-20
        get_tls(CX)
        MOVL    m(CX), BX

        // Save our caller's state as the PC and SP to
        // restore when returning from f.
        MOVL    0(SP), AX       // our caller's PC
        MOVL    AX, (m_morebuf+gobuf_pc)(BX)
        LEAL    8(SP), AX       // our caller's SP
        MOVL    AX, (m_morebuf+gobuf_sp)(BX)
        MOVL    g(CX), AX
        MOVL    AX, (m_morebuf+gobuf_g)(BX)
        
        // Save our own state as the PC and SP to restore
        // if this goroutine needs to be restarted.
        MOVL    $runtime·newstackcall(SB), DI
        MOVL    DI, (g_sched+gobuf_pc)(AX)
        MOVL    SP, (g_sched+gobuf_sp)(AX)

        // Set up morestack arguments to call f on a new stack.
        // We set f's frame size to 1, as a hint to newstack
        // that this is a call from runtime·newstackcall.
        // If it turns out that f needs a larger frame than
        // the default stack, f's usual stack growth prolog will
        // allocate a new segment (and recopy the arguments).
        MOVL    8(SP), AX       // fn
        MOVL    12(SP), DX      // arg frame
        MOVL    16(SP), CX      // arg size

        MOVQ    AX, m_cret(BX)  // f's PC
        MOVL    DX, m_moreargp(BX)      // argument frame pointer
        MOVL    CX, m_moreargsize(BX)   // f's argument size
        MOVL    $1, m_moreframesize(BX) // f's frame size

        // Call newstack on m->g0's stack.
        MOVL    m_g0(BX), BX
        get_tls(CX)
        MOVL    BX, g(CX)
        MOVL    (g_sched+gobuf_sp)(BX), SP
        CALL    runtime·newstack(SB)
        MOVL    $0, 0x1103      // crash if newstack returns
        RET

// reflect·call: call a function with the given argument list
// func call(f *FuncVal, arg *byte, argsize uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)          \
        CMPL    CX, $MAXSIZE;           \
        JA      3(PC);                  \
        MOVL    $runtime·NAME(SB), AX; \
        JMP     AX
// Note: can't just "JMP runtime·NAME(SB)" - bad inlining results.

TEXT reflect·call(SB), NOSPLIT, $0-20
        MOVLQZX argsize+8(FP), CX
        DISPATCH(call16, 16)
        DISPATCH(call32, 32)
        DISPATCH(call64, 64)
        DISPATCH(call128, 128)
        DISPATCH(call256, 256)
        DISPATCH(call512, 512)
        DISPATCH(call1024, 1024)
        DISPATCH(call2048, 2048)
        DISPATCH(call4096, 4096)
        DISPATCH(call8192, 8192)
        DISPATCH(call16384, 16384)
        DISPATCH(call32768, 32768)
        DISPATCH(call65536, 65536)
        DISPATCH(call131072, 131072)
        DISPATCH(call262144, 262144)
        DISPATCH(call524288, 524288)
        DISPATCH(call1048576, 1048576)
        DISPATCH(call2097152, 2097152)
        DISPATCH(call4194304, 4194304)
        DISPATCH(call8388608, 8388608)
        DISPATCH(call16777216, 16777216)
        DISPATCH(call33554432, 33554432)
        DISPATCH(call67108864, 67108864)
        DISPATCH(call134217728, 134217728)
        DISPATCH(call268435456, 268435456)
        DISPATCH(call536870912, 536870912)
        DISPATCH(call1073741824, 1073741824)
        MOVL    $runtime·badreflectcall(SB), AX
        JMP     AX

#define CALLFN(NAME,MAXSIZE)                    \
TEXT runtime·NAME(SB), WRAPPER, $MAXSIZE-16;           \
        /* copy arguments to stack */           \
        MOVL    argptr+4(FP), SI;               \
        MOVL    argsize+8(FP), CX;              \
        MOVL    SP, DI;                         \
        REP;MOVSB;                              \
        /* call function */                     \
        MOVL    f+0(FP), DX;                    \
        MOVL    (DX), AX;                               \
        CALL    AX; \
        /* copy return values back */           \
        MOVL    argptr+4(FP), DI;               \
        MOVL    argsize+8(FP), CX;              \
        MOVL    retoffset+12(FP), BX;           \
        MOVL    SP, SI;                         \
        ADDL    BX, DI;                         \
        ADDL    BX, SI;                         \
        SUBL    BX, CX;                         \
        REP;MOVSB;                              \
        RET

CALLFN(call16, 16)
CALLFN(call32, 32)
CALLFN(call64, 64)
CALLFN(call128, 128)
CALLFN(call256, 256)
CALLFN(call512, 512)
CALLFN(call1024, 1024)
CALLFN(call2048, 2048)
CALLFN(call4096, 4096)
CALLFN(call8192, 8192)
CALLFN(call16384, 16384)
CALLFN(call32768, 32768)
CALLFN(call65536, 65536)
CALLFN(call131072, 131072)
CALLFN(call262144, 262144)
CALLFN(call524288, 524288)
CALLFN(call1048576, 1048576)
CALLFN(call2097152, 2097152)
CALLFN(call4194304, 4194304)
CALLFN(call8388608, 8388608)
CALLFN(call16777216, 16777216)
CALLFN(call33554432, 33554432)
CALLFN(call67108864, 67108864)
CALLFN(call134217728, 134217728)
CALLFN(call268435456, 268435456)
CALLFN(call536870912, 536870912)
CALLFN(call1073741824, 1073741824)

// Return point when leaving stack.
//
// Lessstack can appear in stack traces for the same reason
// as morestack; in that context, it has 0 arguments.
TEXT runtime·lessstack(SB), NOSPLIT, $0-0
        // Save return value in m->cret
        get_tls(CX)
        MOVL    m(CX), BX
        MOVQ    AX, m_cret(BX)  // MOVQ, to save all 64 bits

        // Call oldstack on m->g0's stack.
        MOVL    m_g0(BX), BX
        MOVL    BX, g(CX)
        MOVL    (g_sched+gobuf_sp)(BX), SP
        CALL    runtime·oldstack(SB)
        MOVL    $0, 0x1004      // crash if oldstack returns
        RET

// morestack trampolines
TEXT runtime·morestack00(SB),NOSPLIT,$0
        get_tls(CX)
        MOVL    m(CX), BX
        MOVQ    $0, AX
        MOVQ    AX, m_moreframesize(BX)
        MOVL    $runtime·morestack(SB), AX
        JMP     AX

TEXT runtime·morestack01(SB),NOSPLIT,$0
        get_tls(CX)
        MOVL    m(CX), BX
        SHLQ    $32, AX
        MOVQ    AX, m_moreframesize(BX)
        MOVL    $runtime·morestack(SB), AX
        JMP     AX

TEXT runtime·morestack10(SB),NOSPLIT,$0
        get_tls(CX)
        MOVL    m(CX), BX
        MOVLQZX AX, AX
        MOVQ    AX, m_moreframesize(BX)
        MOVL    $runtime·morestack(SB), AX
        JMP     AX

TEXT runtime·morestack11(SB),NOSPLIT,$0
        get_tls(CX)
        MOVL    m(CX), BX
        MOVQ    AX, m_moreframesize(BX)
        MOVL    $runtime·morestack(SB), AX
        JMP     AX

// subcases of morestack01
// with const of 8,16,...48
TEXT runtime·morestack8(SB),NOSPLIT,$0
        MOVQ    $1, R8
        MOVL    $morestack<>(SB), AX
        JMP     AX

TEXT runtime·morestack16(SB),NOSPLIT,$0
        MOVQ    $2, R8
        MOVL    $morestack<>(SB), AX
        JMP     AX

TEXT runtime·morestack24(SB),NOSPLIT,$0
        MOVQ    $3, R8
        MOVL    $morestack<>(SB), AX
        JMP     AX

TEXT runtime·morestack32(SB),NOSPLIT,$0
        MOVQ    $4, R8
        MOVL    $morestack<>(SB), AX
        JMP     AX

TEXT runtime·morestack40(SB),NOSPLIT,$0
        MOVQ    $5, R8
        MOVL    $morestack<>(SB), AX
        JMP     AX

TEXT runtime·morestack48(SB),NOSPLIT,$0
        MOVQ    $6, R8
        MOVL    $morestack<>(SB), AX
        JMP     AX

TEXT morestack<>(SB),NOSPLIT,$0
        get_tls(CX)
        MOVL    m(CX), BX
        SHLQ    $35, R8
        MOVQ    R8, m_moreframesize(BX)
        MOVL    $runtime·morestack(SB), AX
        JMP     AX

TEXT runtime·morestack00_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack00(SB)

TEXT runtime·morestack01_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack01(SB)

TEXT runtime·morestack10_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack10(SB)

TEXT runtime·morestack11_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack11(SB)

TEXT runtime·morestack8_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack8(SB)

TEXT runtime·morestack16_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack16(SB)

TEXT runtime·morestack24_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack24(SB)

TEXT runtime·morestack32_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack32(SB)

TEXT runtime·morestack40_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack40(SB)

TEXT runtime·morestack48_noctxt(SB),NOSPLIT,$0
        MOVL    $0, DX
        JMP     runtime·morestack48(SB)

// bool cas(int32 *val, int32 old, int32 new)
// Atomically:
//      if(*val == old){
//              *val = new;
//              return 1;
//      } else
//              return 0;
TEXT runtime·cas(SB), NOSPLIT, $0-12
        MOVL    val+0(FP), BX
        MOVL    old+4(FP), AX
        MOVL    new+8(FP), CX
        LOCK
        CMPXCHGL        CX, 0(BX)
        JZ 3(PC)
        MOVL    $0, AX
        RET
        MOVL    $1, AX
        RET

// bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
//      if(*val == *old){
//              *val = new;
//              return 1;
//      } else {
//              return 0;
//      }
TEXT runtime·cas64(SB), NOSPLIT, $0-24
        MOVL    val+0(FP), BX
        MOVQ    old+8(FP), AX
        MOVQ    new+16(FP), CX
        LOCK
        CMPXCHGQ        CX, 0(BX)
        JNZ     cas64_fail
        MOVL    $1, AX
        RET
cas64_fail:
        MOVL    $0, AX
        RET

// bool casp(void **val, void *old, void *new)
// Atomically:
//      if(*val == old){
//              *val = new;
//              return 1;
//      } else
//              return 0;
TEXT runtime·casp(SB), NOSPLIT, $0-12
        MOVL    val+0(FP), BX
        MOVL    old+4(FP), AX
        MOVL    new+8(FP), CX
        LOCK
        CMPXCHGL        CX, 0(BX)
        JZ 3(PC)
        MOVL    $0, AX
        RET
        MOVL    $1, AX
        RET

// uint32 xadd(uint32 volatile *val, int32 delta)
// Atomically:
//      *val += delta;
//      return *val;
TEXT runtime·xadd(SB), NOSPLIT, $0-8
        MOVL    val+0(FP), BX
        MOVL    delta+4(FP), AX
        MOVL    AX, CX
        LOCK
        XADDL   AX, 0(BX)
        ADDL    CX, AX
        RET

TEXT runtime·xadd64(SB), NOSPLIT, $0-16
        MOVL    val+0(FP), BX
        MOVQ    delta+8(FP), AX
        MOVQ    AX, CX
        LOCK
        XADDQ   AX, 0(BX)
        ADDQ    CX, AX
        RET

TEXT runtime·xchg(SB), NOSPLIT, $0-8
        MOVL    val+0(FP), BX
        MOVL    new+4(FP), AX
        XCHGL   AX, 0(BX)
        RET

TEXT runtime·xchg64(SB), NOSPLIT, $0-16
        MOVL    val+0(FP), BX
        MOVQ    new+8(FP), AX
        XCHGQ   AX, 0(BX)
        RET

TEXT runtime·procyield(SB),NOSPLIT,$0-0
        MOVL    val+0(FP), AX
again:
        PAUSE
        SUBL    $1, AX
        JNZ     again
        RET

TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8
        MOVL    ptr+0(FP), BX
        MOVL    val+4(FP), AX
        XCHGL   AX, 0(BX)
        RET

TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
        MOVL    ptr+0(FP), BX
        MOVL    val+4(FP), AX
        XCHGL   AX, 0(BX)
        RET

TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
        MOVL    ptr+0(FP), BX
        MOVQ    val+8(FP), AX
        XCHGQ   AX, 0(BX)
        RET

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the callers return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
        MOVL    fn+0(FP), DX
        MOVL    callersp+4(FP), BX
        LEAL    -8(BX), SP      // caller sp after CALL
        SUBL    $5, (SP)        // return to CALL again
        MOVL    0(DX), BX
        JMP     BX      // but first run the deferred function

// asmcgocall(void(*fn)(void*), void *arg)
// Not implemented.
TEXT runtime·asmcgocall(SB),NOSPLIT,$0-8
        MOVL    0, AX
        RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Not implemented.
TEXT runtime·cgocallback(SB),NOSPLIT,$0-12
        MOVL    0, AX
        RET

// void setmg(M*, G*); set m and g. for use by needm.
// Not implemented.
TEXT runtime·setmg(SB), NOSPLIT, $0-8
        MOVL    0, AX
        RET

// check that SP is in range [g->stackbase, g->stackguard)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
        get_tls(CX)
        MOVL    g(CX), AX
        CMPL    g_stackbase(AX), SP
        JHI     2(PC)
        MOVL    0, AX
        CMPL    SP, g_stackguard(AX)
        JHI     2(PC)
        MOVL    0, AX
        RET

TEXT runtime·memclr(SB),NOSPLIT,$0-8
        MOVL    addr+0(FP), DI
        MOVL    count+4(FP), CX
        MOVQ    CX, BX
        ANDQ    $7, BX
        SHRQ    $3, CX
        MOVQ    $0, AX
        CLD
        REP
        STOSQ
        MOVQ    BX, CX
        REP
        STOSB
        RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$0-8
        MOVL    x+0(FP),AX              // addr of first arg
        MOVL    -8(AX),AX               // get calling pc
        RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$0-16
        MOVL    x+0(FP),AX              // addr of first arg
        MOVL    pc+4(FP), BX            // pc to set
        MOVQ    BX, -8(AX)              // set calling pc
        RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-8
        MOVL    sp+0(FP), AX
        RET

// int64 runtime·cputicks(void)
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
        RDTSC
        SHLQ    $32, DX
        ADDQ    DX, AX
        RET

TEXT runtime·stackguard(SB),NOSPLIT,$0-16
        MOVL    SP, DX
        MOVL    DX, sp+0(FP)
        get_tls(CX)
        MOVL    g(CX), BX
        MOVL    g_stackguard(BX), DX
        MOVL    DX, limit+4(FP)
        RET

GLOBL runtime·tls0(SB), $64

// hash function using AES hardware instructions
// For now, our one amd64p32 system (NaCl) does not
// support using AES instructions, so have not bothered to
// write the implementations. Can copy and adjust the ones
// in asm_amd64.s when the time comes.

TEXT runtime·aeshash(SB),NOSPLIT,$0-24
        RET

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
        RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
        RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
        RET

TEXT runtime·memeq(SB),NOSPLIT,$0-12
        MOVL    a+0(FP), SI
        MOVL    b+4(FP), DI
        MOVL    count+8(FP), BX
        JMP     runtime·memeqbody(SB)

// a in SI
// b in DI
// count in BX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
        XORQ    AX, AX

        CMPQ    BX, $8
        JB      small
        
        // 64 bytes at a time using xmm registers
hugeloop:
        CMPQ    BX, $64
        JB      bigloop
        MOVOU   (SI), X0
        MOVOU   (DI), X1
        MOVOU   16(SI), X2
        MOVOU   16(DI), X3
        MOVOU   32(SI), X4
        MOVOU   32(DI), X5
        MOVOU   48(SI), X6
        MOVOU   48(DI), X7
        PCMPEQB X1, X0
        PCMPEQB X3, X2
        PCMPEQB X5, X4
        PCMPEQB X7, X6
        PAND    X2, X0
        PAND    X6, X4
        PAND    X4, X0
        PMOVMSKB X0, DX
        ADDQ    $64, SI
        ADDQ    $64, DI
        SUBQ    $64, BX
        CMPL    DX, $0xffff
        JEQ     hugeloop
        RET

        // 8 bytes at a time using 64-bit register
bigloop:
        CMPQ    BX, $8
        JBE     leftover
        MOVQ    (SI), CX
        MOVQ    (DI), DX
        ADDQ    $8, SI
        ADDQ    $8, DI
        SUBQ    $8, BX
        CMPQ    CX, DX
        JEQ     bigloop
        RET

        // remaining 0-8 bytes
leftover:
        ADDQ    BX, SI
        ADDQ    BX, DI
        MOVQ    -8(SI), CX
        MOVQ    -8(DI), DX
        CMPQ    CX, DX
        SETEQ   AX
        RET

small:
        CMPQ    BX, $0
        JEQ     equal

        LEAQ    0(BX*8), CX
        NEGQ    CX

        CMPB    SI, $0xf8
        JA      si_high

        // load at SI won't cross a page boundary.
        MOVQ    (SI), SI
        JMP     si_finish
si_high:
        // address ends in 11111xxx.  Load up to bytes we want, move to correct position.
        MOVQ    BX, DX
        ADDQ    SI, DX
        MOVQ    -8(DX), SI
        SHRQ    CX, SI
si_finish:

        // same for DI.
        CMPB    DI, $0xf8
        JA      di_high
        MOVQ    (DI), DI
        JMP     di_finish
di_high:
        MOVQ    BX, DX
        ADDQ    DI, DX
        MOVQ    -8(DX), DI
        SHRQ    CX, DI
di_finish:

        SUBQ    SI, DI
        SHLQ    CX, DI
equal:
        SETEQ   AX
        RET

TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
        MOVL    s1+0(FP), SI
        MOVL    s1+4(FP), BX
        MOVL    s2+8(FP), DI
        MOVL    s2+12(FP), DX
        CALL    runtime·cmpbody(SB)
        MOVL    AX, res+16(FP)
        RET

TEXT bytes·Compare(SB),NOSPLIT,$0-28
        MOVL    s1+0(FP), SI
        MOVL    s1+4(FP), BX
        MOVL    s2+12(FP), DI
        MOVL    s2+16(FP), DX
        CALL    runtime·cmpbody(SB)
        MOVQ    AX, res+24(FP)
        RET

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = 1/0/-1
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
        CMPQ    SI, DI
        JEQ     cmp_allsame
        CMPQ    BX, DX
        MOVQ    DX, R8
        CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
        CMPQ    R8, $8
        JB      cmp_small

cmp_loop:
        CMPQ    R8, $16
        JBE     cmp_0through16
        MOVOU   (SI), X0
        MOVOU   (DI), X1
        PCMPEQB X0, X1
        PMOVMSKB X1, AX
        XORQ    $0xffff, AX     // convert EQ to NE
        JNE     cmp_diff16      // branch if at least one byte is not equal
        ADDQ    $16, SI
        ADDQ    $16, DI
        SUBQ    $16, R8
        JMP     cmp_loop
        
        // AX = bit mask of differences
cmp_diff16:
        BSFQ    AX, BX  // index of first byte that differs
        XORQ    AX, AX
        ADDQ    BX, SI
        MOVB    (SI), CX
        ADDQ    BX, DI
        CMPB    CX, (DI)
        SETHI   AX
        LEAQ    -1(AX*2), AX    // convert 1/0 to +1/-1
        RET

        // 0 through 16 bytes left, alen>=8, blen>=8
cmp_0through16:
        CMPQ    R8, $8
        JBE     cmp_0through8
        MOVQ    (SI), AX
        MOVQ    (DI), CX
        CMPQ    AX, CX
        JNE     cmp_diff8
cmp_0through8:
        ADDQ    R8, SI
        ADDQ    R8, DI
        MOVQ    -8(SI), AX
        MOVQ    -8(DI), CX
        CMPQ    AX, CX
        JEQ     cmp_allsame

        // AX and CX contain parts of a and b that differ.
cmp_diff8:
        BSWAPQ  AX      // reverse order of bytes
        BSWAPQ  CX
        XORQ    AX, CX
        BSRQ    CX, CX  // index of highest bit difference
        SHRQ    CX, AX  // move a's bit to bottom
        ANDQ    $1, AX  // mask bit
        LEAQ    -1(AX*2), AX // 1/0 => +1/-1
        RET

        // 0-7 bytes in common
cmp_small:
        LEAQ    (R8*8), CX      // bytes left -> bits left
        NEGQ    CX              //  - bits lift (== 64 - bits left mod 64)
        JEQ     cmp_allsame

        // load bytes of a into high bytes of AX
        CMPB    SI, $0xf8
        JA      cmp_si_high
        MOVQ    (SI), SI
        JMP     cmp_si_finish
cmp_si_high:
        ADDQ    R8, SI
        MOVQ    -8(SI), SI
        SHRQ    CX, SI
cmp_si_finish:
        SHLQ    CX, SI

        // load bytes of b in to high bytes of BX
        CMPB    DI, $0xf8
        JA      cmp_di_high
        MOVQ    (DI), DI
        JMP     cmp_di_finish
cmp_di_high:
        ADDQ    R8, DI
        MOVQ    -8(DI), DI
        SHRQ    CX, DI
cmp_di_finish:
        SHLQ    CX, DI

        BSWAPQ  SI      // reverse order of bytes
        BSWAPQ  DI
        XORQ    SI, DI  // find bit differences
        JEQ     cmp_allsame
        BSRQ    DI, CX  // index of highest bit difference
        SHRQ    CX, SI  // move a's bit to bottom
        ANDQ    $1, SI  // mask bit
        LEAQ    -1(SI*2), AX // 1/0 => +1/-1
        RET

cmp_allsame:
        XORQ    AX, AX
        XORQ    CX, CX
        CMPQ    BX, DX
        SETGT   AX      // 1 if alen > blen
        SETEQ   CX      // 1 if alen == blen
        LEAQ    -1(CX)(AX*2), AX        // 1,0,-1 result
        RET

TEXT bytes·IndexByte(SB),NOSPLIT,$0
        MOVL s+0(FP), SI
        MOVL s_len+4(FP), BX
        MOVB c+12(FP), AL
        CALL runtime·indexbytebody(SB)
        MOVL AX, ret+16(FP)
        RET

TEXT strings·IndexByte(SB),NOSPLIT,$0
        MOVL s+0(FP), SI
        MOVL s_len+4(FP), BX
        MOVB c+8(FP), AL
        CALL runtime·indexbytebody(SB)
        MOVL AX, ret+16(FP)
        RET

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
// output:
//   AX
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
        MOVL SI, DI

        CMPL BX, $16
        JLT indexbyte_small

        // round up to first 16-byte boundary
        TESTL $15, SI
        JZ aligned
        MOVL SI, CX
        ANDL $~15, CX
        ADDL $16, CX

        // search the beginning
        SUBL SI, CX
        REPN; SCASB
        JZ success

// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
        // round down to last 16-byte boundary
        MOVL BX, R11
        ADDL SI, R11
        ANDL $~15, R11

        // shuffle X0 around so that each byte contains c
        MOVD AX, X0
        PUNPCKLBW X0, X0
        PUNPCKLBW X0, X0
        PSHUFL $0, X0, X0
        JMP condition

sse:
        // move the next 16-byte chunk of the buffer into X1
        MOVO (DI), X1
        // compare bytes in X0 to X1
        PCMPEQB X0, X1
        // take the top bit of each byte in X1 and put the result in DX
        PMOVMSKB X1, DX
        TESTL DX, DX
        JNZ ssesuccess
        ADDL $16, DI

condition:
        CMPL DI, R11
        JLT sse

        // search the end
        MOVL SI, CX
        ADDL BX, CX
        SUBL R11, CX
        // if CX == 0, the zero flag will be set and we'll end up
        // returning a false success
        JZ failure
        REPN; SCASB
        JZ success

failure:
        MOVL $-1, AX
        RET

// handle for lengths < 16
indexbyte_small:
        MOVL BX, CX
        REPN; SCASB
        JZ success
        MOVL $-1, AX
        RET

// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
        // get the index of the least significant set bit
        BSFW DX, DX
        SUBL SI, DI
        ADDL DI, DX
        MOVL DX, AX
        RET

success:
        SUBL SI, DI
        SUBL $1, DI
        MOVL DI, AX
        RET

TEXT bytes·Equal(SB),NOSPLIT,$0-25
        MOVL    a_len+4(FP), BX
        MOVL    b_len+16(FP), CX
        XORL    AX, AX
        CMPL    BX, CX
        JNE     eqret
        MOVL    a+0(FP), SI
        MOVL    b+12(FP), DI
        CALL    runtime·memeqbody(SB)
eqret:
        MOVB    AX, ret+24(FP)
        RET

TEXT runtime·timenow(SB), NOSPLIT, $0-0
        JMP     time·now(SB)

/* [<][>][^][v][top][bottom][index][help] */