root/src/cmd/8g/ggen.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. defframe
  2. zerorange
  3. appendpp
  4. markautoused
  5. fixautoused
  6. clearfat
  7. ginscall
  8. cgen_callinter
  9. cgen_call
  10. cgen_callret
  11. cgen_aret
  12. cgen_ret
  13. cgen_asop
  14. samereg
  15. dodiv
  16. savex
  17. restx
  18. cgen_div
  19. cgen_shift
  20. cgen_bmul
  21. cgen_hmul
  22. cgen_float
  23. cgen_float387
  24. cgen_floatsse
  25. bgen_float
  26. expandchecks

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#undef  EXTERN
#define EXTERN
#include <u.h>
#include <libc.h>
#include "gg.h"
#include "opt.h"

// Forward declarations of file-local helpers that are defined further down
// in this file: appendpp inserts a new instruction after a given Prog,
// zerorange emits the instructions that zero one stack range.
static Prog *appendpp(Prog*, int, int, vlong, int, vlong);
static Prog *zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *ax);

/*
 * Finalize the frame description stored in ptxt and insert, right
 * after ptxt, the instructions that zero every declared variable
 * marked needzero.  Consecutive variables are coalesced into a single
 * range when the new variable ends exactly 2*widthptr below the start
 * of the range collected so far.
 */
void
defframe(Prog *ptxt)
{
        uint32 frame, ax;
        Prog *tail;
        vlong lo, hi, top;
        NodeList *l;
        Node *n;

        // argument size, then final stack size, both rounded to widthptr
        ptxt->to.offset2 = rnd(curfn->type->argwid, widthptr);
        frame = rnd(stksize+maxarg, widthptr);
        ptxt->to.offset = frame;

        // Zero ambiguously live variables so that the garbage collector
        // only sees initialized values when it looks for pointers.
        tail = ptxt;
        lo = 0;
        hi = 0;
        ax = 0;         // becomes 1 once zerorange has loaded 0 into AX
        for(l = curfn->dcl; l != nil; l = l->next) {
                n = l->n;
                if(!n->needzero)
                        continue;
                if(n->class != PAUTO)
                        fatal("needzero class %d", n->class);
                if(n->type->width % widthptr != 0 || n->xoffset % widthptr != 0 || n->type->width == 0)
                        fatal("var %lN has size %d offset %d", n, (int)n->type->width, (int)n->xoffset);

                top = n->xoffset + n->type->width;
                if(lo == hi || top != lo - 2*widthptr) {
                        // cannot merge: flush the pending range and start a new one
                        tail = zerorange(tail, frame, lo, hi, &ax);
                        hi = top;
                        lo = n->xoffset;
                } else {
                        // extend the pending range downwards
                        lo = n->xoffset;
                }
        }

        // flush whatever range is still pending
        zerorange(tail, frame, lo, hi, &ax);
}

/*
 * Append, after p, the instructions that zero the stack bytes
 * [frame+lo, frame+hi) relative to SP, and return the last
 * instruction appended (p itself when the range is empty).
 * *ax records whether AX has already been loaded with zero.
 */
static Prog*
zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *ax)
{
        vlong nbytes, off, base;

        nbytes = hi - lo;
        if(nbytes == 0)
                return p;

        // load the zero into AX only once per function
        if(*ax == 0) {
                p = appendpp(p, AMOVL, D_CONST, 0, D_AX, 0);
                *ax = 1;
        }

        base = frame + lo;

        // small range: one store per register-sized word
        if(nbytes <= 4*widthreg) {
                off = 0;
                while(off < nbytes) {
                        p = appendpp(p, AMOVL, D_AX, 0, D_SP+D_INDIR, base+off);
                        off += widthreg;
                }
                return p;
        }

        // medium range (not on nacl): jump into runtime duffzero
        if(!nacl && nbytes <= 128*widthreg) {
                p = appendpp(p, ALEAL, D_SP+D_INDIR, base, D_DI, 0);
                p = appendpp(p, ADUFFZERO, D_NONE, 0, D_ADDR, 1*(128-nbytes/widthreg));
                p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
                return p;
        }

        // large range: REP STOSL with the word count in CX
        p = appendpp(p, AMOVL, D_CONST, nbytes/widthreg, D_CX, 0);
        p = appendpp(p, ALEAL, D_SP+D_INDIR, base, D_DI, 0);
        p = appendpp(p, AREP, D_NONE, 0, D_NONE, 0);
        p = appendpp(p, ASTOSL, D_NONE, 0, D_NONE, 0);
        return p;
}

/*
 * Allocate a new instruction with opcode as, the given from/to operand
 * types and offsets, and p's line number; link it in directly after p
 * and return it.
 */
static Prog*
appendpp(Prog *p, int as, int ftype, vlong foffset, int ttype, vlong toffset)
{
        Prog *np;

        np = mal(sizeof(*np));
        clearp(np);

        np->as = as;
        np->lineno = p->lineno;

        np->from.type = ftype;
        np->from.offset = foffset;

        np->to.type = ttype;
        np->to.offset = toffset;

        // splice np between p and p's old successor
        np->link = p->link;
        p->link = np;

        return np;
}

// Walk the instruction list starting at p and set the used flag on every
// node referenced by an operand.  TYPE, VARDEF and VARKILL instructions
// are skipped and therefore do not count as uses.
void
markautoused(Prog* p)
{
        while (p) {
                switch (p->as) {
                case ATYPE:
                case AVARDEF:
                case AVARKILL:
                        break;
                default:
                        if (p->from.node)
                                p->from.node->used = 1;
                        if (p->to.node)
                                p->to.node->used = 1;
                        break;
                }
                p = p->link;
        }
}

// Fix up instructions after allocauto (formerly compactframe) has moved all
// autos around: drop TYPE instructions for unused autos, turn VARDEF/VARKILL
// of unused nodes into no-ops, and add each node's stkdelta to the offsets of
// the D_AUTO operands that reference it.
void
fixautoused(Prog* p)
{
        Prog **slot;
        Prog *cur;

        slot = &p;
        while ((cur = *slot) != P) {
                if (cur->as == ATYPE && cur->from.node && cur->from.type == D_AUTO && !cur->from.node->used) {
                        // unlink: the slot now holds the successor, so do not advance
                        *slot = cur->link;
                        continue;
                }

                if ((cur->as == AVARDEF || cur->as == AVARKILL) && cur->to.node && !cur->to.node->used) {
                        // A VARDEF cannot simply be unlinked: unlike TYPE it sits in
                        // the middle of other code and may be a jump target.  Rewrite
                        // it as a no-op; a later pass removes the no-ops.
                        cur->to.type = D_NONE;
                        cur->to.node = N;
                        cur->as = ANOP;
                        continue;
                }

                if (cur->from.type == D_AUTO && cur->from.node)
                        cur->from.offset += cur->from.node->stkdelta;
                if (cur->to.type == D_AUTO && cur->to.node)
                        cur->to.offset += cur->to.node->stkdelta;

                slot = &cur->link;
        }
}

/*
 * Generate code that zeroes the multi-word ("fat") object nl.
 * If componentgen can handle the object nothing else is emitted;
 * otherwise the address is loaded into DI, zero into AX, and the
 * object is cleared 4 bytes at a time followed by the leftover bytes.
 */
void
clearfat(Node *nl)
{
        uint32 size, nword, nbyte, i;
        Node di;
        Prog *p;

        if(debug['g'])
                dump("\nclearfat", nl);

        size = nl->type->width;

        // Avoid taking the address for simple enough types.
        if(componentgen(N, nl))
                return;

        nword = size / 4;       // number of 4-byte stores
        nbyte = size % 4;       // trailing single-byte stores

        nodreg(&di, types[tptr], D_DI);
        agen(nl, &di);
        gconreg(AMOVL, 0, D_AX);

        if(nword > 128 || (nacl && nword >= 4)) {
                // too big for duffzero (or nacl): REP STOSL with count in CX
                gconreg(AMOVL, nword, D_CX);
                gins(AREP, N, N);
                gins(ASTOSL, N, N);
        } else if(nword >= 4) {
                p = gins(ADUFFZERO, N, N);
                p->to.type = D_ADDR;
                p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
                // 1 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
                p->to.offset = 1*(128-nword);
        } else {
                // fewer than 4 words: unrolled STOSL
                for(i = 0; i < nword; i++)
                        gins(ASTOSL, N, N);
        }

        // leftover bytes: unrolled STOSB
        for(i = 0; i < nbyte; i++)
                gins(ASTOSB, N, N);
}

/*
 * generate:
 *      call f
 *      proc=-1 normal call but no return
 *      proc=0  normal call
 *      proc=1  goroutine run in new proc
 *      proc=2  defer call save away stack
 *      proc=3  normal call to C pointer (not Go func value)
 *
 * Any other proc value is a fatal error.  For some calls (see below)
 * the argument size is recorded with gargsize before the call and
 * reset with gargsize(-1) afterwards.
 */
void
ginscall(Node *f, int proc)
{
        int32 arg;
        Prog *p;
        Node reg, r1, con;

        if(f->type != T)
                setmaxarg(f->type);

        // arg == -1 means "no call-specific argument size is emitted".
        arg = -1;
        // Most functions have a fixed-size argument block, so traceback uses that during unwind.
        // Not all, though: there are some variadic functions in package runtime,
        // and for those we emit call-specific metadata recorded by caller.
        // Reflect generates functions with variable argsize (see reflect.methodValueCall/makeFuncStub),
        // so we do this for all indirect calls as well.
        if(f->type != T && (f->sym == S || (f->sym != S && f->sym->pkg == runtimepkg) || proc == 1 || proc == 2)) {
                arg = f->type->argwid;
                // go/defer push two extra words below (the function and the arg size).
                if(proc == 1 || proc == 2)
                        arg += 2*widthptr;
        }

        if(arg != -1)
                gargsize(arg);

        switch(proc) {
        default:
                fatal("ginscall: bad proc %d", proc);
                break;

        case 0: // normal call
        case -1:        // normal call but no return
                if(f->op == ONAME && f->class == PFUNC) {
                        // Direct call to a named function.
                        if(f == deferreturn) {
                                // Deferred calls will appear to be returning to
                                // the CALL deferreturn(SB) that we are about to emit.
                                // However, the stack trace code will show the line
                                // of the instruction byte before the return PC.
                                // To avoid that being an unrelated instruction,
                                // insert an x86 NOP so that it has the right line number.
                                // x86 NOP 0x90 is really XCHG AX, AX; use that description
                                // because the NOP pseudo-instruction will be removed by
                                // the linker.
                                nodreg(&reg, types[TINT], D_AX);
                                gins(AXCHGL, &reg, &reg);
                        }
                        p = gins(ACALL, N, f);
                        afunclit(&p->to, f);
                        // Mark the code after a non-returning call as undefined.
                        if(proc == -1 || noreturn(p))
                                gins(AUNDEF, N, N);
                        break;
                }
                // Indirect call through a func value: load the value into DX,
                // load the word it points at into BX, then CALL with both operands.
                nodreg(&reg, types[tptr], D_DX);
                nodreg(&r1, types[tptr], D_BX);
                gmove(f, &reg);
                reg.op = OINDREG;
                gmove(&reg, &r1);
                reg.op = OREGISTER;
                gins(ACALL, &reg, &r1);
                break;

        case 3: // normal call of c function pointer
                gins(ACALL, N, f);
                break;

        case 1: // call in new proc (go)
        case 2: // deferred call (defer)
                // Push f and the argument size, call newproc/deferproc,
                // then pop the two pushed words into CX to discard them.
                nodreg(&reg, types[TINT32], D_CX);
                gins(APUSHL, f, N);
                nodconst(&con, types[TINT32], argsize(f->type));
                gins(APUSHL, &con, N);
                if(proc == 1)
                        ginscall(newproc, 0);
                else
                        ginscall(deferproc, 0);
                gins(APOPL, N, &reg);
                gins(APOPL, N, &reg);
                if(proc == 2) {
                        // After deferproc: if AX is non-zero, emit a return;
                        // the JEQ skips that return when AX is zero.
                        // NOTE(review): the AX node is typed TINT64 but is tested
                        // with the 32-bit TESTL -- confirm this is intended.
                        nodreg(&reg, types[TINT64], D_AX);
                        gins(ATESTL, &reg, &reg);
                        p = gbranch(AJEQ, T, +1);
                        cgen_ret(N);
                        patch(p, pc);
                }
                break;
        }

        if(arg != -1)
                gargsize(-1);
}

/*
 * n is call to interface method.
 * generate res = n.
 *
 * n->left must be an ODOTINTER node whose right child is the ONAME of
 * the method and whose left child is the interface value.  proc is
 * forwarded to ginscall, except that a plain call (proc == 0) is
 * turned into a C-function-pointer call (proc == 3).
 */
void
cgen_callinter(Node *n, Node *res, int proc)
{
        Node *i, *f;
        Node tmpi, nodi, nodo, nodr, nodsp;

        i = n->left;
        if(i->op != ODOTINTER)
                fatal("cgen_callinter: not ODOTINTER %O", i->op);

        f = i->right;           // field
        if(f->op != ONAME)
                fatal("cgen_callinter: not ONAME %O", f->op);

        i = i->left;            // interface

        // Evaluate a non-addressable interface value into a temporary first.
        if(!i->addable) {
                tempname(&tmpi, i->type);
                cgen(i, &tmpi);
                i = &tmpi;
        }

        genlist(n->list);               // assign the args

        // i is now addable, prepare an indirected
        // register to hold its address.
        igen(i, &nodi, res);            // REG = &inter

        // Store the second word of the interface at 0(SP).
        nodindreg(&nodsp, types[tptr], D_SP);
        nodi.type = types[tptr];
        nodi.xoffset += widthptr;
        cgen(&nodi, &nodsp);    // 0(SP) = 4(REG) -- i.data

        // Load the first word of the interface into a register.
        regalloc(&nodo, types[tptr], res);
        nodi.type = types[tptr];
        nodi.xoffset -= widthptr;
        cgen(&nodi, &nodo);     // REG = 0(REG) -- i.tab
        regfree(&nodi);

        regalloc(&nodr, types[tptr], &nodo);
        if(n->left->xoffset == BADWIDTH)
                fatal("cgen_callinter: badwidth");
        cgen_checknil(&nodo);
        // Address the method's slot: method offset plus a fixed header of
        // 3*widthptr + 8 bytes (the "20" in the comments below).
        nodo.op = OINDREG;
        nodo.xoffset = n->left->xoffset + 3*widthptr + 8;

        if(proc == 0) {
                // plain call: use direct c function pointer - more efficient
                cgen(&nodo, &nodr);     // REG = 20+offset(REG) -- i.tab->fun[f]
                proc = 3;
        } else {
                // go/defer. generate go func value.
                gins(ALEAL, &nodo, &nodr);      // REG = &(20+offset(REG)) -- i.tab->fun[f]
        }

        nodr.type = n->left->type;
        ginscall(&nodr, proc);

        regfree(&nodr);
        regfree(&nodo);
}

/*
 * generate function call;
 *      proc=0  normal call
 *      proc=1  goroutine run in new proc
 *      proc=2  defer call save away stack
 *
 * A nil n generates nothing.  Calls to a named function (ONAME with
 * class PFUNC) are emitted directly; every other callee is first
 * loaded into a register and called through it.
 */
void
cgen_call(Node *n, int proc)
{
        Type *fntype;
        Node callee, saved;
        Node *src;

        if(n == N)
                return;

        // If the callee expression itself involves a function call,
        // compute it into a temporary before the arguments are assigned.
        if(n->left->ullman >= UINF) {
                tempname(&saved, types[tptr]);
                cgen(n->left, &saved);
        }

        genlist(n->list);               // assign the args
        fntype = n->left->type;

        // Pick where an indirect callee is loaded from:
        // the precomputed temporary, or the expression itself.
        src = N;
        if(n->left->ullman >= UINF)
                src = &saved;
        else if(n->left->op != ONAME || n->left->class != PFUNC)
                src = n->left;

        if(src != N) {
                // call through a pointer held in a register
                regalloc(&callee, types[tptr], N);
                cgen_as(&callee, src);
                callee.type = fntype;
                ginscall(&callee, proc);
                regfree(&callee);
                return;
        }

        // call direct
        n->left->method = 1;
        ginscall(n->left, proc);
}

/*
 * call to n has already been generated.
 * generate:
 *      res = return value from call.
 *
 * Only the first output parameter is copied; it is read from the
 * stack at its offset from SP.
 */
void
cgen_callret(Node *n, Node *res)
{
        Node out;
        Type *first, *fntype;
        Iter it;

        // look through a pointer type to the pointed-to type
        fntype = n->left->type;
        if(fntype->etype == TPTR32 || fntype->etype == TPTR64)
                fntype = fntype->type;

        first = structfirst(&it, getoutarg(fntype));
        if(first == T)
                fatal("cgen_callret: nil");

        // build an SP-relative memory operand for the result slot
        memset(&out, 0, sizeof(out));
        out.op = OINDREG;
        out.val.u.reg = D_SP;
        out.addable = 1;
        out.xoffset = first->width;
        out.type = first->type;

        cgen_as(res, &out);
}

/*
 * call to n has already been generated.
 * generate:
 *      res = &return value from call.
 *
 * The address of the first output parameter (an SP-relative slot) is
 * computed with LEAL, going through a scratch register when res is
 * not itself a register.
 */
void
cgen_aret(Node *n, Node *res)
{
        Node slot, scratch;
        Type *first, *fntype;
        Iter it;

        fntype = n->left->type;
        if(isptr[fntype->etype])
                fntype = fntype->type;

        first = structfirst(&it, getoutarg(fntype));
        if(first == T)
                fatal("cgen_aret: nil");

        memset(&slot, 0, sizeof(slot));
        slot.op = OINDREG;
        slot.val.u.reg = D_SP;
        slot.addable = 1;
        slot.xoffset = first->width;
        slot.type = first->type;

        if(res->op == OREGISTER) {
                // destination is a register: LEAL straight into it
                gins(ALEAL, &slot, res);
                return;
        }

        // destination is not a register: LEAL into a scratch register, then store
        regalloc(&scratch, types[tptr], res);
        gins(ALEAL, &slot, &scratch);
        gins(AMOVL, &scratch, res);
        regfree(&scratch);
}

/*
 * generate return.
 * n->left is assignments to return values.
 *
 * n may be N, in which case no result assignments are generated.
 * For an ORETJMP node the RET instruction is given the target symbol
 * taken from n->left.
 */
void
cgen_ret(Node *n)
{
        Prog *ret;

        if(n != N)
                genlist(n->list);               // copy out args

        // functions containing defer call deferreturn before returning
        if(hasdefer)
                ginscall(deferreturn, 0);

        genlist(curfn->exit);

        ret = gins(ARET, N, N);
        if(n == N || n->op != ORETJMP)
                return;

        ret->to.type = D_EXTERN;
        ret->to.sym = linksym(n->left->sym);
}

/*
 * generate += *= etc.
 *
 * n->left is the destination, n->right the operand and n->etype the
 * underlying operation.  Fast paths exist for 32-bit-or-smaller integer
 * operands (INC/DEC for +=1 / -=1, and direct read-modify-write for
 * ADD/SUB/XOR/AND/OR); everything else goes through the general
 * "hard" path at the bottom.
 */
void
cgen_asop(Node *n)
{
        Node n1, n2, n3, n4;
        Node *nl, *nr;
        Prog *p1;
        Addr addr;
        int a;

        nl = n->left;
        nr = n->right;

        // Both sides involve function calls: evaluate the right side into a
        // temporary first, then redo the operation with that temporary.
        if(nr->ullman >= UINF && nl->ullman >= UINF) {
                tempname(&n1, nr->type);
                cgen(nr, &n1);
                n2 = *n;
                n2.right = &n1;
                cgen_asop(&n2);
                goto ret;
        }

        // The fast paths below only handle non-64-bit integers.
        if(!isint[nl->type->etype])
                goto hard;
        if(!isint[nr->type->etype])
                goto hard;
        if(is64(nl->type) || is64(nr->type))
                goto hard;

        // x += 1 and x -= 1 become INC / DEC on the destination.
        switch(n->etype) {
        case OADD:
                if(smallintconst(nr))
                if(mpgetfix(nr->val.u.xval) == 1) {
                        a = optoas(OINC, nl->type);
                        if(nl->addable) {
                                gins(a, N, nl);
                                goto ret;
                        }
                        if(sudoaddable(a, nl, &addr)) {
                                p1 = gins(a, N, N);
                                p1->to = addr;
                                sudoclean();
                                goto ret;
                        }
                }
                break;

        case OSUB:
                if(smallintconst(nr))
                if(mpgetfix(nr->val.u.xval) == 1) {
                        a = optoas(ODEC, nl->type);
                        if(nl->addable) {
                                gins(a, N, nl);
                                goto ret;
                        }
                        if(sudoaddable(a, nl, &addr)) {
                                p1 = gins(a, N, N);
                                p1->to = addr;
                                sudoclean();
                                goto ret;
                        }
                }
                break;
        }

        // Operations that can be applied directly to the destination operand:
        // the right side is either a small constant or loaded into a register.
        switch(n->etype) {
        case OADD:
        case OSUB:
        case OXOR:
        case OAND:
        case OOR:
                a = optoas(n->etype, nl->type);
                if(nl->addable) {
                        if(smallintconst(nr)) {
                                gins(a, nr, nl);
                                goto ret;
                        }
                        regalloc(&n2, nr->type, N);
                        cgen(nr, &n2);
                        gins(a, &n2, nl);
                        regfree(&n2);
                        goto ret;
                }
                // Destination not addable: try sudoaddable, but only when
                // the right side does not involve a function call.
                if(nr->ullman < UINF)
                if(sudoaddable(a, nl, &addr)) {
                        if(smallintconst(nr)) {
                                p1 = gins(a, nr, N);
                                p1->to = addr;
                                sudoclean();
                                goto ret;
                        }
                        regalloc(&n2, nr->type, N);
                        cgen(nr, &n2);
                        p1 = gins(a, &n2, N);
                        p1->to = addr;
                        regfree(&n2);
                        sudoclean();
                        goto ret;
                }
        }

hard:
        // General path: evaluate both operands, compute "nl op nr" as an
        // ordinary expression and move the result back into nl.
        // n1.op / n2.op are cleared so the cleanup below can tell whether
        // they were filled in.
        n2.op = 0;
        n1.op = 0;
        if(nr->ullman >= nl->ullman || nl->addable) {
                mgen(nr, &n2, N);
                nr = &n2;
        } else {
                tempname(&n2, nr->type);
                cgen(nr, &n2);
                nr = &n2;
        }
        if(!nl->addable) {
                igen(nl, &n1, N);
                nl = &n1;
        }

        // n3 is a copy of n rewritten as the plain binary operation.
        n3 = *n;
        n3.left = nl;
        n3.right = nr;
        n3.op = n->etype;

        mgen(&n3, &n4, N);
        gmove(&n4, nl);

        // n1 was only set up (by igen) when nl was not addable.
        if(n1.op)
                regfree(&n1);
        mfree(&n2);
        mfree(&n4);

ret:
        ;
}

/*
 * Report (1 or 0) whether a and b are both OREGISTER nodes
 * that name the same register.
 */
int
samereg(Node *a, Node *b)
{
        if(a->op == OREGISTER && b->op == OREGISTER)
                return a->val.u.reg == b->val.u.reg;
        return 0;
}

/*
 * generate division.
 * caller must set:
 *      ax = allocated AX register
 *      dx = allocated DX register
 * generates one of:
 *      res = nl / nr
 *      res = nl % nr
 * according to op.
 */
void
dodiv(int op, Node *nl, Node *nr, Node *res, Node *ax, Node *dx)
{
        int check;
        Node n1, t1, t2, t3, t4, n4, nz;
        Type *t, *t0;
        Prog *p1, *p2;

        // Have to be careful about handling
        // most negative int divided by -1 correctly.
        // The hardware will trap.
        // Also the byte divide instruction needs AH,
        // which we otherwise don't have to deal with.
        // Easiest way to avoid for int8, int16: use int32.
        // For int32 and int64, use explicit test.
        // Could use int64 hw for int32.
        t = nl->type;
        t0 = t;         // original operand type, kept to detect widening below
        check = 0;
        if(issigned[t->etype]) {
                // A signed divide needs the -1 check unless a constant operand
                // proves the "most negative / -1" case cannot happen.
                // NOTE(review): -1LL<<k left-shifts a negative value, which is
                // undefined in ISO C -- relies on the compiler in use.
                check = 1;
                if(isconst(nl, CTINT) && mpgetfix(nl->val.u.xval) != -1LL<<(t->width*8-1))
                        check = 0;
                else if(isconst(nr, CTINT) && mpgetfix(nr->val.u.xval) != -1)
                        check = 0;
        }
        if(t->width < 4) {
                // Operands narrower than 4 bytes are divided as 32-bit values;
                // no -1 check is emitted in that case.
                if(issigned[t->etype])
                        t = types[TINT32];
                else
                        t = types[TUINT32];
                check = 0;
        }

        // Evaluate both operands into temporaries of the working type t.
        tempname(&t1, t);
        tempname(&t2, t);
        if(t0 != t) {
                tempname(&t3, t0);
                tempname(&t4, t0);
                cgen(nl, &t3);
                cgen(nr, &t4);
                // Convert.
                gmove(&t3, &t1);
                gmove(&t4, &t2);
        } else {
                cgen(nl, &t1);
                cgen(nr, &t2);
        }

        // Divisor goes into n1; res is only offered as the allocation hint
        // when it is neither AX nor DX.  Dividend goes into AX.
        if(!samereg(ax, res) && !samereg(dx, res))
                regalloc(&n1, t, res);
        else
                regalloc(&n1, t, N);
        gmove(&t2, &n1);
        gmove(&t1, ax);
        p2 = P;
        if(nacl) {
                // Native Client does not relay the divide-by-zero trap
                // to the executing program, so we must insert a check
                // for ourselves.
                nodconst(&n4, t, 0);
                gins(optoas(OCMP, t), &n1, &n4);
                p1 = gbranch(optoas(ONE, t), T, +1);
                if(panicdiv == N)
                        panicdiv = sysfunc("panicdivide");
                ginscall(panicdiv, -1);
                patch(p1, pc);
        }
        if(check) {
                // Divisor == -1: compute the result without the divide
                // instruction, then jump (p2) past the divide.
                nodconst(&n4, t, -1);
                gins(optoas(OCMP, t), &n1, &n4);
                p1 = gbranch(optoas(ONE, t), T, +1);
                if(op == ODIV) {
                        // a / (-1) is -a.
                        gins(optoas(OMINUS, t), N, ax);
                        gmove(ax, res);
                } else {
                        // a % (-1) is 0.
                        nodconst(&n4, t, 0);
                        gmove(&n4, res);
                }
                p2 = gbranch(AJMP, T, 0);
                patch(p1, pc);
        }
        // Unsigned: zero DX.  Signed: emit the OEXTEND instruction for t.
        if(!issigned[t->etype]) {
                nodconst(&nz, t, 0);
                gmove(&nz, dx);
        } else
                gins(optoas(OEXTEND, t), N, N);
        gins(optoas(op, t), &n1, N);
        regfree(&n1);

        // Quotient is taken from AX, remainder from DX.
        if(op == ODIV)
                gmove(ax, res);
        else
                gmove(dx, res);
        // Landing point for the "divisor == -1" shortcut.
        if(check)
                patch(p2, pc);
}

/*
 * Claim register dr for a division: build its node in x, spill the
 * current contents to a temporary (recorded in oldx) when the register
 * is live and is not the destination res, then allocate it with type t.
 * oldx->op stays 0 when nothing was spilled.
 */
static void
savex(int dr, Node *x, Node *oldx, Node *res, Type *t)
{
        int inuse;

        // sample the use count before the register is touched
        inuse = reg[dr] > 0;
        nodreg(x, types[TINT32], dr);

        memset(oldx, 0, sizeof *oldx);
        if(inuse && !samereg(x, res)) {
                tempname(oldx, types[TINT32]);
                gmove(x, oldx);
        }

        regalloc(x, t, x);
}

/*
 * Release register x and, if oldx holds a spilled value
 * (oldx->op != 0), move that value back into x as an int32.
 */
static void
restx(Node *x, Node *oldx)
{
        regfree(x);

        if(oldx->op == 0)
                return;         // nothing was saved

        x->type = types[TINT32];
        gmove(oldx, x);
}

/*
 * generate division according to op, one of:
 *      res = nl / nr
 *      res = nl % nr
 *
 * 64-bit operands are a fatal error.  AX and DX are claimed around
 * the call to dodiv and released in the reverse order.
 */
void
cgen_div(int op, Node *nl, Node *nr, Node *res)
{
        Node ax, dx, savedax, saveddx;
        Type *regtype;

        if(is64(nl->type))
                fatal("cgen_div %T", nl->type);

        // AX/DX are allocated as 32-bit values with the operand's signedness
        regtype = issigned[nl->type->etype] ? types[TINT32] : types[TUINT32];

        savex(D_AX, &ax, &savedax, res, regtype);
        savex(D_DX, &dx, &saveddx, res, regtype);

        dodiv(op, nl, nr, res, &ax, &dx);

        restx(&dx, &saveddx);
        restx(&ax, &savedax);
}

/*
 * generate shift according to op, one of:
 *      res = nl << nr
 *      res = nl >> nr
 *
 * nl must be at most 4 bytes wide (anything wider is a fatal error).
 * When bounded is non-zero no code is emitted to handle shift counts
 * of w bits or more.
 */
void
cgen_shift(int op, int bounded, Node *nl, Node *nr, Node *res)
{
        Node n1, n2, nt, cx, oldcx, hi, lo;
        int a, w;
        Prog *p1, *p2;
        uvlong sc;

        if(nl->type->width > 4)
                fatal("cgen_shift %T", nl->type);

        w = nl->type->width * 8;        // operand width in bits

        a = optoas(op, nl->type);

        // Constant shift count: no need for CX.
        if(nr->op == OLITERAL) {
                tempname(&n2, nl->type);
                cgen(nl, &n2);
                regalloc(&n1, nl->type, res);
                gmove(&n2, &n1);
                sc = mpgetfix(nr->val.u.xval);
                if(sc >= nl->type->width*8) {
                        // large shift gets 2 shifts by width-1
                        gins(a, ncon(w-1), &n1);
                        gins(a, ncon(w-1), &n1);
                } else
                        gins(a, nr, &n1);
                gmove(&n1, res);
                regfree(&n1);
                return;
        }

        // Variable count goes in CX: save the current CX to a temporary
        // if it is in use and is not the destination.
        // NOTE(review): the test here is reg[D_CX] > 1, whereas savex in
        // this file uses > 0 for AX/DX -- confirm the difference is intended.
        memset(&oldcx, 0, sizeof oldcx);
        nodreg(&cx, types[TUINT32], D_CX);
        if(reg[D_CX] > 1 && !samereg(&cx, res)) {
                tempname(&oldcx, types[TUINT32]);
                gmove(&cx, &oldcx);
        }

        // n1 receives the shift count: a stack temporary for counts wider
        // than 4 bytes (CX is allocated later), otherwise CX itself.
        if(nr->type->width > 4) {
                tempname(&nt, nr->type);
                n1 = nt;
        } else {
                nodreg(&n1, types[TUINT32], D_CX);
                regalloc(&n1, nr->type, &n1);           // to hold the shift type in CX
        }

        // n2 receives the value being shifted; never hint CX for it.
        if(samereg(&cx, res))
                regalloc(&n2, nl->type, N);
        else
                regalloc(&n2, nl->type, res);
        // Evaluate the operand with the larger ullman number first.
        if(nl->ullman >= nr->ullman) {
                cgen(nl, &n2);
                cgen(nr, &n1);
        } else {
                cgen(nr, &n1);
                cgen(nl, &n2);
        }

        // test and fix up large shifts
        if(bounded) {
                if(nr->type->width > 4) {
                        // delayed reg alloc
                        nodreg(&n1, types[TUINT32], D_CX);
                        regalloc(&n1, types[TUINT32], &n1);             // to hold the shift type in CX
                        // only the low word of the 64-bit count is used
                        split64(&nt, &lo, &hi);
                        gmove(&lo, &n1);
                        splitclean();
                }
        } else {
                if(nr->type->width > 4) {
                        // delayed reg alloc
                        nodreg(&n1, types[TUINT32], D_CX);
                        regalloc(&n1, types[TUINT32], &n1);             // to hold the shift type in CX
                        split64(&nt, &lo, &hi);
                        gmove(&lo, &n1);
                        // high word != 0 (p2) or low word >= w: fall into the fix-up;
                        // otherwise p1 skips it.
                        gins(optoas(OCMP, types[TUINT32]), &hi, ncon(0));
                        p2 = gbranch(optoas(ONE, types[TUINT32]), T, +1);
                        gins(optoas(OCMP, types[TUINT32]), &n1, ncon(w));
                        p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
                        splitclean();
                        patch(p2, pc);
                } else {
                        gins(optoas(OCMP, nr->type), &n1, ncon(w));
                        p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
                }
                // Fix-up for counts >= w: a signed right shift is pre-shifted
                // by w-1; every other shift starts from zero.
                if(op == ORSH && issigned[nl->type->etype]) {
                        gins(a, ncon(w-1), &n2);
                } else {
                        gmove(ncon(0), &n2);
                }
                patch(p1, pc);
        }
        gins(a, &n1, &n2);

        // Restore CX if it was saved above.
        if(oldcx.op != 0)
                gmove(&oldcx, &cx);

        gmove(&n2, res);

        regfree(&n1);
        regfree(&n2);
}

/*
 * generate byte multiply:
 *      res = nl * nr
 * there is no 2-operand byte multiply instruction so
 * we do a full-width multiplication and truncate afterwards.
 */
void
cgen_bmul(int op, Node *nl, Node *nr, Node *res)
{
        Node acc, other, spill, *swap;
        Type *wide;

        // multiply in 32-bit registers, keeping nl's signedness
        wide = issigned[nl->type->etype] ? types[TINT32] : types[TUINT32];

        // put the operand with the larger ullman number on the left
        if(nl->ullman < nr->ullman) {
                swap = nl;
                nl = nr;
                nr = swap;
        }

        // left operand goes to a stack temporary, right operand to a register
        tempname(&spill, nl->type);
        cgen(nl, &spill);
        regalloc(&acc, wide, res);
        cgen(nr, &acc);

        // reload the left operand into a second register and multiply
        regalloc(&other, wide, N);
        gmove(&spill, &other);
        gins(optoas(op, wide), &other, &acc);
        regfree(&other);

        // moving into res narrows the product back to the result type
        gmove(&acc, res);
        regfree(&acc);
}

/*
 * generate high multiply:
 *   res = (nl*nr) >> width
 *
 * nr is moved into AX, the OHMUL instruction for nl's type is issued
 * with nl (held in a stack temporary) as its operand, and the result
 * is read from DX.  For 1-byte operands AH is first copied into DL.
 */
void
cgen_hmul(Node *nl, Node *nr, Node *res)
{
        Type *ty;
        int mulop;
        Node lhs, rhs, ax, dx;

        ty = nl->type;
        mulop = optoas(OHMUL, ty);

        // left operand into a temporary
        tempname(&lhs, ty);
        cgen(nl, &lhs);

        // right operand into a register
        regalloc(&rhs, ty, res);
        cgen(nr, &rhs);

        // multiply: AX = right operand, instruction operand = left operand
        nodreg(&ax, ty, D_AX);
        gmove(&rhs, &ax);
        gins(mulop, &lhs, N);
        regfree(&rhs);

        if(ty->width == 1) {
                // byte multiply behaves differently: copy AH into DL
                // so the result can be read from DX below.
                nodreg(&ax, ty, D_AH);
                nodreg(&dx, ty, D_DL);
                gmove(&ax, &dx);
        }

        nodreg(&dx, ty, D_DX);
        gmove(&dx, res);
}

// Forward declarations of the two floating-point back ends that
// cgen_float chooses between (387 and SSE); both are defined below.
static void cgen_float387(Node *n, Node *res);
static void cgen_floatsse(Node *n, Node *res);

/*
 * generate floating-point operation.
 *
 * Comparisons, unary plus and conversions are handled here;
 * every other operation is handed to the SSE or 387 generator
 * depending on use_sse.
 */
void
cgen_float(Node *n, Node *res)
{
        Node *operand;
        Node src, conv;
        Prog *skip, *istrue, *done;

        operand = n->left;
        switch(n->op) {
        case OEQ:
        case ONE:
        case OLT:
        case OLE:
        case OGE:
                // Materialize the comparison as a boolean:
                //      jmp test
                // istrue: res = true; jmp done
                // test:   branch to istrue if n holds
                //         res = false
                // done:
                skip = gbranch(AJMP, T, 0);
                istrue = pc;
                gmove(nodbool(1), res);
                done = gbranch(AJMP, T, 0);
                patch(skip, pc);
                bgen(n, 1, 0, istrue);
                gmove(nodbool(0), res);
                patch(done, pc);
                return;

        case OPLUS:
                // unary plus: just generate the operand
                cgen(operand, res);
                return;

        case OCONV:
                // identical or representation-compatible types need no conversion
                if(eqtype(n->type, operand->type) || noconv(n->type, operand->type)) {
                        cgen(operand, res);
                        return;
                }

                // convert through a temporary of the destination type
                tempname(&conv, n->type);
                mgen(operand, &src, res);
                gmove(&src, &conv);
                gmove(&conv, res);
                mfree(&src);
                return;
        }

        if(use_sse) {
                cgen_floatsse(n, res);
                return;
        }
        cgen_float387(n, res);
}

// floating-point.  387 (not SSE2)
// Generates res = op(n->left) for unary nodes (n->right == N) and
// res = n->left op n->right for binary nodes, working through the
// register nodes D_F0 and D_F0+1.
static void
cgen_float387(Node *n, Node *res)
{
        Node st0, st1;
        Node *lhs, *rhs;

        lhs = n->left;
        rhs = n->right;
        nodreg(&st0, lhs->type, D_F0);
        nodreg(&st1, n->type, D_F0+1);

        if(rhs == N) {
                // unary: load the operand, apply the op unless it is a
                // conversion or unary plus, then store the result
                cgen(lhs, &st0);
                if(n->op != OCONV && n->op != OPLUS)
                        gins(foptoas(n->op, n->type, 0), N, N);
                gmove(&st0, res);
                return;
        }

        // binary: generate the operand with the larger ullman number first
        if(lhs->ullman >= rhs->ullman) {
                cgen(lhs, &st0);
                if(rhs->addable) {
                        gins(foptoas(n->op, n->type, 0), rhs, &st0);
                } else {
                        cgen(rhs, &st0);
                        gins(foptoas(n->op, n->type, Fpop), &st0, &st1);
                }
        } else {
                // operands were generated in the opposite order,
                // so the reversed (Frev) form of the instruction is used
                cgen(rhs, &st0);
                if(lhs->addable) {
                        gins(foptoas(n->op, n->type, Frev), lhs, &st0);
                } else {
                        cgen(lhs, &st0);
                        gins(foptoas(n->op, n->type, Frev|Fpop), &st0, &st1);
                }
        }
        gmove(&st0, res);
}

// floating-point.  SSE path, selected by cgen_float when use_sse is set.
// Generate code for the floating-point expression n and move the result
// into res.  Any op not listed in the switch is reported with dump+fatal.
static void
cgen_floatsse(Node *n, Node *res)
{
        Node *lhs, *rhs, *swap;
        Node lreg, rnode, ltmp;
        int ins, symmetric;

        lhs = n->left;
        rhs = n->right;
        symmetric = 0;

        switch(n->op) {
        case OMINUS:
        case OCOM:
                // emitted as a multiply by the constant -1
                rhs = nodintconst(-1);
                convlit(&rhs, n->type);
                ins = foptoas(OMUL, lhs->type, 0);
                symmetric = 1;
                break;

        case OADD:
        case OMUL:
                // symmetric binary
                ins = foptoas(n->op, lhs->type, 0);
                symmetric = 1;
                break;

        case OSUB:
        case OMOD:
        case ODIV:
                // asymmetric binary
                ins = foptoas(n->op, lhs->type, 0);
                break;

        default:
                dump("cgen_floatsse", n);
                fatal("cgen_floatsse %O", n->op);
                return;
        }

        // For symmetric ops the operands may be exchanged so that the
        // left one is not a literal and has the larger ullman number.
        if(symmetric && (lhs->ullman < rhs->ullman || lhs->op == OLITERAL)) {
                swap = lhs;
                lhs = rhs;
                rhs = swap;
        }

        if(lhs->ullman >= rhs->ullman) {
                // left first, via a tempname node, then the right operand
                // through mgen; the op is applied in a register hinted at res.
                tempname(&ltmp, lhs->type);
                cgen(lhs, &ltmp);
                mgen(rhs, &rnode, N);
                regalloc(&lreg, lhs->type, res);
                gmove(&ltmp, &lreg);
                gins(ins, &rnode, &lreg);
                gmove(&lreg, res);
                regfree(&lreg);
                mfree(&rnode);
        } else {
                // right first into a register, then left into another one.
                regalloc(&rnode, rhs->type, res);
                cgen(rhs, &rnode);
                regalloc(&lreg, lhs->type, N);
                cgen(lhs, &lreg);
                gins(ins, &rnode, &lreg);
                regfree(&rnode);
                gmove(&lreg, res);
                regfree(&lreg);
        }
}

// Generate a conditional branch to `to` for the floating-point comparison n.
// With true != 0 the branch is taken when the comparison holds; with
// true == 0 the positive form is generated and jumped around instead.
// likely is forwarded to gbranch (negated where the emitted jump has the
// opposite sense).  The comparison is emitted with SSE instructions when
// use_sse is set and with x87 instructions otherwise.
void
bgen_float(Node *n, int true, int likely, Prog *to)
{
        int op;
        Node *lhs, *rhs, *swap;
        Node lmem, rmem, lreg, rreg, f0, f1, t1, t2, ax;
        Prog *over, *out;

        lhs = n->left;
        rhs = n->right;
        op = n->op;

        if(!true) {
                // brcom is not valid on floats when NaN is involved.
                over = gbranch(AJMP, T, 0);
                out = gbranch(AJMP, T, 0);
                patch(over, pc);
                // No need to avoid re-genning ninit.
                bgen_float(n, 1, -likely, out);
                patch(gbranch(AJMP, T, 0), to);
                patch(out, pc);
                return;
        }

        if(!use_sse) {
                // x87
                op = brrev(op);   // because the args are stacked
                if(op == OGE || op == OGT) {
                        // only < and <= work right with NaN; reverse if needed
                        swap = rhs;
                        rhs = lhs;
                        lhs = swap;
                        op = brrev(op);
                }

                nodreg(&f0, rhs->type, D_F0);
                nodreg(&f1, rhs->type, D_F0 + 1);
                nodreg(&ax, types[TUINT16], D_AX);
                if(simsimtype(rhs->type) == TFLOAT64) {
                        if(lhs->ullman > rhs->ullman) {
                                cgen(lhs, &f0);
                                cgen(rhs, &f0);
                                gins(AFXCHD, &f0, &f1);
                        } else {
                                cgen(rhs, &f0);
                                cgen(lhs, &f0);
                        }
                        gins(AFUCOMIP, &f0, &f1);
                        gins(AFMOVDP, &f0, &f0);      // annoying pop but still better than STSW+SAHF
                } else {
                        // TODO(rsc): The moves back and forth to memory
                        // here are for truncating the value to 32 bits.
                        // This handles 32-bit comparison but presumably
                        // all the other ops have the same problem.
                        // We need to figure out what the right general
                        // solution is, besides telling people to use float64.
                        tempname(&t1, types[TFLOAT32]);
                        tempname(&t2, types[TFLOAT32]);
                        cgen(rhs, &t1);
                        cgen(lhs, &t2);
                        gmove(&t2, &f0);
                        gins(AFCOMFP, &t1, &f0);
                        gins(AFSTSW, N, &ax);
                        gins(ASAHF, N, N);
                }
        } else {
                // SSE: operands that are not addable are first generated
                // into tempname nodes.
                if(!lhs->addable) {
                        tempname(&lmem, lhs->type);
                        cgen(lhs, &lmem);
                        lhs = &lmem;
                }
                if(!rhs->addable) {
                        tempname(&rmem, rhs->type);
                        cgen(rhs, &rmem);
                        rhs = &rmem;
                }

                // the right operand always goes through a fresh register
                regalloc(&rreg, rhs->type, N);
                gmove(rhs, &rreg);
                rhs = &rreg;

                // the left operand only if it is not already a register node
                if(lhs->op != OREGISTER) {
                        regalloc(&lreg, lhs->type, N);
                        gmove(lhs, &lreg);
                        lhs = &lreg;
                }

                if(op == OGE || op == OGT) {
                        // only < and <= work right with NaN; reverse if needed
                        swap = rhs;
                        rhs = lhs;
                        lhs = swap;
                        op = brrev(op);
                }

                gins(foptoas(OCMP, rhs->type, 0), lhs, rhs);
                if(lhs->op == OREGISTER)
                        regfree(lhs);
                regfree(rhs);
        }

        // Both paths join here to emit the conditional jump(s).
        switch(op) {
        case OEQ:
                // neither NE nor P
                over = gbranch(AJNE, T, -likely);
                out = gbranch(AJPS, T, -likely);
                patch(gbranch(AJMP, T, 0), to);
                patch(over, pc);
                patch(out, pc);
                break;

        case ONE:
                // either NE or P
                patch(gbranch(AJNE, T, likely), to);
                patch(gbranch(AJPS, T, likely), to);
                break;

        default:
                patch(gbranch(optoas(op, rhs->type), T, likely), to);
                break;
        }
}

// Called after regopt and peep have run.
// Expand CHECKNIL pseudo-op into actual nil pointer check.
//
// Each ACHECKNIL instruction in the list starting at firstp is rewritten
// in place into a compare, and two newly allocated instructions are linked
// in right after it:
//      CMP arg, $0
//      JNE 2(PC) (likely)
//      MOV AX, 0
void
expandchecks(Prog *firstp)
{
        Prog *chk, *jne, *crash;

        for(chk = firstp; chk != P; chk = chk->link) {
                if(chk->as != ACHECKNIL)
                        continue;
                if(debug_checknil && chk->lineno > 1) // p->lineno==1 in generated wrappers
                        warnl(chk->lineno, "generated nil check");

                jne = mal(sizeof *jne);
                crash = mal(sizeof *crash);
                clearp(jne);
                clearp(crash);

                // splice: chk -> jne -> crash -> old successor of chk
                jne->link = crash;
                crash->link = chk->link;
                chk->link = jne;

                // CMP arg, $0 (reuses the CHECKNIL instruction and its from operand)
                chk->as = ACMPL;
                chk->to.type = D_CONST;
                chk->to.offset = 0;

                // JNE over the crashing store
                jne->lineno = chk->lineno;
                jne->pc = 9999;
                jne->as = AJNE;
                jne->from.type = D_CONST;
                jne->from.offset = 1; // likely
                jne->to.type = D_BRANCH;
                jne->to.u.branch = crash->link;

                // crash by write to memory address 0.
                // if possible, since we know arg is 0, use 0(arg),
                // which will be shorter to encode than plain 0.
                crash->lineno = chk->lineno;
                crash->pc = 9999;
                crash->as = AMOVL;
                crash->from.type = D_AX;
                crash->to.type = regtyp(&chk->from) ? chk->from.type + D_INDIR : D_INDIR+D_NONE;
                crash->to.offset = 0;
        }
}

/* [<][>][^][v][top][bottom][index][help] */