root/ext/pcre/pcrelib/sljit/sljitNativeX86_common.c


DEFINITIONS

This source file includes the following definitions.
  1. sljit_get_platform_name
  2. get_cpu_features
  3. get_jump_code
  4. generate_near_jump_code
  5. sljit_generate_code
  6. emit_save_flags
  7. emit_restore_flags
  8. sljit_grow_stack
  9. emit_mov
  10. sljit_emit_op0
  11. emit_mov_byte
  12. emit_mov_half
  13. emit_unary
  14. emit_not_with_flags
  15. emit_clz
  16. sljit_emit_op1
  17. emit_cum_binary
  18. emit_non_cum_binary
  19. emit_mul
  20. emit_lea_binary
  21. emit_cmp_binary
  22. emit_test_binary
  23. emit_shift
  24. emit_shift_with_flags
  25. sljit_emit_op2
  26. sljit_get_register_index
  27. sljit_get_float_register_index
  28. sljit_emit_op_custom
  29. init_compiler
  30. sljit_is_fpu_available
  31. emit_sse2
  32. emit_sse2_logic
  33. emit_sse2_load
  34. emit_sse2_store
  35. sljit_emit_fop1_convw_fromd
  36. sljit_emit_fop1_convd_fromw
  37. sljit_emit_fop1_cmp
  38. sljit_emit_fop1
  39. sljit_emit_fop2
  40. sljit_emit_label
  41. sljit_emit_jump
  42. sljit_emit_ijump
  43. sljit_emit_op_flags
  44. sljit_get_local_base
  45. sljit_emit_const
  46. sljit_set_jump_addr
  47. sljit_set_const

/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
{
        return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - none (ESP is the stack pointer)
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - none (RSP is the stack pointer)
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From here on, a REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NUMBER_OF_REGISTERS + 2)

static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
        0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
};

#define CHECK_EXTRA_REGS(p, w, do) \
        if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
                w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
                p = SLJIT_MEM1(SLJIT_SP); \
                do; \
        }
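
/* On x86-32, SLJIT_R3..SLJIT_R6 are virtual: the macro above redirects any
   access to them into an SLJIT_MEM1(SLJIT_SP) load/store at an offset
   relative to SLJIT_LOCALS_OFFSET. */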

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2        (SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_REG3        (SLJIT_NUMBER_OF_REGISTERS + 4)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
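
/* A hedged illustration of the quirks above (encodings taken from the
   architecture manuals): with any other base register the memory form needs
   neither extra byte, e.g.
     mov rax, [r14] -> 49 8B 06
   while r12 forces a SIB byte and r13 forces a zero disp8:
     mov rax, [r12] -> 49 8B 04 24
     mov rax, [r13] -> 49 8B 45 00 */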
#ifndef _WIN64
/* 1st passed in rdi, 2nd argument passed in rsi, 3rd in rdx. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
};
/* low-map. reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
};
#else
/* 1st passed in rcx, 2nd argument passed in rdx, 3rd in r8. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
};
/* low-map. reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
};
#endif

#define REX_W           0x48
#define REX_R           0x44
#define REX_X           0x42
#define REX_B           0x41
#define REX             0x40
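
/* A minimal worked example, assuming the standard encodings: "mov r12, rax"
   is REX_W | REX_B (0x49), then MOV_rm_r (0x89), then the ModRM byte
   MOD_REG | (rax_low << 3) | r12_low = 0xc0 | (0 << 3) | 4 = 0xc4,
   giving the byte sequence 49 89 C4. */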

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)          ((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)         ((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
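
/* Most x86-64 instructions sign-extend their 32-bit immediates, so only
   values in [HALFWORD_MIN, HALFWORD_MAX] can be encoded directly; anything
   else (e.g. 0x80000000) must first be loaded with emit_load_imm64. */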

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define TMP_FREG        (0)

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS            0x0010
#define EX86_SHIFT_INS          0x0020
#define EX86_REX                0x0040
#define EX86_NO_REXW            0x0080
#define EX86_BYTE_ARG           0x0100
#define EX86_HALF_ARG           0x0200
#define EX86_PREF_66            0x0400
#define EX86_PREF_F2            0x0800
#define EX86_PREF_F3            0x1000
#define EX86_SSE2_OP1           0x2000
#define EX86_SSE2_OP2           0x4000
#define EX86_SSE2               (EX86_SSE2_OP1 | EX86_SSE2_OP2)

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD             (/* BINARY */ 0 << 3)
#define ADD_EAX_i32     0x05
#define ADD_r_rm        0x03
#define ADD_rm_r        0x01
#define ADDSD_x_xm      0x58
#define ADC             (/* BINARY */ 2 << 3)
#define ADC_EAX_i32     0x15
#define ADC_r_rm        0x13
#define ADC_rm_r        0x11
#define AND             (/* BINARY */ 4 << 3)
#define AND_EAX_i32     0x25
#define AND_r_rm        0x23
#define AND_rm_r        0x21
#define ANDPD_x_xm      0x54
#define BSR_r_rm        (/* GROUP_0F */ 0xbd)
#define CALL_i32        0xe8
#define CALL_rm         (/* GROUP_FF */ 2 << 3)
#define CDQ             0x99
#define CMOVNE_r_rm     (/* GROUP_0F */ 0x45)
#define CMP             (/* BINARY */ 7 << 3)
#define CMP_EAX_i32     0x3d
#define CMP_r_rm        0x3b
#define CMP_rm_r        0x39
#define CVTPD2PS_x_xm   0x5a
#define CVTSI2SD_x_rm   0x2a
#define CVTTSD2SI_r_xm  0x2c
#define DIV             (/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm      0x5e
#define INT3            0xcc
#define IDIV            (/* GROUP_F7 */ 7 << 3)
#define IMUL            (/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm       (/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8    0x6b
#define IMUL_r_rm_i32   0x69
#define JE_i8           0x74
#define JNE_i8          0x75
#define JMP_i8          0xeb
#define JMP_i32         0xe9
#define JMP_rm          (/* GROUP_FF */ 4 << 3)
#define LEA_r_m         0x8d
#define MOV_r_rm        0x8b
#define MOV_r_i32       0xb8
#define MOV_rm_r        0x89
#define MOV_rm_i32      0xc7
#define MOV_rm8_i8      0xc6
#define MOV_rm8_r8      0x88
#define MOVSD_x_xm      0x10
#define MOVSD_xm_x      0x11
#define MOVSXD_r_rm     0x63
#define MOVSX_r_rm8     (/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16    (/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8     (/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16    (/* GROUP_0F */ 0xb7)
#define MUL             (/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm      0x59
#define NEG_rm          (/* GROUP_F7 */ 3 << 3)
#define NOP             0x90
#define NOT_rm          (/* GROUP_F7 */ 2 << 3)
#define OR              (/* BINARY */ 1 << 3)
#define OR_r_rm         0x0b
#define OR_EAX_i32      0x0d
#define OR_rm_r         0x09
#define OR_rm8_r8       0x08
#define POP_r           0x58
#define POP_rm          0x8f
#define POPF            0x9d
#define PUSH_i32        0x68
#define PUSH_r          0x50
#define PUSH_rm         (/* GROUP_FF */ 6 << 3)
#define PUSHF           0x9c
#define RET_near        0xc3
#define RET_i16         0xc2
#define SBB             (/* BINARY */ 3 << 3)
#define SBB_EAX_i32     0x1d
#define SBB_r_rm        0x1b
#define SBB_rm_r        0x19
#define SAR             (/* SHIFT */ 7 << 3)
#define SHL             (/* SHIFT */ 4 << 3)
#define SHR             (/* SHIFT */ 5 << 3)
#define SUB             (/* BINARY */ 5 << 3)
#define SUB_EAX_i32     0x2d
#define SUB_r_rm        0x2b
#define SUB_rm_r        0x29
#define SUBSD_x_xm      0x5c
#define TEST_EAX_i32    0xa9
#define TEST_rm_r       0x85
#define UCOMISD_x_xm    0x2e
#define UNPCKLPD_x_xm   0x14
#define XCHG_EAX_r      0x90
#define XCHG_r_rm       0x87
#define XOR             (/* BINARY */ 6 << 3)
#define XOR_EAX_i32     0x35
#define XOR_r_rm        0x33
#define XOR_rm_r        0x31
#define XORPD_x_xm      0x57

#define GROUP_0F        0x0f
#define GROUP_F7        0xf7
#define GROUP_FF        0xff
#define GROUP_BINARY_81 0x81
#define GROUP_BINARY_83 0x83
#define GROUP_SHIFT_1   0xd1
#define GROUP_SHIFT_N   0xc1
#define GROUP_SHIFT_CL  0xd3

#define MOD_REG         0xc0
#define MOD_DISP8       0x40

#define INC_SIZE(s)                     (*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)                     (*inst++ = (PUSH_r + (r)))
#define POP_REG(r)                      (*inst++ = (POP_r + (r)))
#define RET()                           (*inst++ = (RET_near))
#define RET_I16(n)                      (*inst++ = (RET_i16), *inst++ = n, *inst++ = 0) /* imm16, little-endian; assumes n <= 255 */
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)            (*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
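
/* A small worked example: MOV_RM(0x3, 0, 2) emits the bytes 8B C2, i.e.
   "mov eax, edx" (mod 0x3 selects the register-direct form; 0 and 2 are the
   hardware indexes of EAX and EDX from the tables above). */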

/* These static variables are safe with multithreading, since they store
   built-in CPU features. Even if several threads detect the CPU features
   at the same time and overwrite them concurrently, each thread writes
   the same values. */
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_si cpu_has_sse2 = -1;
#endif
static sljit_si cpu_has_cmov = -1;

#if defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

static void get_cpu_features(void)
{
        sljit_ui features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

        int CPUInfo[4];
        __cpuid(CPUInfo, 1);
        features = (sljit_ui)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

        /* AT&T syntax. */
        __asm__ (
                "movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                /* On x86-32, there is no red zone, so this
                   should work (no need for a local variable). */
                "push %%ebx\n"
#endif
                "cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                "pop %%ebx\n"
#endif
                "movl %%edx, %0\n"
                : "=g" (features)
                :
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                : "%eax", "%ecx", "%edx"
#else
                : "%rax", "%rbx", "%rcx", "%rdx"
#endif
        );

#else /* _MSC_VER && _MSC_VER >= 1400 */

        /* Intel syntax. */
        __asm {
                mov eax, 1
                cpuid
                mov features, edx
        }

#endif /* _MSC_VER && _MSC_VER >= 1400 */

#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
        cpu_has_sse2 = (features >> 26) & 0x1; /* CPUID.1:EDX bit 26 = SSE2. */
#endif
        cpu_has_cmov = (features >> 15) & 0x1; /* CPUID.1:EDX bit 15 = CMOV. */
}

static sljit_ub get_jump_code(sljit_si type)
{
        switch (type) {
        case SLJIT_EQUAL:
        case SLJIT_D_EQUAL:
                return 0x84 /* je */;

        case SLJIT_NOT_EQUAL:
        case SLJIT_D_NOT_EQUAL:
                return 0x85 /* jne */;

        case SLJIT_LESS:
        case SLJIT_D_LESS:
                return 0x82 /* jc */;

        case SLJIT_GREATER_EQUAL:
        case SLJIT_D_GREATER_EQUAL:
                return 0x83 /* jae */;

        case SLJIT_GREATER:
        case SLJIT_D_GREATER:
                return 0x87 /* jnbe */;

        case SLJIT_LESS_EQUAL:
        case SLJIT_D_LESS_EQUAL:
                return 0x86 /* jbe */;

        case SLJIT_SIG_LESS:
                return 0x8c /* jl */;

        case SLJIT_SIG_GREATER_EQUAL:
                return 0x8d /* jnl */;

        case SLJIT_SIG_GREATER:
                return 0x8f /* jnle */;

        case SLJIT_SIG_LESS_EQUAL:
                return 0x8e /* jle */;

        case SLJIT_OVERFLOW:
        case SLJIT_MUL_OVERFLOW:
                return 0x80 /* jo */;

        case SLJIT_NOT_OVERFLOW:
        case SLJIT_MUL_NOT_OVERFLOW:
                return 0x81 /* jno */;

        case SLJIT_D_UNORDERED:
                return 0x8a /* jp */;

        case SLJIT_D_ORDERED:
                return 0x8b /* jpo */;
        }
        return 0;
}

static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
static sljit_ub* generate_fixed_jump(sljit_ub *code_ptr, sljit_sw addr, sljit_si type);
#endif
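
/* generate_near_jump_code emits the jump opcode and reserves room for the
   displacement; the PATCH_MB / PATCH_MW flags tell the fix-up loop in
   sljit_generate_code to fill in the 8-bit or 32-bit offset once every
   label address is known. */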

static sljit_ub* generate_near_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_ub *code, sljit_si type)
{
        sljit_si short_jump;
        sljit_uw label_addr;

        if (jump->flags & JUMP_LABEL)
                label_addr = (sljit_uw)(code + jump->u.label->size);
        else
                label_addr = jump->u.target;
        short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
                return generate_far_jump_code(jump, code_ptr, type);
#endif

        if (type == SLJIT_JUMP) {
                if (short_jump)
                        *code_ptr++ = JMP_i8;
                else
                        *code_ptr++ = JMP_i32;
                jump->addr++;
        }
        else if (type >= SLJIT_FAST_CALL) {
                short_jump = 0;
                *code_ptr++ = CALL_i32;
                jump->addr++;
        }
        else if (short_jump) {
                /* 8-bit Jcc opcodes (0x7x) are the long 0x0F 0x8x forms minus 0x10. */
                *code_ptr++ = get_jump_code(type) - 0x10;
                jump->addr++;
        }
        else {
                *code_ptr++ = GROUP_0F;
                *code_ptr++ = get_jump_code(type);
                jump->addr += 2;
        }

        if (short_jump) {
                jump->flags |= PATCH_MB;
                code_ptr += sizeof(sljit_sb);
        } else {
                jump->flags |= PATCH_MW;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                code_ptr += sizeof(sljit_sw);
#else
                code_ptr += sizeof(sljit_si);
#endif
        }

        return code_ptr;
}

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
        struct sljit_memory_fragment *buf;
        sljit_ub *code;
        sljit_ub *code_ptr;
        sljit_ub *buf_ptr;
        sljit_ub *buf_end;
        sljit_ub len;

        struct sljit_label *label;
        struct sljit_jump *jump;
        struct sljit_const *const_;

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_generate_code(compiler));
        reverse_buf(compiler);

        /* Second code generation pass. */
        code = (sljit_ub*)SLJIT_MALLOC_EXEC(compiler->size);
        PTR_FAIL_WITH_EXEC_IF(code);
        buf = compiler->buf;

        code_ptr = code;
        label = compiler->labels;
        jump = compiler->jumps;
        const_ = compiler->consts;
        do {
                buf_ptr = buf->memory;
                buf_end = buf_ptr + buf->used_size;
                do {
                        len = *buf_ptr++;
                        if (len > 0) {
                                /* The code is already generated. */
                                SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
                                code_ptr += len;
                                buf_ptr += len;
                        }
                        else {
                                if (*buf_ptr >= 4) {
                                        jump->addr = (sljit_uw)code_ptr;
                                        if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
                                                code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
                                        else
                                                code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
                                        jump = jump->next;
                                }
                                else if (*buf_ptr == 0) {
                                        label->addr = (sljit_uw)code_ptr;
                                        label->size = code_ptr - code;
                                        label = label->next;
                                }
                                else if (*buf_ptr == 1) {
                                        const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
                                        const_ = const_->next;
                                }
                                else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                        *code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
                                        buf_ptr++;
                                        *(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
                                        code_ptr += sizeof(sljit_sw);
                                        buf_ptr += sizeof(sljit_sw) - 1;
#else
                                        code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
                                        buf_ptr += sizeof(sljit_sw);
#endif
                                }
                                buf_ptr++;
                        }
                } while (buf_ptr < buf_end);
                SLJIT_ASSERT(buf_ptr == buf_end);
                buf = buf->next;
        } while (buf);

        SLJIT_ASSERT(!label);
        SLJIT_ASSERT(!jump);
        SLJIT_ASSERT(!const_);

        jump = compiler->jumps;
        while (jump) {
                if (jump->flags & PATCH_MB) {
                        SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) <= 127);
                        *(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb)));
                } else if (jump->flags & PATCH_MW) {
                        if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                *(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
                                *(sljit_si*)jump->addr = (sljit_si)(jump->u.label->addr - (jump->addr + sizeof(sljit_si)));
#endif
                        }
                        else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                *(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
                                *(sljit_si*)jump->addr = (sljit_si)(jump->u.target - (jump->addr + sizeof(sljit_si)));
#endif
                        }
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                else if (jump->flags & PATCH_MD)
                        *(sljit_sw*)jump->addr = jump->u.label->addr;
#endif

                jump = jump->next;
        }

        /* Some space may be wasted because of short jumps. */
        SLJIT_ASSERT(code_ptr <= code + compiler->size);
        compiler->error = SLJIT_ERR_COMPILED;
        compiler->executable_size = code_ptr - code;
        return (void*)code;
}

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
        sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w);

static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
        sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w);

static sljit_si emit_mov(struct sljit_compiler *compiler,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw);
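
/* emit_save_flags keeps the status flags in the word at the stack top without
   moving the stack pointer overall: LEA first raises esp/rsp by one word
   (LEA, unlike ADD, does not clobber the flags), then PUSHF lowers it back
   while storing the flags, so [esp] is used as a scratch slot. The matching
   emit_restore_flags reloads them with POPF and the mirrored LEA. */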

static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
{
        sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
        FAIL_IF(!inst);
        INC_SIZE(5);
#else
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
        FAIL_IF(!inst);
        INC_SIZE(6);
        *inst++ = REX_W;
#endif
        *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
        *inst++ = 0x64; /* ModRM 0x64: mod = 1 (disp8), reg = esp/rsp, rm = 4 (SIB follows). */
        *inst++ = 0x24; /* SIB 0x24: base = esp/rsp, no index. */
        *inst++ = (sljit_ub)sizeof(sljit_sw);
        *inst++ = PUSHF;
        compiler->flags_saved = 1;
        return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
{
        sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
        FAIL_IF(!inst);
        INC_SIZE(5);
        *inst++ = POPF;
#else
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
        FAIL_IF(!inst);
        INC_SIZE(6);
        *inst++ = POPF;
        *inst++ = REX_W;
#endif
        *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
        *inst++ = 0x64; /* ModRM 0x64: mod = 1 (disp8), reg = esp/rsp, rm = 4 (SIB follows). */
        *inst++ = 0x24; /* SIB 0x24: base = esp/rsp, no index. */
        *inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
        compiler->flags_saved = keep_flags;
        return SLJIT_SUCCESS;
}

#ifdef _WIN32
#include <malloc.h>

static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
{
        /* Workaround for calling the internal _chkstk() function on Windows.
        This function touches all 4k pages that belong to the requested stack
        space, whose size is passed in local_size. This is necessary on Windows,
        where the stack can only grow in 4k steps. The function just burns CPU
        cycles if the stack is already large enough, but since that cannot be
        known in advance, it must always be called. I think this is a bad design
        in general, even if it has some reasons. */
        *(volatile sljit_si*)alloca(local_size) = 0;
}

#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_si emit_mov(struct sljit_compiler *compiler,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;

        if (dst == SLJIT_UNUSED) {
                /* No destination; no need to set up flags. */
                if (src & SLJIT_MEM) {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
                        FAIL_IF(!inst);
                        *inst = MOV_r_rm;
                }
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(src)) {
                inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
                return SLJIT_SUCCESS;
        }
        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        if (!compiler->mode32) {
                                if (NOT_HALFWORD(srcw))
                                        return emit_load_imm64(compiler, dst, srcw);
                        }
                        else
                                return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
                        FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
                        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_r;
                        return SLJIT_SUCCESS;
                }
#endif
                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
                FAIL_IF(!inst);
                *inst = MOV_r_rm;
                return SLJIT_SUCCESS;
        }

        /* Memory to memory move. Requires two instructions. */
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
        FAIL_IF(!inst);
        *inst = MOV_r_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
        FAIL_IF(!inst);
        *inst = MOV_rm_r;
        return SLJIT_SUCCESS;
}

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
        FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
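
/* Note: EMIT_MOV expands to a FAIL_IF(), so it returns from the enclosing
   function on failure; it must only be used where that early return is safe. */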

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
{
        sljit_ub *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_si size;
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op0(compiler, op));

        switch (GET_OPCODE(op)) {
        case SLJIT_BREAKPOINT:
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = INT3;
                break;
        case SLJIT_NOP:
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = NOP;
                break;
        case SLJIT_LUMUL:
        case SLJIT_LSMUL:
        case SLJIT_LUDIV:
        case SLJIT_LSDIV:
                compiler->flags_saved = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
                SLJIT_COMPILE_ASSERT(
                        reg_map[SLJIT_R0] == 0
                        && reg_map[SLJIT_R1] == 2
                        && reg_map[TMP_REG1] > 7,
                        invalid_register_assignment_for_div_mul);
#else
                SLJIT_COMPILE_ASSERT(
                        reg_map[SLJIT_R0] == 0
                        && reg_map[SLJIT_R1] < 7
                        && reg_map[TMP_REG1] == 2,
                        invalid_register_assignment_for_div_mul);
#endif
                compiler->mode32 = op & SLJIT_INT_OP;
#endif

                op = GET_OPCODE(op);
                if (op == SLJIT_LUDIV) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
                        inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
#else
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
                        FAIL_IF(!inst);
                        *inst = XOR_r_rm;
                }

                if (op == SLJIT_LSDIV) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1);
                        *inst = CDQ;
#else
                        if (compiler->mode32) {
                                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                                FAIL_IF(!inst);
                                INC_SIZE(1);
                                *inst = CDQ;
                        } else {
                                inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
                                FAIL_IF(!inst);
                                INC_SIZE(2);
                                *inst++ = REX_W;
                                *inst = CDQ;
                        }
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
                FAIL_IF(!inst);
                INC_SIZE(2);
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
#else
#ifdef _WIN64
                size = (!compiler->mode32 || op >= SLJIT_LUDIV) ? 3 : 2;
#else
                size = (!compiler->mode32) ? 3 : 2;
#endif
                inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
                FAIL_IF(!inst);
                INC_SIZE(size);
#ifdef _WIN64
                if (!compiler->mode32)
                        *inst++ = REX_W | ((op >= SLJIT_LUDIV) ? REX_B : 0);
                else if (op >= SLJIT_LUDIV)
                        *inst++ = REX_B;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_LUDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
#else
                if (!compiler->mode32)
                        *inst++ = REX_W;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | reg_map[SLJIT_R1];
#endif
#endif
                switch (op) {
                case SLJIT_LUMUL:
                        *inst |= MUL;
                        break;
                case SLJIT_LSMUL:
                        *inst |= IMUL;
                        break;
                case SLJIT_LUDIV:
                        *inst |= DIV;
                        break;
                case SLJIT_LSDIV:
                        *inst |= IDIV;
                        break;
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
                EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#endif
                break;
        }

        return SLJIT_SUCCESS;
}

#define ENCODE_PREFIX(prefix) \
        do { \
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
                FAIL_IF(!inst); \
                INC_SIZE(1); \
                *inst = (prefix); \
        } while (0)

static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;
        sljit_si dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        sljit_si work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
                return SLJIT_SUCCESS; /* Empty instruction. */

        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_i32;
                        return SLJIT_SUCCESS;
#endif
                }
                inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm8_i8;
                return SLJIT_SUCCESS;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                /* Only EAX, ECX, EDX and EBX (reg_map value < 4) have an
                   addressable low byte on x86-32. */
                if (reg_map[src] >= 4) {
                        SLJIT_ASSERT(dst_r == TMP_REG1);
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                } else
                        dst_r = src;
#else
                dst_r = src;
#endif
        }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
                /* src, dst are registers. */
                SLJIT_ASSERT(SLOW_IS_REG(dst));
                if (reg_map[dst] < 4) {
                        if (dst != src)
                                EMIT_MOV(compiler, dst, 0, src, 0);
                        inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
                        FAIL_IF(!inst);
                        *inst++ = GROUP_0F;
                        *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
                }
                else {
                        if (dst != src)
                                EMIT_MOV(compiler, dst, 0, src, 0);
                        if (sign) {
                                /* shl reg, 24 */
                                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
                                FAIL_IF(!inst);
                                *inst |= SHL;
                                /* sar reg, 24 */
                                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
                                FAIL_IF(!inst);
                                *inst |= SAR;
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
                                FAIL_IF(!inst);
                                *(inst + 1) |= AND;
                        }
                }
                return SLJIT_SUCCESS;
        }
#endif
        else {
                /* src is either a memory operand or, on x86-32, a register with reg_map[src] < 4. */
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
        }

        if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (dst_r == TMP_REG1) {
                        /* Find an unused work register whose reg_map value is < 4. */
                        if ((dst & REG_MASK) == SLJIT_R0) {
                                if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
                                        work_r = SLJIT_R2;
                                else
                                        work_r = SLJIT_R1;
                        }
                        else {
                                if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
                                        work_r = SLJIT_R0;
                                else if ((dst & REG_MASK) == SLJIT_R1)
                                        work_r = SLJIT_R2;
                                else
                                        work_r = SLJIT_R1;
                        }

                        if (work_r == SLJIT_R0) {
                                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
                                FAIL_IF(!inst);
                                *inst = XCHG_r_rm;
                        }

                        inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm8_r8;

                        if (work_r == SLJIT_R0) {
                                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
                                FAIL_IF(!inst);
                                *inst = XCHG_r_rm;
                        }
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm8_r8;
                }
#else
                inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm8_r8;
#endif
        }

        return SLJIT_SUCCESS;
}

static sljit_si emit_mov_half(struct sljit_compiler *compiler, sljit_si sign,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;
        sljit_si dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
                return SLJIT_SUCCESS; /* Empty instruction. */

        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_i32;
                        return SLJIT_SUCCESS;
#endif
                }
                inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
                dst_r = src;
        else {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
        }

        if (dst & SLJIT_MEM) {
                inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
        }

        return SLJIT_SUCCESS;
}

static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }
        if (dst == src && dstw == srcw) {
                /* Same input and output */
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }
        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst++ = GROUP_F7;
        *inst |= opcode;
        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

static sljit_si emit_not_with_flags(struct sljit_compiler *compiler,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
                inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst = OR_r_rm;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
                inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
                FAIL_IF(!inst);
                *inst = OR_r_rm;
                return SLJIT_SUCCESS;
        }
        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst++ = GROUP_F7;
        *inst |= NOT_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst = OR_r_rm;
        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

static sljit_si emit_clz(struct sljit_compiler *compiler, sljit_si op_flags,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;
        sljit_si dst_r;

        SLJIT_UNUSED_ARG(op_flags);
        if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
                /* Just set the zero flag. */
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
#else
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
#endif
                FAIL_IF(!inst);
                *inst |= SHR;
                return SLJIT_SUCCESS;
        }

        if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
                src = TMP_REG1;
                srcw = 0;
        }

        inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = BSR_r_rm;
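
        /* BSR sets ZF and leaves its destination undefined when the source is
           zero. dst_r is therefore preloaded with 32 + 31 (64 + 63 in 64-bit
           mode): CMOVNE overwrites it with the BSR result for non-zero
           sources, and the final XOR with 31 (or 63) converts the bit index
           into a leading-zero count while mapping the zero case to 32 (64). */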

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        if (FAST_IS_REG(dst))
                dst_r = dst;
        else {
                /* Find an unused temporary register. */
                if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
                        dst_r = SLJIT_R0;
                else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
                        dst_r = SLJIT_R1;
                else
                        dst_r = SLJIT_R2;
                EMIT_MOV(compiler, dst, dstw, dst_r, 0);
        }
        EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
#else
        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
        compiler->mode32 = 0;
        EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
        compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

        if (cpu_has_cmov == -1)
                get_cpu_features();

        if (cpu_has_cmov) {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = CMOVNE_r_rm;
        } else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
                FAIL_IF(!inst);
                INC_SIZE(4);

                *inst++ = JE_i8;
                *inst++ = 2;
                *inst++ = MOV_r_rm;
                *inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
#else
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
                FAIL_IF(!inst);
                INC_SIZE(5);

                *inst++ = JE_i8;
                *inst++ = 3;
                *inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
                *inst++ = MOV_r_rm;
                *inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
#endif
        }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
#endif
        FAIL_IF(!inst);
        *(inst + 1) |= XOR;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        if (dst & SLJIT_MEM) {
                inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = XCHG_r_rm;
        }
#else
        if (dst & SLJIT_MEM)
                EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
#endif
        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;
        sljit_si update = 0;
        sljit_si op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        sljit_si dst_is_ereg = 0;
        sljit_si src_is_ereg = 0;
#else
#       define src_is_ereg 0
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
        ADJUST_LOCAL_OFFSET(dst, dstw);
        ADJUST_LOCAL_OFFSET(src, srcw);

        CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
        CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

        op = GET_OPCODE(op);
        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                compiler->mode32 = 0;
#endif

                if (op_flags & SLJIT_INT_OP) {
                        if (FAST_IS_REG(src) && src == dst) {
                                if (!TYPE_CAST_NEEDED(op))
                                        return SLJIT_SUCCESS;
                        }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if (op == SLJIT_MOV_SI && (src & SLJIT_MEM))
                                op = SLJIT_MOV_UI;
                        if (op == SLJIT_MOVU_SI && (src & SLJIT_MEM))
                                op = SLJIT_MOVU_UI;
                        if (op == SLJIT_MOV_UI && (src & SLJIT_IMM))
                                op = SLJIT_MOV_SI;
                        if (op == SLJIT_MOVU_UI && (src & SLJIT_IMM))
                                op = SLJIT_MOVU_SI;
#endif
                }

                SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
                if (op >= SLJIT_MOVU) {
                        update = 1;
                        op -= 8;
                }

                if (src & SLJIT_IMM) {
                        switch (op) {
                        case SLJIT_MOV_UB:
                                srcw = (sljit_ub)srcw;
                                break;
                        case SLJIT_MOV_SB:
                                srcw = (sljit_sb)srcw;
                                break;
                        case SLJIT_MOV_UH:
                                srcw = (sljit_uh)srcw;
                                break;
                        case SLJIT_MOV_SH:
                                srcw = (sljit_sh)srcw;
                                break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        case SLJIT_MOV_UI:
                                srcw = (sljit_ui)srcw;
                                break;
                        case SLJIT_MOV_SI:
                                srcw = (sljit_si)srcw;
                                break;
#endif
                        }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        if (SLJIT_UNLIKELY(dst_is_ereg))
                                return emit_mov(compiler, dst, dstw, src, srcw);
#endif
                }

                /* MOVU form: fold the offset into the source base register with
                   LEA, then access the plain [base] address. */
                if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
                        inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                        src &= SLJIT_MEM | 0xf;
                        srcw = 0;
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
                        SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
                        dst = TMP_REG1;
                }
#endif

                switch (op) {
                case SLJIT_MOV:
                case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                case SLJIT_MOV_UI:
                case SLJIT_MOV_SI:
#endif
                        FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_UB:
                        FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_SB:
                        FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_UH:
                        FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_SH:
                        FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
                        break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                case SLJIT_MOV_UI:
                        FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_SI:
                        FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
                        break;
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
                        return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
#endif

                /* MOVU form: update the destination base register with LEA
                   after the store. */
                if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
                        inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                }
                return SLJIT_SUCCESS;
        }

        if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
                compiler->flags_saved = 0;

        switch (op) {
        case SLJIT_NOT:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
                        return emit_not_with_flags(compiler, dst, dstw, src, srcw);
                return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

        case SLJIT_NEG:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
                return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

        case SLJIT_CLZ:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
                return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
        }

        return SLJIT_SUCCESS;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#       undef src_is_ereg
#endif
}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
        if (IS_HALFWORD(immw) || compiler->mode32) { \
                inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
                FAIL_IF(!inst); \
                *(inst + 1) |= (op_imm); \
        } \
        else { \
                FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
                FAIL_IF(!inst); \
                *inst = (op_mr); \
        }

#define BINARY_EAX_IMM(op_eax_imm, immw) \
        FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
        FAIL_IF(!inst); \
        *(inst + 1) |= (op_imm);

#define BINARY_EAX_IMM(op_eax_imm, immw) \
        FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif
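
/* Hedged illustration: BINARY_IMM(ADD, ADD_rm_r, 1000, reg, 0) is expected to
   emit the GROUP_BINARY_81 form (opcode 0x81, imm32), while an immediate in
   [-128, 127] fits the shorter GROUP_BINARY_83 form (0x83, imm8); that choice
   is made inside emit_x86_instruction in the per-architecture files. */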

static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
        sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                return SLJIT_SUCCESS;
        }

        if (dst == src1 && dstw == src1w) {
                if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
                        if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
                                BINARY_EAX_IMM(op_eax_imm, src2w);
                        }
                        else {
                                BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
                        }
                }
                else if (FAST_IS_REG(dst)) {
                        inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                else if (FAST_IS_REG(src2)) {
                        /* Special exception for sljit_emit_op_flags. */
                        inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                else {
                        EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                return SLJIT_SUCCESS;
        }

        /* Only for cumulative (commutative) operations:
           dst = src1 op dst gives the same result as dst = dst op src1. */
        if (dst == src2 && dstw == src2w) {
                if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
                        if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
#endif
                                BINARY_EAX_IMM(op_eax_imm, src1w);
                        }
                        else {
                                BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
                        }
                }
                else if (FAST_IS_REG(dst)) {
                        inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                else if (FAST_IS_REG(src1)) {
                        inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                else {
                        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                return SLJIT_SUCCESS;
        }

        /* General version. */
        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
        }
        else {
                /* This version requires fewer memory writes. */
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        }

        return SLJIT_SUCCESS;
}

static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
        sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                return SLJIT_SUCCESS;
        }

        if (dst == src1 && dstw == src1w) {
                if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
                        if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
                                BINARY_EAX_IMM(op_eax_imm, src2w);
                        }
                        else {
                                BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
                        }
                }
                else if (FAST_IS_REG(dst)) {
                        inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                else if (FAST_IS_REG(src2)) {
                        inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                else {
                        EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                return SLJIT_SUCCESS;
        }

        /* General version. */
        if (FAST_IS_REG(dst) && dst != src2) {
                EMIT_MOV(compiler, dst, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
        }
        else {
                /* This version requires fewer memory writes. */
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        }

        return SLJIT_SUCCESS;
}

static sljit_si emit_mul(struct sljit_compiler *compiler,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;
        sljit_si dst_r;

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        /* Register destination. */
        if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = IMUL_r_rm;
        }
        else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = IMUL_r_rm;
        }
        else if (src1 & SLJIT_IMM) {
                if (src2 & SLJIT_IMM) {
                        EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
                        src2 = dst_r;
                        src2w = 0;
                }

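                /* IMUL r, r/m, imm8 (6B /r ib) when the immediate fits in a
                   signed byte; otherwise the imm32 form (69 /r id). */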
                if (src1w <= 127 && src1w >= -128) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = IMUL_r_rm_i8;
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1);
                        *inst = (sljit_sb)src1w;
                }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                else {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = IMUL_r_rm_i32;
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
                        FAIL_IF(!inst);
                        INC_SIZE(4);
                        *(sljit_sw*)inst = src1w;
                }
#else
                else if (IS_HALFWORD(src1w)) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = IMUL_r_rm_i32;
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
                        FAIL_IF(!inst);
                        INC_SIZE(4);
                        *(sljit_si*)inst = (sljit_si)src1w;
                }
                else {
                        EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
                        if (dst_r != src2)
                                EMIT_MOV(compiler, dst_r, 0, src2, src2w);
                        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
                        FAIL_IF(!inst);
                        *inst++ = GROUP_0F;
                        *inst = IMUL_r_rm;
                }
#endif
        }
        else if (src2 & SLJIT_IMM) {
                /* Note: src1 is NOT immediate. */

                if (src2w <= 127 && src2w >= -128) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
                        FAIL_IF(!inst);
                        *inst = IMUL_r_rm_i8;
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1);
                        *inst = (sljit_sb)src2w;
                }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                else {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
                        FAIL_IF(!inst);
                        *inst = IMUL_r_rm_i32;
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
                        FAIL_IF(!inst);
                        INC_SIZE(4);
                        *(sljit_sw*)inst = src2w;
                }
#else
                else if (IS_HALFWORD(src2w)) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
                        FAIL_IF(!inst);
                        *inst = IMUL_r_rm_i32;
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
                        FAIL_IF(!inst);
                        INC_SIZE(4);
                        *(sljit_si*)inst = (sljit_si)src2w;
                }
                else {
                        EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
                        if (dst_r != src1)
                                EMIT_MOV(compiler, dst_r, 0, src1, src1w);
                        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
                        FAIL_IF(!inst);
                        *inst++ = GROUP_0F;
                        *inst = IMUL_r_rm;
                }
#endif
        }
        else {
                /* Neither argument is immediate. */
                if (ADDRESSING_DEPENDS_ON(src2, dst_r))
                        dst_r = TMP_REG1;
                EMIT_MOV(compiler, dst_r, 0, src1, src1w);
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = IMUL_r_rm;
        }

        if (dst_r == TMP_REG1)
                EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);

        return SLJIT_SUCCESS;
}

static sljit_si emit_lea_binary(struct sljit_compiler *compiler, sljit_si keep_flags,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;
        sljit_si dst_r, done = 0;

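        /* LEA computes the sum without affecting the flags, so it is usable
           when SLJIT_KEEP_FLAGS is set, and it writes dst directly, saving
           a separate mov when dst differs from both sources. */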
        /* These cases are better handled by the normal code path. */
        if (!keep_flags) {
                if (dst == src1 && dstw == src1w)
                        return SLJIT_ERR_UNSUPPORTED;
                if (dst == src2 && dstw == src2w)
                        return SLJIT_ERR_UNSUPPORTED;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if (FAST_IS_REG(src1)) {
                if (FAST_IS_REG(src2)) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                        done = 1;
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
#else
                if (src2 & SLJIT_IMM) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
#endif
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                        done = 1;
                }
        }
        else if (FAST_IS_REG(src2)) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
#else
                if (src1 & SLJIT_IMM) {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
#endif
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                        done = 1;
                }
        }

        if (done) {
                if (dst_r == TMP_REG1)
                        return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
                return SLJIT_SUCCESS;
        }
        return SLJIT_ERR_UNSUPPORTED;
}

static sljit_si emit_cmp_binary(struct sljit_compiler *compiler,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
        if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
#endif
                BINARY_EAX_IMM(CMP_EAX_i32, src2w);
                return SLJIT_SUCCESS;
        }

        if (FAST_IS_REG(src1)) {
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = CMP_r_rm;
                }
                return SLJIT_SUCCESS;
        }

        if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
                inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
                FAIL_IF(!inst);
                *inst = CMP_rm_r;
                return SLJIT_SUCCESS;
        }

        if (src2 & SLJIT_IMM) {
                if (src1 & SLJIT_IMM) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                        src1 = TMP_REG1;
                        src1w = 0;
                }
                BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
        }
        else {
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                FAIL_IF(!inst);
                *inst = CMP_r_rm;
        }
        return SLJIT_SUCCESS;
}

static sljit_si emit_test_binary(struct sljit_compiler *compiler,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
        if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
#endif
                BINARY_EAX_IMM(TEST_EAX_i32, src2w);
                return SLJIT_SUCCESS;
        }

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
        if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
#endif
                BINARY_EAX_IMM(TEST_EAX_i32, src1w);
                return SLJIT_SUCCESS;
        }

        if (FAST_IS_REG(src1)) {
                if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if (IS_HALFWORD(src2w) || compiler->mode32) {
                                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
                                FAIL_IF(!inst);
                                *inst = GROUP_F7;
                        }
                        else {
                                FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
                                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
                                FAIL_IF(!inst);
                                *inst = TEST_rm_r;
                        }
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
                        FAIL_IF(!inst);
                        *inst = GROUP_F7;
#endif
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = TEST_rm_r;
                }
                return SLJIT_SUCCESS;
        }

        if (FAST_IS_REG(src2)) {
                if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if (IS_HALFWORD(src1w) || compiler->mode32) {
                                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
                                FAIL_IF(!inst);
                                *inst = GROUP_F7;
                        }
                        else {
                                FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
                                inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
                                FAIL_IF(!inst);
                                *inst = TEST_rm_r;
                        }
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
                        FAIL_IF(!inst);
                        *inst = GROUP_F7;
#endif
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
                        FAIL_IF(!inst);
                        *inst = TEST_rm_r;
                }
                return SLJIT_SUCCESS;
        }

        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
        if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if (IS_HALFWORD(src2w) || compiler->mode32) {
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
                        FAIL_IF(!inst);
                        *inst = GROUP_F7;
                }
                else {
                        FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
                        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
                        FAIL_IF(!inst);
                        *inst = TEST_rm_r;
                }
#else
                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst = GROUP_F7;
#endif
        }
        else {
                inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                FAIL_IF(!inst);
                *inst = TEST_rm_r;
        }
        return SLJIT_SUCCESS;
}

static sljit_si emit_shift(struct sljit_compiler *compiler,
        sljit_ub mode,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_ub* inst;

        if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
                if (dst == src1 && dstw == src1w) {
                        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
                        FAIL_IF(!inst);
                        *inst |= mode;
                        return SLJIT_SUCCESS;
                }
                if (dst == SLJIT_UNUSED) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
                        FAIL_IF(!inst);
                        *inst |= mode;
                        return SLJIT_SUCCESS;
                }
                if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
                        FAIL_IF(!inst);
                        *inst |= mode;
                        EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
                        return SLJIT_SUCCESS;
                }
                if (FAST_IS_REG(dst)) {
                        EMIT_MOV(compiler, dst, 0, src1, src1w);
                        inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
                        FAIL_IF(!inst);
                        *inst |= mode;
                        return SLJIT_SUCCESS;
                }

                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst |= mode;
                EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
                return SLJIT_SUCCESS;
        }

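        /* Variable shift counts must be in cl (SLJIT_PREF_SHIFT_REG = ecx),
           so the remaining cases first move the count into ecx, saving and
           restoring its previous value when ecx must survive. */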
        if (dst == SLJIT_PREF_SHIFT_REG) {
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst |= mode;
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
        }
        else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
                if (src1 != dst)
                        EMIT_MOV(compiler, dst, 0, src1, src1w);
                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
                FAIL_IF(!inst);
                *inst |= mode;
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
        }
        else {
                /* This case is really difficult, since ecx itself may be
                   used for addressing, and this code must work even then. */
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
#else
                /* [esp+0] contains the flags. */
                EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
#endif
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst |= mode;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
#else
                EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
#endif
                EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        }

        return SLJIT_SUCCESS;
}

static sljit_si emit_shift_with_flags(struct sljit_compiler *compiler,
        sljit_ub mode, sljit_si set_flags,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        /* The CPU does not set flags if the shift count is 0. */
        if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
                        return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
#else
                if ((src2w & 0x1f) != 0)
                        return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
#endif
                if (!set_flags)
                        return emit_mov(compiler, dst, dstw, src1, src1w);
                /* OR dst, src, 0 */
                return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
                        dst, dstw, src1, src1w, SLJIT_IMM, 0);
        }

        if (!set_flags)
                return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);

        if (!FAST_IS_REG(dst))
                FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));

        FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));

        if (FAST_IS_REG(dst))
                return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        CHECK_ERROR();
        CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
        ADJUST_LOCAL_OFFSET(dst, dstw);
        ADJUST_LOCAL_OFFSET(src1, src1w);
        ADJUST_LOCAL_OFFSET(src2, src2w);

        CHECK_EXTRA_REGS(dst, dstw, (void)0);
        CHECK_EXTRA_REGS(src1, src1w, (void)0);
        CHECK_EXTRA_REGS(src2, src2w, (void)0);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = op & SLJIT_INT_OP;
#endif

        if (GET_OPCODE(op) >= SLJIT_MUL) {
                if (SLJIT_UNLIKELY(GET_FLAGS(op)))
                        compiler->flags_saved = 0;
                else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
        }

        switch (GET_OPCODE(op)) {
        case SLJIT_ADD:
                if (!GET_FLAGS(op)) {
                        if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
                                return compiler->error;
                }
                else
                        compiler->flags_saved = 0;
                if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
                return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_ADDC:
                if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
                        FAIL_IF(emit_restore_flags(compiler, 1));
                else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
                        FAIL_IF(emit_save_flags(compiler));
                if (SLJIT_UNLIKELY(GET_FLAGS(op)))
                        compiler->flags_saved = 0;
                return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_SUB:
                if (!GET_FLAGS(op)) {
                        if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
                                return compiler->error;
                }
                else
                        compiler->flags_saved = 0;
                if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
                if (dst == SLJIT_UNUSED)
                        return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
                return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_SUBC:
                if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
                        FAIL_IF(emit_restore_flags(compiler, 1));
                else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
                        FAIL_IF(emit_save_flags(compiler));
                if (SLJIT_UNLIKELY(GET_FLAGS(op)))
                        compiler->flags_saved = 0;
                return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_MUL:
                return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_AND:
                if (dst == SLJIT_UNUSED)
                        return emit_test_binary(compiler, src1, src1w, src2, src2w);
                return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_OR:
                return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_XOR:
                return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_SHL:
                return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_LSHR:
                return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
                        dst, dstw, src1, src1w, src2, src2w);
        case SLJIT_ASHR:
                return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
                        dst, dstw, src1, src1w, src2, src2w);
        }

        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
{
        CHECK_REG_INDEX(check_sljit_get_register_index(reg));
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
                return -1;
#endif
        return reg_map[reg];
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
{
        CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
        return reg;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
        void *instruction, sljit_si size)
{
        sljit_ub *inst;

        CHECK_ERROR();
        CHECK(check_sljit_emit_op_custom(compiler, instruction, size));

        inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
        FAIL_IF(!inst);
        INC_SIZE(size);
        SLJIT_MEMMOVE(inst, instruction, size);
        return SLJIT_SUCCESS;
}

/* --------------------------------------------------------------------- */
/*  Floating point operators                                             */
/* --------------------------------------------------------------------- */

/* Up to 12 bytes of alignment slack + four 16 byte constants. */
static sljit_si sse2_data[3 + (4 + 4) * 2];
static sljit_si *sse2_buffer;

static void init_compiler(void)
{
        sse2_buffer = (sljit_si*)(((sljit_uw)sse2_data + 15) & ~0xf);
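        /* Buffer layout (each constant is a 16 byte aligned xmmword):
             sse2_buffer + 0:  single precision sign bit mask (negation)
             sse2_buffer + 4:  single precision ~sign mask (absolute value)
             sse2_buffer + 8:  double precision sign bit mask
             sse2_buffer + 12: double precision ~sign mask */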
        /* Single precision constants. */
        sse2_buffer[0] = 0x80000000;
        sse2_buffer[4] = 0x7fffffff;
        /* Double precision constants. */
        sse2_buffer[8] = 0;
        sse2_buffer[9] = 0x80000000;
        sse2_buffer[12] = 0xffffffff;
        sse2_buffer[13] = 0x7fffffff;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
{
#ifdef SLJIT_IS_FPU_AVAILABLE
        return SLJIT_IS_FPU_AVAILABLE;
#elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
        if (cpu_has_sse2 == -1)
                get_cpu_features();
        return cpu_has_sse2;
#else /* SLJIT_DETECT_SSE2 */
        return 1;
#endif /* SLJIT_DETECT_SSE2 */
}

static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
        sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
{
        sljit_ub *inst;

        inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = opcode;
        return SLJIT_SUCCESS;
}

static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
        sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
{
        sljit_ub *inst;

        inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = opcode;
        return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
        sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
{
        return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
}

static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
        sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
{
        return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
}

static SLJIT_INLINE sljit_si sljit_emit_fop1_convw_fromd(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_si dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
        sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (GET_OPCODE(op) == SLJIT_CONVW_FROMD)
                compiler->mode32 = 0;
#endif

        inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = CVTTSD2SI_r_xm;

        if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
                return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_si sljit_emit_fop1_convd_fromw(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_si dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
        sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (GET_OPCODE(op) == SLJIT_CONVD_FROMW)
                compiler->mode32 = 0;
#endif

        if (src & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if (GET_OPCODE(op) == SLJIT_CONVD_FROMI)
                        srcw = (sljit_si)srcw;
#endif
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                src = TMP_REG1;
                srcw = 0;
        }

        inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = CVTSI2SD_x_rm;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 1;
#endif
        if (dst_r == TMP_FREG)
                return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
        return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_si sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_si op,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        compiler->flags_saved = 0;
        if (!FAST_IS_REG(src1)) {
                FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
                src1 = TMP_FREG;
        }
        return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), src1, src2, src2w);
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_si dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 1;
#endif

        CHECK_ERROR();
        SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);

        if (GET_OPCODE(op) == SLJIT_DMOV) {
                if (FAST_IS_REG(dst))
                        return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
                if (FAST_IS_REG(src))
                        return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
                FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
                return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
        }

        if (GET_OPCODE(op) == SLJIT_CONVD_FROMS) {
                dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
                if (FAST_IS_REG(src)) {
                        /* We overwrite the high bits of the source register. From the
                           SLJIT point of view this is not an issue.
                           Note: In SSE3, MOVDDUP and MOVSLDUP could also be used. */
                        FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_SINGLE_OP, src, src, 0));
                }
                else {
                        FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_SINGLE_OP), TMP_FREG, src, srcw));
                        src = TMP_FREG;
                }

                FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_SINGLE_OP, dst_r, src, 0));
                if (dst_r == TMP_FREG)
                        return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
                return SLJIT_SUCCESS;
        }

        if (SLOW_IS_REG(dst)) {
                dst_r = dst;
                if (dst != src)
                        FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
        }
        else {
                dst_r = TMP_FREG;
                FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
        }

        switch (GET_OPCODE(op)) {
        case SLJIT_DNEG:
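                /* Flip the sign bit: XORPD with the sign mask prepared in init_compiler. */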
                FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
                break;

        case SLJIT_DABS:
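                /* Clear the sign bit: ANDPD with the ~sign mask prepared in init_compiler. */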
                FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
                break;
        }

        if (dst_r == TMP_FREG)
                return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w)
{
        sljit_si dst_r;

        CHECK_ERROR();
        CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
        ADJUST_LOCAL_OFFSET(dst, dstw);
        ADJUST_LOCAL_OFFSET(src1, src1w);
        ADJUST_LOCAL_OFFSET(src2, src2w);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 1;
#endif

        if (FAST_IS_REG(dst)) {
                dst_r = dst;
                if (dst == src1)
                        ; /* Do nothing here. */
                else if (dst == src2 && (op == SLJIT_DADD || op == SLJIT_DMUL)) {
                        /* Swap arguments. */
                        src2 = src1;
                        src2w = src1w;
                }
                else if (dst != src2)
                        FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src1, src1w));
                else {
                        dst_r = TMP_FREG;
                        FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
                }
        }
        else {
                dst_r = TMP_FREG;
                FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
        }

        switch (GET_OPCODE(op)) {
        case SLJIT_DADD:
                FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
                break;

        case SLJIT_DSUB:
                FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
                break;

        case SLJIT_DMUL:
                FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
                break;

        case SLJIT_DDIV:
                FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
                break;
        }

        if (dst_r == TMP_FREG)
                return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
        return SLJIT_SUCCESS;
}

/* --------------------------------------------------------------------- */
/*  Conditional instructions                                             */
/* --------------------------------------------------------------------- */

SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
{
        sljit_ub *inst;
        struct sljit_label *label;

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_emit_label(compiler));

        /* We should restore the flags before the label,
           since other incoming jumps may have their own flag state. */
        if (SLJIT_UNLIKELY(compiler->flags_saved))
                PTR_FAIL_IF(emit_restore_flags(compiler, 0));

        if (compiler->last_label && compiler->last_label->size == compiler->size)
                return compiler->last_label;

        label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
        PTR_FAIL_IF(!label);
        set_label(label, compiler);

        inst = (sljit_ub*)ensure_buf(compiler, 2);
        PTR_FAIL_IF(!inst);

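        /* Two byte marker in the instruction stream: a zero length byte
           followed by a record code (0 = label, 1 = const, jump type + 4),
           resolved later by sljit_generate_code. */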
        *inst++ = 0;
        *inst++ = 0;

        return label;
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
{
        sljit_ub *inst;
        struct sljit_jump *jump;

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_emit_jump(compiler, type));

        if (SLJIT_UNLIKELY(compiler->flags_saved)) {
                if ((type & 0xff) <= SLJIT_JUMP)
                        PTR_FAIL_IF(emit_restore_flags(compiler, 0));
                compiler->flags_saved = 0;
        }

        jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
        PTR_FAIL_IF_NULL(jump);
        set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
        type &= 0xff;

        if (type >= SLJIT_CALL1)
                PTR_FAIL_IF(call_with_args(compiler, type));

        /* Worst case size. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
#else
        compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
#endif
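        /* 32 bit: jmp rel32 (5 bytes) or jcc rel32 (6 bytes). 64 bit:
           mov r64, imm64 (10 bytes) plus an indirect jump (3 bytes), with
           2 more bytes in the conditional case. */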

        inst = (sljit_ub*)ensure_buf(compiler, 2);
        PTR_FAIL_IF_NULL(inst);

        *inst++ = 0;
        *inst++ = type + 4;
        return jump;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
{
        sljit_ub *inst;
        struct sljit_jump *jump;

        CHECK_ERROR();
        CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
        ADJUST_LOCAL_OFFSET(src, srcw);

        CHECK_EXTRA_REGS(src, srcw, (void)0);

        if (SLJIT_UNLIKELY(compiler->flags_saved)) {
                if (type <= SLJIT_JUMP)
                        FAIL_IF(emit_restore_flags(compiler, 0));
                compiler->flags_saved = 0;
        }

        if (type >= SLJIT_CALL1) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
                if (src == SLJIT_R2) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                        src = TMP_REG1;
                }
                if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
                        srcw += sizeof(sljit_sw);
#endif
#endif
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
                if (src == SLJIT_R2) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                        src = TMP_REG1;
                }
#endif
                FAIL_IF(call_with_args(compiler, type));
        }

        if (src == SLJIT_IMM) {
                jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
                FAIL_IF_NULL(jump);
                set_jump(jump, compiler, JUMP_ADDR);
                jump->u.target = srcw;

                /* Worst case size. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                compiler->size += 5;
#else
                compiler->size += 10 + 3;
#endif

                inst = (sljit_ub*)ensure_buf(compiler, 2);
                FAIL_IF_NULL(inst);

                *inst++ = 0;
                *inst++ = type + 4;
        }
        else {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                /* REX_W is not necessary (src is not immediate). */
                compiler->mode32 = 1;
#endif
                inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_FF;
                *inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
        }
        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw,
        sljit_si type)
{
        sljit_ub *inst;
        sljit_ub cond_set = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_si reg;
#else
        /* CHECK_EXTRA_REGS might overwrite these values. */
        sljit_si dst_save = dst;
        sljit_sw dstw_save = dstw;
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
        SLJIT_UNUSED_ARG(srcw);

        if (dst == SLJIT_UNUSED)
                return SLJIT_SUCCESS;

        ADJUST_LOCAL_OFFSET(dst, dstw);
        CHECK_EXTRA_REGS(dst, dstw, (void)0);
        if (SLJIT_UNLIKELY(compiler->flags_saved))
                FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));

        type &= 0xff;
        /* setcc = jcc + 0x10. */
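        /* E.g. JE (0F 84) becomes SETE (0F 94). */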
        cond_set = get_jump_code(type) + 0x10;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
                FAIL_IF(!inst);
                INC_SIZE(4 + 3);
                /* Set low register to conditional flag. */
                *inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
                *inst++ = GROUP_0F;
                *inst++ = cond_set;
                *inst++ = MOD_REG | reg_lmap[TMP_REG1];
                *inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
                *inst++ = OR_rm8_r8;
                *inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
                return SLJIT_SUCCESS;
        }

        reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;

        inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
        FAIL_IF(!inst);
        INC_SIZE(4 + 4);
        /* Set low register to conditional flag. */
        *inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
        *inst++ = GROUP_0F;
        *inst++ = cond_set;
        *inst++ = MOD_REG | reg_lmap[reg];
        *inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
        *inst++ = GROUP_0F;
        *inst++ = MOVZX_r_rm8;
        *inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];

        if (reg != TMP_REG1)
                return SLJIT_SUCCESS;

        if (GET_OPCODE(op) < SLJIT_ADD) {
                compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
                return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
        }
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
                || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
        compiler->skip_checks = 1;
#endif
        return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
#else /* SLJIT_CONFIG_X86_64 */
        if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
                if (reg_map[dst] <= 4) {
                        /* Low byte is accessible. */
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
                        FAIL_IF(!inst);
                        INC_SIZE(3 + 3);
                        /* Set low byte to conditional flag. */
                        *inst++ = GROUP_0F;
                        *inst++ = cond_set;
                        *inst++ = MOD_REG | reg_map[dst];

                        *inst++ = GROUP_0F;
                        *inst++ = MOVZX_r_rm8;
                        *inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
                        return SLJIT_SUCCESS;
                }

                /* Low byte is not accessible. */
                if (cpu_has_cmov == -1)
                        get_cpu_features();

                if (cpu_has_cmov) {
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
                        /* An "xor reg, reg" would overwrite the flags, hence the mov. */
                        EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);

                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
                        FAIL_IF(!inst);
                        INC_SIZE(3);

                        *inst++ = GROUP_0F;
                        /* cmovcc = setcc - 0x50. */
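                        /* E.g. SETE (0F 94) becomes CMOVE (0F 44). */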
                        *inst++ = cond_set - 0x50;
                        *inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
                        return SLJIT_SUCCESS;
                }

                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1 + 3 + 3 + 1);
                *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                /* Set al to conditional flag. */
                *inst++ = GROUP_0F;
                *inst++ = cond_set;
                *inst++ = MOD_REG | 0 /* eax */;

                *inst++ = GROUP_0F;
                *inst++ = MOVZX_r_rm8;
                *inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
                *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                return SLJIT_SUCCESS;
        }

        if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
                SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
                if (dst != SLJIT_R0) {
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1 + 3 + 2 + 1);
                        /* Set low register to conditional flag. */
                        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                        *inst++ = GROUP_0F;
                        *inst++ = cond_set;
                        *inst++ = MOD_REG | 0 /* eax */;
                        *inst++ = OR_rm8_r8;
                        *inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
                        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                }
                else {
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
                        FAIL_IF(!inst);
                        INC_SIZE(2 + 3 + 2 + 2);
                        /* Set low register to conditional flag. */
                        *inst++ = XCHG_r_rm;
                        *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
                        *inst++ = GROUP_0F;
                        *inst++ = cond_set;
                        *inst++ = MOD_REG | 1 /* ecx */;
                        *inst++ = OR_rm8_r8;
                        *inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
                        *inst++ = XCHG_r_rm;
                        *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
                }
                return SLJIT_SUCCESS;
        }

        /* Set TMP_REG1 to the bit. */
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
        FAIL_IF(!inst);
        INC_SIZE(1 + 3 + 3 + 1);
        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
        /* Set al to conditional flag. */
        *inst++ = GROUP_0F;
        *inst++ = cond_set;
        *inst++ = MOD_REG | 0 /* eax */;

        *inst++ = GROUP_0F;
        *inst++ = MOVZX_r_rm8;
        *inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;

        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];

        if (GET_OPCODE(op) < SLJIT_ADD)
                return emit_mov(compiler, dst, dstw, TMP_REG1, 0);

#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
                || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
        compiler->skip_checks = 1;
#endif
        return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_64 */
}

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
{
        CHECK_ERROR();
        CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
        ADJUST_LOCAL_OFFSET(dst, dstw);

        CHECK_EXTRA_REGS(dst, dstw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (NOT_HALFWORD(offset)) {
                FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
                SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
                return compiler->error;
#else
                return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
#endif
        }
#endif

        if (offset != 0)
                return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
        return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
{
        sljit_ub *inst;
        struct sljit_const *const_;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_si reg;
#endif

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
        ADJUST_LOCAL_OFFSET(dst, dstw);

        CHECK_EXTRA_REGS(dst, dstw, (void)0);

        const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
        PTR_FAIL_IF(!const_);
        set_const(const_, compiler);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
        reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;

        if (emit_load_imm64(compiler, reg, init_value))
                return NULL;
#else
        if (dst == SLJIT_UNUSED)
                dst = TMP_REG1;

        if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
                return NULL;
#endif

        inst = (sljit_ub*)ensure_buf(compiler, 2);
        PTR_FAIL_IF(!inst);

        *inst++ = 0;
        *inst++ = 1;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (dst & SLJIT_MEM)
                if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
                        return NULL;
#endif

        return const_;
}

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
{
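        /* On x86-32 the stored value is a rel32 displacement, relative to
           the end of the 4 byte field; on x86-64 the target is an absolute
           address loaded by a mov r64, imm64 and is patched in place. */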
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        *(sljit_sw*)addr = new_addr - (addr + 4);
#else
        *(sljit_uw*)addr = new_addr;
#endif
}

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
{
        *(sljit_sw*)addr = new_constant;
}
