/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is [Open Source Virtual Machine].
*
* The Initial Developer of the Original Code is
* Adobe System Incorporated.
* Portions created by the Initial Developer are Copyright (C) 2004-2007
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Adobe AS3 Team
* Vladimir Vukicevic <vladimir@pobox.com>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nanojit.h"
#ifdef UNDER_CE
#include <cmnintrin.h>
extern "C" bool blx_lr_broken();
#endif
#if defined(FEATURE_NANOJIT) && defined(NANOJIT_ARM)
namespace nanojit
{
#ifdef NJ_VERBOSE
const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","fp","ip","sp","lr","pc",
"d0","d1","d2","d3","d4","d5","d6","d7","s14"};
const char* condNames[] = {"eq","ne","cs","cc","mi","pl","vs","vc","hi","ls","ge","lt","gt","le",""/*al*/,"nv"};
const char* shiftNames[] = { "lsl", "lsl", "lsr", "lsr", "asr", "asr", "ror", "ror" };
#endif
const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
const Register Assembler::retRegs[] = { R0, R1 };
const Register Assembler::savedRegs[] = { R4, R5, R6, R7, R8, R9, R10 };
// --------------------------------
// ARM-specific utility functions.
// --------------------------------
#ifdef DEBUG
// Return true if enc is a valid Operand 2 encoding and thus can be used as-is
// in an ARM arithmetic operation that accepts such encoding.
//
// This utility does not know (or determine) the actual value that the encoded
// value represents, and thus cannot be used to ensure the correct operation of
// encOp2Imm, but it does ensure that the encoded value can be used to encode a
// valid ARM instruction. decOp2Imm can be used if you also need to check that
// a literal is correctly encoded (and thus that encOp2Imm is working
// correctly).
inline bool
Assembler::isOp2Imm(uint32_t enc)
{
return ((enc & 0xfff) == enc);
}
// Decodes operand 2 immediate values (for debug output and assertions).
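// As a worked example of the decoding: 0x4ff (imm8=0xff, rotate field 4,
// i.e. a rotate-right of 8) decodes to 0xff000000, and 0x0ff decodes to
// 0x000000ff.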
inline uint32_t
Assembler::decOp2Imm(uint32_t enc)
{
NanoAssert(isOp2Imm(enc));
uint32_t imm8 = enc & 0xff;
uint32_t rot = 32 - ((enc >> 7) & 0x1e);
return imm8 << (rot & 0x1f);
}
#endif
// Calculate the number of leading zeroes in data.
inline uint32_t
Assembler::CountLeadingZeroes(uint32_t data)
{
uint32_t leading_zeroes;
// We can't do CLZ on anything earlier than ARMv5. Architectures as early
// as that aren't supported, but assert that we aren't running on one
// anyway.
// If ARMv4 support is required in the future for some reason, we can do a
// run-time check on config.arch and fall back to the C routine, but for
// now we can avoid the cost of the check as we don't intend to support
// ARMv4 anyway.
NanoAssert(ARM_ARCH >= 5);
#if defined(__ARMCC__)
// ARMCC can do this with an intrinsic.
leading_zeroes = __clz(data);
// The current Android GCC compiler incorrectly refuses to compile 'clz' for armv5
// (even though this is a legal instruction there). Since we currently only compile for ARMv5
// for emulation, we don't care too much (but we DO care for ARMv6+ since those are "real"
// devices).
#elif defined(__GNUC__) && !(defined(ANDROID) && __ARM_ARCH__ <= 5)
// GCC can use inline assembler to insert a CLZ instruction.
__asm (
" clz %0, %1 \n"
: "=r" (leading_zeroes)
: "r" (data)
);
#elif defined(WINCE)
// WinCE can do this with an intrinsic.
leading_zeroes = _CountLeadingZeros(data);
#else
// Other platforms must fall back to a C routine. This won't be as
// efficient as the CLZ instruction, but it is functional.
uint32_t try_shift;
leading_zeroes = 0;
// This loop does a bisection search rather than the obvious rotation loop.
// This should be faster, though it will still be no match for CLZ.
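// As a worked example, for data = 0x00001234 the loop proceeds:
//   try_shift=16: (data<<16)>>16 == data, so leading_zeroes = 16
//   try_shift=8:  shifting by 24 loses bits, no change
//   try_shift=4:  shifting by 20 loses bits, no change
//   try_shift=2:  (data<<18)>>18 == data, so leading_zeroes = 18
//   try_shift=1:  (data<<19)>>19 == data, so leading_zeroes = 19
// which matches CLZ (the highest set bit of 0x00001234 is bit 12).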
for (try_shift = 16; try_shift != 0; try_shift /= 2) {
uint32_t shift = leading_zeroes + try_shift;
if (((data << shift) >> shift) == data) {
leading_zeroes = shift;
}
}
#endif
// Assert that the operation worked!
NanoAssert(((0xffffffff >> leading_zeroes) & data) == data);
return leading_zeroes;
}
// The ARM instruction set allows some flexibility to the second operand of
// most arithmetic operations. When operand 2 is an immediate value, it takes
// the form of an 8-bit value rotated by an even value in the range 0-30.
//
// Some values that can be encoded using this scheme (such as 0xf000000f) are
// probably fairly rare in practice and require extra code to detect, so this
// function implements a fast CLZ-based heuristic to detect any value that can
// be encoded using just a shift, and not a full rotation. For example,
// 0xff000000 and 0x000000ff are both detected, but 0xf000000f is not.
//
// This function will return true to indicate that the encoding was successful,
// or false to indicate that the literal could not be encoded as an operand 2
// immediate. If successful, the encoded value will be written to *enc.
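// As a worked example, 0x00ab0000 has 8 leading zeroes, so rot (left) is
// 24 - 8 = 16 and imm8 is 0xab; the resulting encoding is 0x8ab, which
// decOp2Imm maps back to 0xab rotated right by 16, i.e. 0x00ab0000.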
inline bool
Assembler::encOp2Imm(uint32_t literal, uint32_t * enc)
{
// The number of leading zeroes in the literal. This is used to calculate
// the rotation component of the encoding.
uint32_t leading_zeroes;
// Components of the operand 2 encoding.
int32_t rot;
uint32_t imm8;
// Check the literal to see if it is a simple 8-bit value. I suspect that
// most literals are in fact small values, so doing this check early should
// give a decent speed-up.
if (literal < 256)
{
*enc = literal;
return true;
}
// Determine the number of leading zeroes in the literal. This is used to
// calculate the required rotation.
leading_zeroes = CountLeadingZeroes(literal);
// We've already done a check to see if the literal is an 8-bit value, so
// leading_zeroes must be less than (and not equal to) (32-8)=24. However,
// if it is 24 or greater, this algorithm will break, so debug code
// should use an assertion here to check that we have a value that we
// expect.
NanoAssert(leading_zeroes < 24);
// Assuming that we have a field of no more than 8 bits for a valid
// literal, we can calculate the required rotation by subtracting
// leading_zeroes from (32-8):
//
// Example:
// 0: Known to be zero.
// 1: Known to be one.
// X: Either zero or one.
// .: Zero in a valid operand 2 literal.
//
// Literal: [ 1XXXXXXX ........ ........ ........ ]
// leading_zeroes = 0
// Therefore rot (left) = 24.
// Encoded 8-bit literal: [ 1XXXXXXX ]
//
// Literal: [ ........ ..1XXXXX XX...... ........ ]
// leading_zeroes = 10
// Therefore rot (left) = 14.
// Encoded 8-bit literal: [ 1XXXXXXX ]
//
// Note, however, that we can only encode even shifts, and so
// "rot=24-leading_zeroes" is not sufficient by itself. By ignoring
// zero-bits in odd bit positions, we can ensure that we get a valid
// encoding.
//
// Example:
// Literal: [ 01XXXXXX ........ ........ ........ ]
// leading_zeroes = 1
// Therefore rot (left) = round_up(23) = 24.
// Encoded 8-bit literal: [ 01XXXXXX ]
rot = 24 - (leading_zeroes & ~1);
// The imm8 component of the operand 2 encoding can be calculated from the
// rot value.
imm8 = literal >> rot;
// The validity of the literal can be checked by reversing the
// calculation. It is much easier to decode the immediate than it is to
// encode it!
if (literal != (imm8 << rot)) {
// The encoding is not valid, so report the failure. Calling code
// should use some other method of loading the value (such as LDR).
return false;
}
// The operand is valid, so encode it.
// Note that the ARM encoding is actually described by a rotate to the
// _right_, so rot must be negated here. Calculating a left shift (rather
// than calculating a right rotation) simplifies the above code.
*enc = ((-rot << 7) & 0xf00) | imm8;
// Assert that the operand was properly encoded.
NanoAssert(decOp2Imm(*enc) == literal);
return true;
}
// Encode "rd = rn + imm" using an appropriate instruction sequence.
// Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
// (The declaration in NativeARM.h defines the default value of stat as 0.)
//
// It is not valid to call this function if:
// (rd == IP) AND (rn == IP) AND !encOp2Imm(imm) AND !encOp2Imm(-imm)
// Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
// encOp2Imm method.
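// For example (in execution order; remember that code is emitted backwards):
//   asm_add_imm(R0, R1, 0x104) emits ADD R0, R1, #0x104
//   asm_add_imm(R0, R1, -1) emits SUB R0, R1, #1
//   asm_add_imm(R0, R1, 0x12345678) emits a load of the constant into IP
//   (via asm_ld_imm) followed by ADD R0, R1, IP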
void
Assembler::asm_add_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
{
// Operand 2 encoding of the immediate.
uint32_t op2imm;
NanoAssert(IsGpReg(rd));
NanoAssert(IsGpReg(rn));
NanoAssert((stat & 1) == stat);
// Try to encode the value directly as an operand 2 immediate value, then
// fall back to loading the value into a register.
if (encOp2Imm(imm, &op2imm)) {
ADDis(rd, rn, op2imm, stat);
} else if (encOp2Imm(-imm, &op2imm)) {
// We could not encode the value for ADD, so try to encode it for SUB.
// Note that this is valid even if stat is set, _unless_ imm is 0, but
// that case is caught above.
NanoAssert(imm != 0);
SUBis(rd, rn, op2imm, stat);
} else {
// We couldn't encode the value directly, so use an intermediate
// register to encode the value. We will use IP to do this unless rn is
// IP; in that case we can reuse rd. This allows every case other than
// "ADD IP, IP, =#imm".
Register rm = (rn == IP) ? (rd) : (IP);
NanoAssert(rn != rm);
ADDs(rd, rn, rm, stat);
asm_ld_imm(rm, imm);
}
}
// Encode "rd = rn - imm" using an appropriate instruction sequence.
// Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
// (The declaration in NativeARM.h defines the default value of stat as 0.)
//
// It is not valid to call this function if:
// (rd == IP) AND (rn == IP) AND !encOp2Imm(imm) AND !encOp2Imm(-imm)
// Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
// encOp2Imm method.
void
Assembler::asm_sub_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
{
// Operand 2 encoding of the immediate.
uint32_t op2imm;
NanoAssert(IsGpReg(rd));
NanoAssert(IsGpReg(rn));
NanoAssert((stat & 1) == stat);
// Try to encode the value directly as an operand 2 immediate value, then
// fall back to loading the value into a register.
if (encOp2Imm(imm, &op2imm)) {
SUBis(rd, rn, op2imm, stat);
} else if (encOp2Imm(-imm, &op2imm)) {
// We could not encode the value for SUB, so try to encode it for ADD.
// Note that this is valid even if stat is set, _unless_ imm is 0, but
// that case is caught above.
NanoAssert(imm != 0);
ADDis(rd, rn, op2imm, stat);
} else {
// We couldn't encode the value directly, so use an intermediate
// register to encode the value. We will use IP to do this unless rn is
// IP; in that case we can reuse rd. This allows every case other than
// "SUB IP, IP, =#imm".
Register rm = (rn == IP) ? (rd) : (IP);
NanoAssert(rn != rm);
SUBs(rd, rn, rm, stat);
asm_ld_imm(rm, imm);
}
}
// Encode "rd = rn & imm" using an appropriate instruction sequence.
// Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
// (The declaration in NativeARM.h defines the default value of stat as 0.)
//
// It is not valid to call this function if:
// (rd == IP) AND (rn == IP) AND !encOp2Imm(imm) AND !encOp2Imm(~imm)
// Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
// encOp2Imm method.
void
Assembler::asm_and_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
{
// Operand 2 encoding of the immediate.
uint32_t op2imm;
NanoAssert(IsGpReg(rd));
NanoAssert(IsGpReg(rn));
NanoAssert((stat & 1) == stat);
// Try to encode the value directly as an operand 2 immediate value, then
// fall back to loading the value into a register.
if (encOp2Imm(imm, &op2imm)) {
ANDis(rd, rn, op2imm, stat);
} else if (encOp2Imm(~imm, &op2imm)) {
// Use BIC with the inverted immediate.
BICis(rd, rn, op2imm, stat);
} else {
// We couldn't encode the value directly, so use an intermediate
// register to encode the value. We will use IP to do this unless rn is
// IP; in that case we can reuse rd. This allows every case other than
// "AND IP, IP, =#imm".
Register rm = (rn == IP) ? (rd) : (IP);
NanoAssert(rn != rm);
ANDs(rd, rn, rm, stat);
asm_ld_imm(rm, imm);
}
}
// Encode "rd = rn | imm" using an appropriate instruction sequence.
// Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
// (The declaration in NativeARM.h defines the default value of stat as 0.)
//
// It is not valid to call this function if:
// (rd == IP) AND (rn == IP) AND !encOp2Imm(imm)
// Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
// encOp2Imm method.
void
Assembler::asm_orr_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
{
// Operand 2 encoding of the immediate.
uint32_t op2imm;
NanoAssert(IsGpReg(rd));
NanoAssert(IsGpReg(rn));
NanoAssert((stat & 1) == stat);
// Try to encode the value directly as an operand 2 immediate value, then
// fall back to loading the value into a register.
if (encOp2Imm(imm, &op2imm)) {
ORRis(rd, rn, op2imm, stat);
} else {
// We couldn't encode the value directly, so use an intermediate
// register to encode the value. We will use IP to do this unless rn is
// IP; in that case we can reuse rd. This allows every case other than
// "ORR IP, IP, =#imm".
Register rm = (rn == IP) ? (rd) : (IP);
NanoAssert(rn != rm);
ORRs(rd, rn, rm, stat);
asm_ld_imm(rm, imm);
}
}
// Encode "rd = rn ^ imm" using an appropriate instruction sequence.
// Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
// (The declaration in NativeARM.h defines the default value of stat as 0.)
//
// It is not valid to call this function if:
// (rd == IP) AND (rn == IP) AND !encOp2Imm(imm)
// Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
// encOp2Imm method.
void
Assembler::asm_eor_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
{
// Operand 2 encoding of the immediate.
uint32_t op2imm;
NanoAssert(IsGpReg(rd));
NanoAssert(IsGpReg(rn));
NanoAssert((stat & 1) == stat);
// Try to encode the value directly as an operand 2 immediate value, then
// fall back to loading the value into a register.
if (encOp2Imm(imm, &op2imm)) {
EORis(rd, rn, op2imm, stat);
} else {
// We couldn't encode the value directly, so use an intermediate
// register to encode the value. We will use IP to do this unless rn is
// IP; in that case we can reuse rd. This allows every case other than
// "EOR IP, IP, =#imm".
Register rm = (rn == IP) ? (rd) : (IP);
NanoAssert(rn != rm);
EORs(rd, rn, rm, stat);
asm_ld_imm(rm, imm);
}
}
// --------------------------------
// Assembler functions.
// --------------------------------
void
Assembler::nInit(AvmCore*)
{
#ifdef UNDER_CE
blx_lr_bug = blx_lr_broken();
#else
blx_lr_bug = 0;
#endif
}
void Assembler::nBeginAssembly()
{
max_out_args = 0;
}
NIns*
Assembler::genPrologue()
{
/**
* Prologue
*/
// NJ_RESV_OFFSET is space at the top of the stack for us
// to use for parameter passing (8 bytes at the moment)
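// For example (assuming 4-byte stack slots and 8-byte NJ_ALIGN_STACK): with
// max_out_args == 8 and _activation.tos == 3, stackNeeded is 8 + 12 = 20;
// adding the 8 bytes pushed for FP and LR gives 28, which aligns up to 32,
// so 24 bytes are subtracted from SP below.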
uint32_t stackNeeded = max_out_args + STACK_GRANULARITY * _activation.tos;
uint32_t savingCount = 2;
uint32_t savingMask = rmask(FP) | rmask(LR);
// so for alignment purposes we've pushed return addr and fp
uint32_t stackPushed = STACK_GRANULARITY * savingCount;
uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
int32_t amt = aligned - stackPushed;
// Make room on stack for what we are doing
if (amt)
asm_sub_imm(SP, SP, amt);
verbose_only( asm_output("## %p:",(void*)_nIns); )
verbose_only( asm_output("## patch entry"); )
NIns *patchEntry = _nIns;
MOV(FP, SP);
PUSH_mask(savingMask);
return patchEntry;
}
void
Assembler::nFragExit(LInsp guard)
{
SideExit * exit = guard->record()->exit;
Fragment * frag = exit->target;
bool target_is_known = frag && frag->fragEntry;
if (target_is_known) {
// The target exists so we can simply emit a branch to its location.
JMP_far(frag->fragEntry);
} else {
// The target doesn't exist yet, so emit a jump to the epilogue. If the
// target is created later on, the jump will be patched.
GuardRecord *gr = guard->record();
if (!_epilogue)
_epilogue = genEpilogue();
// Jump to the epilogue. This may get patched later, but JMP_far always
// emits two instructions even when only one is required, so patching
// will work correctly.
JMP_far(_epilogue);
// In the future you may want to move this further down so that we can
// overwrite the r0 guard record load during a patch to a different
// fragment with some assumed input-register state. Not today though.
gr->jmp = _nIns;
// NB: this is a workaround for the fact that, by patching a
// fragment-exit jump, we could be changing the *meaning* of the R0
// register we're passing to the jump target. If we jump to the
// epilogue, ideally R0 means "return value when exiting fragment".
// If we patch this to jump to another fragment however, R0 means
// "incoming 0th parameter". This is just a quirk of ARM ABI. So
// we compromise by passing "return value" to the epilogue in IP,
// not R0, and have the epilogue MOV(R0, IP) first thing.
asm_ld_imm(IP, int(gr));
}
#ifdef NJ_VERBOSE
if (config.show_stats) {
// load R1 with Fragment *fromFrag, target fragment
// will make use of this when calling fragenter().
int fromfrag = int((Fragment*)_thisfrag);
asm_ld_imm(argRegs[1], fromfrag);
}
#endif
// profiling for the exit
verbose_only(
if (_logc->lcbits & LC_FragProfile) {
asm_inc_m32( &guard->record()->profCount );
}
)
// Pop the stack frame.
MOV(SP, FP);
}
NIns*
Assembler::genEpilogue()
{
// On ARMv5+, loading directly to PC correctly handles interworking.
// Note that we don't support anything older than ARMv5.
NanoAssert(ARM_ARCH >= 5);
RegisterMask savingMask = rmask(FP) | rmask(PC);
POP_mask(savingMask); // regs
// NB: this is the latter half of the dual-nature patchable exit branch
// workaround noted above in nFragExit. IP has the "return value"
// incoming, we need to move it to R0.
MOV(R0, IP);
return _nIns;
}
/*
* asm_arg will encode the specified argument according to the current ABI, and
* will update r and stkd as appropriate so that the next argument can be
* encoded.
*
* Linux has used ARM's EABI for some time. Windows CE uses the legacy ABI.
*
* Under EABI:
* - doubles are 64-bit aligned both in registers and on the stack.
* If the next available argument register is R1, it is skipped
* and the double is placed in R2:R3. If R0:R1 or R2:R3 are not
* available, the double is placed on the stack, 64-bit aligned.
* - 32-bit arguments are placed in registers and 32-bit aligned
* on the stack.
*
* Under legacy ABI:
* - doubles are placed in subsequent arg registers; if the next
* available register is r3, the low order word goes into r3
* and the high order goes on the stack.
* - 32-bit arguments are placed in the next available arg register,
* - both doubles and 32-bit arguments are placed on stack with 32-bit
* alignment.
*/
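// As a rough example, for a call taking (int a, double b, int c):
//  - under EABI: a goes in R0, R1 is skipped, b occupies R2:R3, and c is
//    placed on the stack at [SP, #0];
//  - under the legacy ABI: a goes in R0, b occupies R1:R2, and c goes in R3.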
void
Assembler::asm_arg(ArgSize sz, LInsp arg, Register& r, int& stkd)
{
// The stack pointer must always be at least aligned to 4 bytes.
NanoAssert((stkd & 3) == 0);
if (sz == ARGSIZE_F) {
// This task is fairly complex and so is delegated to asm_arg_64.
asm_arg_64(arg, r, stkd);
} else if (sz & ARGSIZE_MASK_INT) {
// pre-assign registers R0-R3 for arguments (if they fit)
if (r < R4) {
asm_regarg(sz, arg, r);
r = nextreg(r);
} else {
asm_stkarg(arg, stkd);
stkd += 4;
}
} else {
NanoAssert(sz == ARGSIZE_Q);
// shouldn't have 64 bit int params on ARM
NanoAssert(false);
}
}
// Encode a 64-bit floating-point argument using the appropriate ABI.
// This function operates in the same way as asm_arg, except that it will only
// handle arguments where (ArgSize)sz == ARGSIZE_F.
void
Assembler::asm_arg_64(LInsp arg, Register& r, int& stkd)
{
// The stack pointer must always be at least aligned to 4 bytes.
NanoAssert((stkd & 3) == 0);
// The only use for this function when we are using soft floating-point
// is for LIR_qjoin.
NanoAssert(ARM_VFP || arg->isop(LIR_qjoin));
Register fp_reg = UnknownReg;
if (ARM_VFP) {
fp_reg = findRegFor(arg, FpRegs);
NanoAssert(isKnownReg(fp_reg));
}
#ifdef NJ_ARM_EABI
// EABI requires that 64-bit arguments are aligned on even-numbered
// registers, as R0:R1 or R2:R3. If the register base is at an
// odd-numbered register, advance it. Note that this will push r past
// R3 if r is R3 to start with, and will force the argument to go on
// the stack.
if ((r == R1) || (r == R3)) {
r = nextreg(r);
}
#endif
if (r < R3) {
Register ra = r;
Register rb = nextreg(r);
r = nextreg(rb);
#ifdef NJ_ARM_EABI
// EABI requires that 64-bit arguments are aligned on even-numbered
// registers, as R0:R1 or R2:R3.
NanoAssert( ((ra == R0) && (rb == R1)) || ((ra == R2) && (rb == R3)) );
#endif
// Put the argument in ra and rb. If the argument is in a VFP register,
// use FMRRD to move it to ra and rb. Otherwise, let asm_regarg deal
// with the argument as if it were two 32-bit arguments.
if (ARM_VFP) {
FMRRD(ra, rb, fp_reg);
} else {
asm_regarg(ARGSIZE_LO, arg->oprnd1(), ra);
asm_regarg(ARGSIZE_LO, arg->oprnd2(), rb);
}
#ifndef NJ_ARM_EABI
} else if (r == R3) {
// We only have one register left, but the legacy ABI requires that we
// put 32 bits of the argument in the register (R3) and the remaining
// 32 bits on the stack.
Register ra = r;
r = nextreg(r);
// This really just checks that nextreg() works properly, as we know
// that r was previously R3.
NanoAssert(r == R4);
// We're splitting the argument between registers and the stack. This
// must be the first time that the stack is used, so stkd must be at 0.
NanoAssert(stkd == 0);
if (ARM_VFP) {
// TODO: We could optimize this to store directly from
// the VFP register to memory using "FMRRD ra, fp_reg[31:0]" and
// "STR fp_reg[63:32], [SP, #stkd]".
// Load from the floating-point register as usual, but use IP
// as a swap register.
STR(IP, SP, 0);
stkd += 4;
FMRRD(ra, IP, fp_reg);
} else {
// Without VFP, we can simply use asm_regarg and asm_stkarg to
// encode the two 32-bit words as we don't need to load from a VFP
// register.
asm_regarg(ARGSIZE_LO, arg->oprnd1(), ra);
asm_stkarg(arg->oprnd2(), 0);
stkd += 4;
}
#endif
} else {
// The argument won't fit in registers, so pass on to asm_stkarg.
#ifdef NJ_ARM_EABI
// EABI requires that 64-bit arguments are 64-bit aligned.
if ((stkd & 7) != 0) {
// stkd will always be aligned to at least 4 bytes; this was
// asserted on entry to this function.
stkd += 4;
}
#endif
asm_stkarg(arg, stkd);
stkd += 8;
}
}
void
Assembler::asm_regarg(ArgSize sz, LInsp p, Register r)
{
NanoAssert(isKnownReg(r));
if (sz & ARGSIZE_MASK_INT)
{
// arg goes in specific register
if (p->isconst()) {
asm_ld_imm(r, p->imm32());
} else {
if (p->isUsed()) {
if (!p->hasKnownReg()) {
// load it into the arg reg
int d = findMemFor(p);
if (p->isop(LIR_alloc)) {
asm_add_imm(r, FP, d, 0);
} else {
LDR(r, FP, d);
}
} else {
// it must be in a saved reg
MOV(r, p->getReg());
}
}
else {
// this is the last use, so fine to assign it
// to the scratch reg, it's dead after this point.
findSpecificRegFor(p, r);
}
}
}
else if (sz == ARGSIZE_Q) {
// 64 bit integer argument - should never happen on ARM
NanoAssert(false);
}
else
{
NanoAssert(sz == ARGSIZE_F);
// fpu argument in register - should never happen since FPU
// args are converted to two 32-bit ints on ARM
NanoAssert(false);
}
}
void
Assembler::asm_stkarg(LInsp arg, int stkd)
{
bool isQuad = arg->isQuad();
Register rr;
if (arg->isUsed() && (rr = arg->getReg(), isKnownReg(rr))) {
// The argument resides somewhere in registers, so we simply need to
// push it onto the stack.
if (!ARM_VFP || !isQuad) {
NanoAssert(IsGpReg(rr));
STR(rr, SP, stkd);
} else {
// According to the comments in asm_arg_64, LIR_qjoin
// can have a 64-bit argument even if VFP is disabled. However,
// asm_arg_64 will split the argument and issue two 32-bit
// arguments to asm_stkarg so we can ignore that case here and
// assert that we will never get 64-bit arguments unless VFP is
// available.
NanoAssert(ARM_VFP);
NanoAssert(IsFpReg(rr));
#ifdef NJ_ARM_EABI
// EABI requires that 64-bit arguments are 64-bit aligned.
NanoAssert((stkd & 7) == 0);
#endif
FSTD(rr, SP, stkd);
}
} else {
// The argument does not reside in registers, so we need to get some
// memory for it and then copy it onto the stack.
int d = findMemFor(arg);
if (!isQuad) {
STR(IP, SP, stkd);
if (arg->isop(LIR_alloc)) {
asm_add_imm(IP, FP, d);
} else {
LDR(IP, FP, d);
}
} else {
#ifdef NJ_ARM_EABI
// EABI requires that 64-bit arguments are 64-bit aligned.
NanoAssert((stkd & 7) == 0);
#endif
STR(IP, SP, stkd+4);
LDR(IP, FP, d+4);
STR(IP, SP, stkd);
LDR(IP, FP, d);
}
}
}
void
Assembler::asm_call(LInsp ins)
{
if (ARM_VFP && ins->isop(LIR_fcall)) {
/* Because ARM actually returns the result in (R0,R1), and not in a
* floating point register, the code to move the result into a correct
* register is below. We do nothing here.
*
* The reason being that if we did something here, the final code
* sequence we'd get would be something like:
* MOV {R0-R3},params [from below]
* BL function [from below]
* MOV {R0-R3},spilled data [from evictScratchRegs()]
* MOV Dx,{R0,R1} [from here]
* which is clearly broken.
*
* This is not a problem for non-floating point calls, because the
* restoring of spilled data into R0 is done via a call to
* prepResultReg(R0) in the other branch of this if-then-else,
* meaning that evictScratchRegs() will not modify R0. However,
* prepResultReg is not aware of the concept of using a register pair
* (R0,R1) for the result of a single operation, so it can only be
* used here with the ultimate VFP register, and not R0/R1, which
* potentially allows for R0/R1 to get corrupted as described.
*/
} else {
prepResultReg(ins, rmask(retRegs[0]));
}
// Do this after we've handled the call result, so we don't
// force the call result to be spilled unnecessarily.
evictScratchRegs();
const CallInfo* call = ins->callInfo();
ArgSize sizes[MAXARGS];
uint32_t argc = call->get_sizes(sizes);
bool indirect = call->isIndirect();
// If we aren't using VFP, assert that the LIR operation is an integer
// function call.
NanoAssert(ARM_VFP || ins->isop(LIR_icall));
// If we're using VFP, and the return type is a double, it'll come back in
// R0/R1. We need to either place it in the result fp reg, or store it.
// See comments above for more details as to why this is necessary here
// for floating point calls, but not for integer calls.
if (ARM_VFP && ins->isUsed()) {
// Determine the size (and type) of the instruction result.
ArgSize rsize = (ArgSize)(call->_argtypes & ARGSIZE_MASK_ANY);
// If the result size is a floating-point value, treat the result
// specially, as described previously.
if (rsize == ARGSIZE_F) {
Register rr = ins->getReg();
NanoAssert(ins->opcode() == LIR_fcall);
if (!isKnownReg(rr)) {
int d = disp(ins);
NanoAssert(d != 0);
freeRsrcOf(ins, false);
// The result doesn't have a register allocated, so store the
// result (in R0,R1) directly to its stack slot.
STR(R0, FP, d+0);
STR(R1, FP, d+4);
} else {
NanoAssert(IsFpReg(rr));
// Copy the result to the (VFP) result register.
prepResultReg(ins, rmask(rr));
FMDRR(rr, R0, R1);
}
}
}
// Emit the branch.
if (!indirect) {
verbose_only(if (_logc->lcbits & LC_Assembly)
outputf(" %p:", _nIns);
)
// Direct call: on v5 and above (where the calling sequence doesn't
// corrupt LR until the actual branch instruction), we can avoid an
// interlock in the "long" branch sequence by manually loading the
// target address into LR ourselves before setting up the parameters
// in other registers.
BranchWithLink((NIns*)call->_address);
} else {
// Indirect call: we assign the address arg to LR since it's not
// used for regular arguments, and is otherwise scratch since it's
// clobberred by the call. On v4/v4T, where we have to manually do
// the equivalent of a BLX, move LR into IP before corrupting LR
// with the return address.
if (blx_lr_bug) {
// workaround for msft device emulator bug (blx lr emulated as no-op)
underrunProtect(8);
BLX(IP);
MOV(IP,LR);
} else {
BLX(LR);
}
asm_regarg(ARGSIZE_LO, ins->arg(--argc), LR);
}
// Encode the arguments, starting at R0 and with an empty argument stack.
Register r = R0;
int stkd = 0;
// Iterate through the argument list and encode each argument according to
// the ABI.
// Note that we loop through the arguments backwards as LIR specifies them
// in reverse order.
uint32_t i = argc;
while(i--) {
asm_arg(sizes[i], ins->arg(i), r, stkd);
}
if (stkd > max_out_args) {
max_out_args = stkd;
}
}
Register
Assembler::nRegisterAllocFromSet(RegisterMask set)
{
NanoAssert(set != 0);
// The CountLeadingZeroes function will use the CLZ instruction where
// available. In other cases, it will fall back to a (slower) C
// implementation.
Register r = (Register)(31-CountLeadingZeroes(set));
_allocator.free &= ~rmask(r);
NanoAssert(IsGpReg(r) || IsFpReg(r));
NanoAssert((rmask(r) & set) == rmask(r));
return r;
}
void
Assembler::nRegisterResetAll(RegAlloc& a)
{
// add scratch registers to our free list for the allocator
a.clear();
a.free =
rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) |
rmask(R5) | rmask(R6) | rmask(R7) | rmask(R8) | rmask(R9) |
rmask(R10) | rmask(LR);
if (ARM_VFP)
a.free |= FpRegs;
debug_only(a.managed = a.free);
}
static inline ConditionCode
get_cc(NIns *ins)
{
return ConditionCode((*ins >> 28) & 0xF);
}
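// Return true if the instruction is a B-form branch (bits [27:25] == 0b101).
// Note that bit 24 is not examined, so BL would also match, but the
// patchable branches inspected here are always plain B instructions.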
static inline bool
branch_is_B(NIns* branch)
{
return (*branch & 0x0E000000) == 0x0A000000;
}
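// Return true if the instruction is "LDR PC, [PC, #+/-imm]" (any condition).
// The U bit (offset sign) is deliberately left out of the mask.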
static inline bool
branch_is_LDR_PC(NIns* branch)
{
return (*branch & 0x0F7FF000) == 0x051FF000;
}
// Is this an instruction of the form ldr/str reg, [fp, #-imm] ?
static inline bool
is_ldstr_reg_fp_minus_imm(/*OUT*/uint32_t* isLoad, /*OUT*/uint32_t* rX,
/*OUT*/uint32_t* immX, NIns i1)
{
if ((i1 & 0xFFEF0000) != 0xE50B0000)
return false;
*isLoad = (i1 >> 20) & 1;
*rX = (i1 >> 12) & 0xF;
*immX = i1 & 0xFFF;
return true;
}
// Is this an instruction of the form ldmdb/stmdb fp, regset ?
static inline bool
is_ldstmdb_fp(/*OUT*/uint32_t* isLoad, /*OUT*/uint32_t* regSet, NIns i1)
{
if ((i1 & 0xFFEF0000) != 0xE90B0000)
return false;
*isLoad = (i1 >> 20) & 1;
*regSet = i1 & 0xFFFF;
return true;
}
// Make an instruction of the form ldmdb/stmdb fp, regset
static inline NIns
mk_ldstmdb_fp(uint32_t isLoad, uint32_t regSet)
{
return 0xE90B0000 | (regSet & 0xFFFF) | ((isLoad & 1) << 20);
}
// Compute the number of 1 bits in the lowest 16 bits of regSet
static inline uint32_t
size_of_regSet(uint32_t regSet)
{
uint32_t x = regSet;
x = (x & 0x5555) + ((x >> 1) & 0x5555);
x = (x & 0x3333) + ((x >> 2) & 0x3333);
x = (x & 0x0F0F) + ((x >> 4) & 0x0F0F);
x = (x & 0x00FF) + ((x >> 8) & 0x00FF);
return x;
}
// See if two ARM instructions, i1 and i2, can be combined into one
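// For example, the pair
//   0xE50B4008  str r4, [fp, #-8]
//   0xE50B5004  str r5, [fp, #-4]
// merges into
//   0xE90B0030  stmdb fp, {r4,r5}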
static bool
do_peep_2_1(/*OUT*/NIns* merged, NIns i1, NIns i2)
{
uint32_t rX, rY, immX, immY, isLoadX, isLoadY, regSet;
/* ld/str rX, [fp, #-8]
ld/str rY, [fp, #-4]
==>
ld/stmdb fp, {rX, rY}
when
X < Y and X != fp and Y != fp and X != 15 and Y != 15
*/
if (is_ldstr_reg_fp_minus_imm(&isLoadX, &rX, &immX, i1) &&
is_ldstr_reg_fp_minus_imm(&isLoadY, &rY, &immY, i2) &&
immX == 8 && immY == 4 && rX < rY &&
isLoadX == isLoadY &&
rX != FP && rY != FP &&
rX != 15 && rY != 15) {
*merged = mk_ldstmdb_fp(isLoadX, (1 << rX) | (1<<rY));
return true;
}
/* ld/str rX, [fp, #-N]
ld/stmdb fp, regset
==>
ld/stmdb fp, union(regset,{rX})
when
regset is nonempty
X < all elements of regset
N == 4 * (1 + card(regset))
X != fp and X != 15
*/
if (is_ldstr_reg_fp_minus_imm(&isLoadX, &rX, &immX, i1) &&
is_ldstmdb_fp(&isLoadY, &regSet, i2) &&
regSet != 0 &&
(regSet & ((1 << (rX + 1)) - 1)) == 0 &&
immX == 4 * (1 + size_of_regSet(regSet)) &&
isLoadX == isLoadY &&
rX != FP && rX != 15) {
*merged = mk_ldstmdb_fp(isLoadX, regSet | (1 << rX));
return true;
}
return false;
}
// Determine whether or not it's safe to look at _nIns[1].
// Necessary condition for safe peepholing with do_peep_2_1.
static inline bool
does_next_instruction_exist(NIns* _nIns, NIns* codeStart, NIns* codeEnd,
NIns* exitStart, NIns* exitEnd)
{
return (exitStart <= _nIns && _nIns+1 < exitEnd) ||
(codeStart <= _nIns && _nIns+1 < codeEnd);
}
void
Assembler::nPatchBranch(NIns* branch, NIns* target)
{
// Patch the jump in a loop
//
// There are two feasible cases here, the first of which has 2 sub-cases:
//
// (1) We are patching a patchable unconditional jump emitted by
// JMP_far. All possible encodings we may be looking at involve
// 2 words, though we *may* have to change from 1 word to 2 or
// vice versa.
//
// 1a: B ±32MB ; BKPT
// 1b: LDR PC [PC, #-4] ; $imm
//
// (2) We are patching a patchable conditional jump emitted by
// B_cond_chk. Short conditional jumps are non-patchable, so we
// won't have one here; will only ever have an instruction of the
// following form:
//
// LDRcc PC [PC, #lit] ...
//
// We don't actually know whether the lit-address is in the
// constant pool or in-line of the instruction stream, following
// the insn (with a jump over it), and we don't need to: for our
// purposes here both placements look the same.
//
// For purposes of handling our patching task, we group cases 1b and 2
// together, and handle case 1a on its own as it might require expanding
// from a short-jump to a long-jump.
//
// We do not handle contracting from a long-jump to a short-jump, though
// this is a possible future optimisation for case 1b. For now it seems
// not worth the trouble.
//
if (branch_is_B(branch)) {
// Case 1a
// A short B branch, must be unconditional.
NanoAssert(get_cc(branch) == AL);
int32_t offset = PC_OFFSET_FROM(target, branch);
if (isS24(offset>>2)) {
// We can preserve the existing form, just rewrite its offset.
NIns cond = *branch & 0xF0000000;
*branch = (NIns)( cond | (0xA<<24) | ((offset>>2) & 0xFFFFFF) );
} else {
// We need to expand the existing branch to a long jump.
// make sure the next instruction is a dummy BKPT
NanoAssert(*(branch+1) == BKPT_insn);
// Set the branch instruction to LDRcc pc, [pc, #-4]
NIns cond = *branch & 0xF0000000;
*branch++ = (NIns)( cond | (0x51<<20) | (PC<<16) | (PC<<12) | (4));
*branch++ = (NIns)target;
}
} else {
// Case 1b & 2
// Not a B branch, must be LDR, might be any kind of condition.
NanoAssert(branch_is_LDR_PC(branch));
NIns *addr = branch+2;
int offset = (*branch & 0xFFF) / sizeof(NIns);
if (*branch & (1<<23)) {
addr += offset;
} else {
addr -= offset;
}
// Just redirect the jump target, leave the insn alone.
*addr = (NIns) target;
}
}
RegisterMask
Assembler::hint(LIns* i, RegisterMask allow /* = ~0 */)
{
uint32_t op = i->opcode();
int prefer = ~0;
if (op==LIR_icall)
prefer = rmask(R0);
else if (op == LIR_callh)
prefer = rmask(R1);
else if (op == LIR_param) {
if (i->paramArg() < 4)
prefer = rmask(argRegs[i->paramArg()]);
}
if (_allocator.free & allow & prefer)
allow &= prefer;
return allow;
}
void
Assembler::asm_qjoin(LIns *ins)
{
int d = findMemFor(ins);
NanoAssert(d);
LIns* lo = ins->oprnd1();
LIns* hi = ins->oprnd2();
Register r = findRegFor(hi, GpRegs);
STR(r, FP, d+4);
// okay if r gets recycled.
r = findRegFor(lo, GpRegs);
STR(r, FP, d);
freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
}
void
Assembler::asm_store32(LOpcode op, LIns *value, int dr, LIns *base)
{
switch (op) {
case LIR_sti:
// handled by mainline code below for now
break;
case LIR_stb:
case LIR_sts:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
return;
}
Register ra, rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
ra = findRegFor(value, GpRegs);
} else {
findRegFor2(GpRegs, value, ra, base, rb);
}
if (!isS12(dr)) {
STR(ra, IP, 0);
asm_add_imm(IP, rb, dr);
} else {
STR(ra, rb, dr);
}
}
void
Assembler::asm_restore(LInsp i, Register r)
{
if (i->isop(LIR_alloc)) {
asm_add_imm(r, FP, disp(i));
} else if (i->isconst()) {
if (!i->getArIndex()) {
i->markAsClear();
}
asm_ld_imm(r, i->imm32());
}
else {
// We can't easily load immediate values directly into FP registers, so
// ensure that memory is allocated for the constant and load it from
// memory.
int d = findMemFor(i);
if (ARM_VFP && IsFpReg(r)) {
if (isS8(d >> 2)) {
FLDD(r, FP, d);
} else {
FLDD(r, IP, 0);
asm_add_imm(IP, FP, d);
}
} else {
NIns merged;
LDR(r, FP, d);
// See if we can merge this load into an immediately following
// one, by creating or extending an LDM instruction.
if (/* is it safe to poke _nIns[1] ? */
does_next_instruction_exist(_nIns, codeStart, codeEnd,
exitStart, exitEnd)
&& /* can we merge _nIns[0] into _nIns[1] ? */
do_peep_2_1(&merged, _nIns[0], _nIns[1])) {
_nIns[1] = merged;
_nIns++;
verbose_only( asm_output("merge next into LDMDB"); )
}
}
}
}
void
Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
{
(void) pop;
(void) quad;
if (d) {
if (ARM_VFP && IsFpReg(rr)) {
if (isS8(d >> 2)) {
FSTD(rr, FP, d);
} else {
FSTD(rr, IP, 0);
asm_add_imm(IP, FP, d);
}
} else {
NIns merged;
STR(rr, FP, d);
// See if we can merge this store into an immediately following
// one, by creating or extending an STM instruction.
if (/* is it safe to poke _nIns[1] ? */
does_next_instruction_exist(_nIns, codeStart, codeEnd,
exitStart, exitEnd)
&& /* can we merge _nIns[0] into _nIns[1] ? */
do_peep_2_1(&merged, _nIns[0], _nIns[1])) {
_nIns[1] = merged;
_nIns++;
verbose_only( asm_output("merge next into STMDB"); )
}
}
}
}
void
Assembler::asm_load64(LInsp ins)
{
//asm_output("<<< load64");
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
// handled by mainline code below for now
break;
case LIR_ld32f:
case LIR_ldc32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
return;
}
NanoAssert(ins->isQuad());
LIns* base = ins->oprnd1();
int offset = ins->disp();
Register rr = ins->getReg();
int d = disp(ins);
Register rb = findRegFor(base, GpRegs);
NanoAssert(IsGpReg(rb));
freeRsrcOf(ins, false);
//outputf("--- load64: Finished register allocation.");
if (ARM_VFP && isKnownReg(rr)) {
// VFP is enabled and the result will go into a register.
NanoAssert(IsFpReg(rr));
if (!isS8(offset >> 2) || (offset&3) != 0) {
FLDD(rr,IP,0);
asm_add_imm(IP, rb, offset);
} else {
FLDD(rr,rb,offset);
}
} else {
// Either VFP is not available or the result needs to go into memory;
// in either case, VFP instructions are not required. Note that the
// result will never be loaded into registers if VFP is not available.
NanoAssert(!isKnownReg(rr));
NanoAssert(d != 0);
// Check that the offset is 8-byte (64-bit) aligned.
NanoAssert((d & 0x7) == 0);
// *(uint64_t*)(FP+d) = *(uint64_t*)(rb+offset)
asm_mmq(FP, d, rb, offset);
}
//asm_output(">>> load64");
}
void
Assembler::asm_store64(LOpcode op, LInsp value, int dr, LInsp base)
{
//asm_output("<<< store64 (dr: %d)", dr);
switch (op) {
case LIR_stqi:
// handled by mainline code below for now
break;
case LIR_st32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
return;
}
if (ARM_VFP) {
Register rb = findRegFor(base, GpRegs);
if (value->isconstq()) {
underrunProtect(LD32_size*2 + 8);
// XXX use another reg, get rid of dependency
STR(IP, rb, dr);
asm_ld_imm(IP, value->imm64_0(), false);
STR(IP, rb, dr+4);
asm_ld_imm(IP, value->imm64_1(), false);
return;
}
Register rv = findRegFor(value, FpRegs);
NanoAssert(isKnownReg(rb));
NanoAssert(isKnownReg(rv));
Register baseReg = rb;
intptr_t baseOffset = dr;
if (!isS8(dr)) {
baseReg = IP;
baseOffset = 0;
}
FSTD(rv, baseReg, baseOffset);
if (!isS8(dr)) {
asm_add_imm(IP, rb, dr);
}
// if it's a constant, make sure our baseReg/baseOffset location
// has the right value
if (value->isconstq()) {
underrunProtect(4*4);
asm_quad_nochk(rv, value->imm64_0(), value->imm64_1());
}
} else {
int da = findMemFor(value);
Register rb = findRegFor(base, GpRegs);
// *(uint64_t*)(rb+dr) = *(uint64_t*)(FP+da)
asm_mmq(rb, dr, FP, da);
}
//asm_output(">>> store64");
}
// Stick a quad into register rr, where imm64_0 and imm64_1 are the two
// 32-bit halves of the quad.
void
Assembler::asm_quad_nochk(Register rr, int32_t imm64_0, int32_t imm64_1)
{
// We're not going to use a slot, because it might be too far
// away. Instead, we're going to stick a branch in the stream to
// jump over the constants, and then load from a short PC relative
// offset.
// stream should look like:
// branch A
// imm64_0
// imm64_1
// A: FLDD PC-16
FLDD(rr, PC, -16);
*(--_nIns) = (NIns) imm64_1;
*(--_nIns) = (NIns) imm64_0;
B_nochk(_nIns+2);
}
void
Assembler::asm_quad(LInsp ins)
{
//asm_output(">>> asm_quad");
int d = disp(ins);
Register rr = ins->getReg();
freeRsrcOf(ins, false);
if (ARM_VFP && isKnownReg(rr))
{
asm_spill(rr, d, false, true);
underrunProtect(4*4);
asm_quad_nochk(rr, ins->imm64_0(), ins->imm64_1());
} else {
NanoAssert(d);
// asm_mmq might spill a reg, so don't call it;
// instead do the equivalent directly.
//asm_mmq(FP, d, PC, -16);
STR(IP, FP, d+4);
asm_ld_imm(IP, ins->imm64_1());
STR(IP, FP, d);
asm_ld_imm(IP, ins->imm64_0());
}
//asm_output("<<< asm_quad");
}
void
Assembler::asm_nongp_copy(Register r, Register s)
{
if (ARM_VFP && IsFpReg(r) && IsFpReg(s)) {
// fp->fp
FCPYD(r, s);
} else {
// We can't move a double-precision FP register into a 32-bit GP
// register, so assert that no calling code is trying to do that.
NanoAssert(0);
}
}
Register
Assembler::asm_binop_rhs_reg(LInsp)
{
return UnknownReg;
}
/**
* copy 64 bits: (rd+dd) <- (rs+ds)
*/
void
Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
{
// The value is either a 64bit struct or maybe a float that isn't live in
// an FPU reg. Either way, don't put it in an FPU reg just to load & store
// it.
// This operation becomes a simple 64-bit memcpy.
// In order to make the operation optimal, we will require two GP
// registers. We can't allocate a register here because the caller may have
// called freeRsrcOf, and allocating a register here may cause something
// else to spill onto the stack which has just been conveniently freed by
// freeRsrcOf (resulting in stack corruption).
//
// Falling back to a single-register implementation of asm_mmq is better
// than adjusting the callers' behaviour (to allow us to allocate another
// register here) because spilling a register will end up being slower than
// just using the same register twice anyway.
//
// Thus, if there is a free register which we can borrow, we will emit the
// following code:
// LDR rr, [rs, #ds]
// LDR ip, [rs, #(ds+4)]
// STR rr, [rd, #dd]
// STR ip, [rd, #(dd+4)]
// (Where rr is the borrowed register.)
//
// If there is no free register, don't spill an existing allocation. Just
// do the following:
// LDR ip, [rs, #ds]
// STR ip, [rd, #dd]
// LDR ip, [rs, #(ds+4)]
// STR ip, [rd, #(dd+4)]
// Ensure that the PC is not used as either base register. The instruction
// generation macros call underrunProtect, and a side effect of this is
// that we may be pushed onto another page, so the PC is not a reliable
// base register.
NanoAssert(rs != PC);
NanoAssert(rd != PC);
// Find the list of free registers from the allocator's free list and the
// GpRegs mask. This excludes any floating-point registers that may be on
// the free list.
RegisterMask free = _allocator.free & AllowableFlagRegs;
if (free) {
// There is at least one register on the free list, so grab one for
// temporary use. There is no need to allocate it explicitly because
// we won't need it after this function returns.
// The CountLeadingZeroes can be used to quickly find a set bit in the
// free mask.
Register rr = (Register)(31-CountLeadingZeroes(free));
// Note: Not every register in GpRegs is usable here. However, these
// registers will never appear on the free list.
NanoAssert((free & rmask(PC)) == 0);
NanoAssert((free & rmask(LR)) == 0);
NanoAssert((free & rmask(SP)) == 0);
NanoAssert((free & rmask(IP)) == 0);
NanoAssert((free & rmask(FP)) == 0);
// Emit the actual instruction sequence.
STR(IP, rd, dd+4);
STR(rr, rd, dd);
LDR(IP, rs, ds+4);
LDR(rr, rs, ds);
} else {
// There are no free registers, so fall back to using IP twice.
STR(IP, rd, dd+4);
LDR(IP, rs, ds+4);
STR(IP, rd, dd);
LDR(IP, rs, ds);
}
}
// Increment the 32-bit profiling counter at pCtr, without
// changing any registers.
verbose_only(
void Assembler::asm_inc_m32(uint32_t* pCtr)
{
// We need to temporarily free up two registers to do this, so
// just push r0 and r1 on the stack. This assumes that the area
// at r13 - 8 .. r13 - 1 isn't being used for anything else at
// this point. This is guaranteed by the EABI, although I'm not sure
// about the situation with the legacy ABI.
//
// Plan: emit the following bit of code. It's not efficient, but
// this is for profiling debug builds only, and is self contained,
// except for above comment re stack use.
//
// E92D0003 push {r0,r1}
// E59F0000 ldr r0, [r15] ; pCtr
// EA000000 b .+8 ; jump over imm
// 12345678 .word 0x12345678 ; pCtr
// E5901000 ldr r1, [r0]
// E2811001 add r1, r1, #1
// E5801000 str r1, [r0]
// E8BD0003 pop {r0,r1}
// We need to keep the 4 words beginning at "ldr r0, [r15]"
// together. Simplest to underrunProtect the whole thing.
underrunProtect(8*4);
IMM32(0xE8BD0003); // pop {r0,r1}
IMM32(0xE5801000); // str r1, [r0]
IMM32(0xE2811001); // add r1, r1, #1
IMM32(0xE5901000); // ldr r1, [r0]
IMM32((uint32_t)pCtr); // .word pCtr
IMM32(0xEA000000); // b .+8
IMM32(0xE59F0000); // ldr r0, [r15]
IMM32(0xE92D0003); // push {r0,r1}
}
)
void
Assembler::nativePageReset()
{
_nSlot = 0;
_nExitSlot = 0;
}
void
Assembler::nativePageSetup()
{
NanoAssert(!_inExit);
if (!_nIns)
codeAlloc(codeStart, codeEnd, _nIns verbose_only(, codeBytes));
if (!_nExitIns)
codeAlloc(exitStart, exitEnd, _nExitIns verbose_only(, exitBytes));
// constpool starts at top of page and goes down,
// code starts at bottom of page and moves up
if (!_nSlot)
_nSlot = codeStart;
if (!_nExitSlot)
_nExitSlot = exitStart;
}
void
Assembler::underrunProtect(int bytes)
{
NanoAssertMsg(bytes<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
NanoAssert(_nSlot != 0 && int(_nIns)-int(_nSlot) <= 4096);
uintptr_t top = uintptr_t(_nSlot);
uintptr_t pc = uintptr_t(_nIns);
if (pc - bytes < top)
{
verbose_only(verbose_outputf(" %p:", _nIns);)
NIns* target = _nIns;
// This may be in a normal code chunk or an exit code chunk.
codeAlloc(codeStart, codeEnd, _nIns verbose_only(, codeBytes));
_nSlot = codeStart;
// _nSlot points to the first empty position in the new code block
// _nIns points just past the last empty position.
// Assume B_nochk won't ever try to write to _nSlot. See B_cond_chk macro.
B_nochk(target);
}
}
void
Assembler::JMP_far(NIns* addr)
{
// Even if a simple branch is all that is required, this function must emit
// two words so that the branch can be arbitrarily patched later on.
underrunProtect(8);
intptr_t offs = PC_OFFSET_FROM(addr,_nIns-2);
if (isS24(offs>>2)) {
// Emit a BKPT to ensure that we reserve enough space for a full 32-bit
// branch patch later on. The BKPT should never be executed.
BKPT_nochk();
asm_output("bkpt");
// B [PC+offs]
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((offs>>2) & 0xFFFFFF) );
asm_output("b %p", (void*)addr);
} else {
// Insert the target address as a constant in the instruction stream.
*(--_nIns) = (NIns)((addr));
// ldr pc, [pc, #-4] // load the address into pc, reading it from [pc-4] (e.g.,
// the next instruction)
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | (4));
asm_output("ldr pc, =%p", (void*)addr);
}
}
// Perform a branch with link, and ARM/Thumb exchange if necessary. The actual
// BLX instruction is only available from ARMv5 onwards, but as we don't
// support anything older than that this function will not attempt to output
// pre-ARMv5 sequences.
//
// Note: This function is not designed to be used with branches which will be
// patched later, though it will work if the patcher knows how to patch the
// generated instruction sequence.
void
Assembler::BranchWithLink(NIns* addr)
{
// Most branches emitted by TM are loaded through a register, so always
// reserve enough space for the LDR sequence. This should give us a slight
// net gain over reserving the exact amount required for shorter branches.
// This _must_ be called before PC_OFFSET_FROM as it can move _nIns!
underrunProtect(4+LD32_size);
// Calculate the offset from the instruction that is about to be
// written (at _nIns-1) to the target.
intptr_t offs = PC_OFFSET_FROM(addr,_nIns-1);
// ARMv5 and above can use BLX <imm> for branches within ±32MB of the
// PC and BLX Rm for long branches.
if (isS24(offs>>2)) {
// the value we need to stick in the instruction; masked,
// because it will be sign-extended back to 32 bits.
intptr_t offs2 = (offs>>2) & 0xffffff;
if (((intptr_t)addr & 1) == 0) {
// The target is ARM, so just emit a BL.
// BL target
*(--_nIns) = (NIns)( (COND_AL) | (0xB<<24) | (offs2) );
asm_output("bl %p", (void*)addr);
} else {
// The target is Thumb, so emit a BLX.
// We need to emit an ARMv5+ instruction, so assert that we have a
// suitable processor. Note that we don't support ARMv4(T), but
// this serves as a useful sanity check.
NanoAssert(ARM_ARCH >= 5);
// The (pre-shifted) value of the "H" bit in the BLX encoding.
uint32_t H = (offs & 0x2) << 23;
// BLX addr
*(--_nIns) = (NIns)( (0xF << 28) | (0x5<<25) | (H) | (offs2) );
asm_output("blx %p", (void*)addr);
}
} else {
// Load the target address into IP and branch to that. We've already
// done underrunProtect, so we can skip that here.
BLX(IP, false);
// LDR IP, =addr
asm_ld_imm(IP, (int32_t)addr, false);
}
}
// This is identical to BranchWithLink(NIns*) but emits a branch to an address
// held in a register rather than a literal address.
inline void
Assembler::BLX(Register addr, bool chk /* = true */)
{
// We need to emit an ARMv5+ instruction, so assert that we have a suitable
// processor. Note that we don't support ARMv4(T), but this serves as a
// useful sanity check.
NanoAssert(ARM_ARCH >= 5);
NanoAssert(IsGpReg(addr));
// There is a bug in the WinCE device emulator which stops "BLX LR" from
// working as expected. Assert that we never do that!
if (blx_lr_bug) { NanoAssert(addr != LR); }
if (chk) {
underrunProtect(4);
}
// BLX IP
*(--_nIns) = (NIns)( (COND_AL) | (0x12<<20) | (0xFFF<<8) | (0x3<<4) | (addr) );
asm_output("blx ip");
}
// Emit the code required to load a memory address into a register as follows:
// d = *(b+off)
// underrunProtect calls from this function can be disabled by setting chk to
// false. However, this function can use more than LD32_size bytes of space if
// the offset is out of the range of a LDR instruction; the maximum space this
// function requires for underrunProtect is 4+LD32_size.
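// For example (in execution order), asm_ldr_chk(R0, R1, 8, true) emits
// "LDR R0, [R1, #8]", while an out-of-range offset such as 0x10000 emits a
// load of the offset into IP followed by "LDR R0, [R1, +IP]".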
void
Assembler::asm_ldr_chk(Register d, Register b, int32_t off, bool chk)
{
if (ARM_VFP && IsFpReg(d)) {
FLDD_chk(d,b,off,chk);
return;
}
NanoAssert(IsGpReg(d));
NanoAssert(IsGpReg(b));
// We can't use underrunProtect if the base register is the PC because
// underrunProtect might move the PC if there isn't enough space on the
// current page.
NanoAssert((b != PC) || (!chk));
if (isU12(off)) {
// LDR d, b, #+off
if (chk) underrunProtect(4);
*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (b<<16) | (d<<12) | off );
} else if (isU12(-off)) {
// LDR d, b, #-off
if (chk) underrunProtect(4);
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (b<<16) | (d<<12) | -off );
} else {
// The offset is over 4096 (and outside the range of LDR), so we need
// to add a level of indirection to get the address into IP.
// Because of that, we can't do a PC-relative load unless it fits within
// the single-instruction forms above.
NanoAssert(b != PC);
NanoAssert(b != IP);
if (chk) underrunProtect(4+LD32_size);
*(--_nIns) = (NIns)( COND_AL | (0x79<<20) | (b<<16) | (d<<12) | IP );
asm_ld_imm(IP, off, false);
}
asm_output("ldr %s, [%s, #%d]",gpn(d),gpn(b),(off));
}
// Emit the code required to load an immediate value (imm) into general-purpose
// register d. Optimal (MOV-based) mechanisms are used if the immediate can be
// encoded using ARM's operand 2 encoding. Otherwise, a slot is used on the
// literal pool and LDR is used to load the value.
//
// chk can be explicitly set to false in order to disable underrunProtect calls
// from this function; this allows the caller to perform the check manually.
// This function guarantees not to use more than LD32_size bytes of space.
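// For example, 0x000000ff is loaded with "MOV d, #0xff", 0xffffff00 with
// "MVN d, #0xff", 0x12345678 on a Thumb-2-capable core (with d != PC) with
// "MOVW d, #0x5678; MOVT d, #0x1234", and anything else via an LDR from the
// literal pool.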
void
Assembler::asm_ld_imm(Register d, int32_t imm, bool chk /* = true */)
{
uint32_t op2imm;
NanoAssert(IsGpReg(d));
// Attempt to encode the immediate using the second operand of MOV or MVN.
// This is the simplest solution and generates the shortest and fastest
// code, but can only encode a limited set of values.
if (encOp2Imm(imm, &op2imm)) {
// Use MOV to encode the literal.
MOVis(d, op2imm, 0);
return;
}
if (encOp2Imm(~imm, &op2imm)) {
// Use MVN to encode the inverted literal.
MVNis(d, op2imm, 0);
return;
}
// Try to use simple MOV, MVN or MOV(W|T) instructions to load the
// immediate. If this isn't possible, load it from memory.
// - We cannot use MOV(W|T) on cores older than the introduction of
// Thumb-2 or if the target register is the PC.
if (ARM_THUMB2 && (d != PC)) {
// ARMv6T2 and above have MOVW and MOVT.
uint32_t high_h = (uint32_t)imm >> 16;
uint32_t low_h = imm & 0xffff;
if (high_h != 0) {
// Load the high half-word (if necessary).
MOVTi_chk(d, high_h, chk);
}
// Load the low half-word. This also zeroes the high half-word, and
// thus must execute _before_ MOVT, and is necessary even if low_h is 0
// because MOVT will not change the existing low half-word.
MOVWi_chk(d, low_h, chk);
return;
}
// We couldn't encode the literal in the instruction stream, so load it
// from memory.
// Because the literal pool is on the same page as the generated code, it
// will almost always be within the ±4096 range of a LDR. However, this may
// not be the case if _nSlot is at the start of the page and _nIns is at
// the end because the PC is 8 bytes ahead of _nIns. This is unlikely to
// happen, but if it does occur we can simply waste a word or two of
// literal space.
// We must do the underrunProtect before PC_OFFSET_FROM as underrunProtect
// can move the PC if there isn't enough space on the current page!
if (chk) {
underrunProtect(LD32_size);
}
int offset = PC_OFFSET_FROM(_nSlot, _nIns-1);
// If the offset is out of range, waste literal space until it is in range.
while (offset <= -4096) {
++_nSlot;
offset += sizeof(_nSlot);
}
NanoAssert(isS12(offset) && (offset <= -8));
// Write the literal.
*(_nSlot++) = imm;
asm_output("## imm= 0x%x", imm);
// Load the literal.
LDR_nochk(d,PC,offset);
NanoAssert(uintptr_t(_nIns) + 8 + offset == uintptr_t(_nSlot-1));
NanoAssert(*((int32_t*)_nSlot-1) == imm);
}
// Branch to target address _t with condition _c, doing underrun
// checks (_chk == 1) or skipping them (_chk == 0).
//
// Set the target address (_t) to 0 if the target is not yet known and the
// branch will be patched up later.
//
// If the jump is to a known address (with _t != 0) and it fits in a relative
// jump (±32MB), emit that.
// If the jump is unconditional, emit the dest address inline in
// the instruction stream and load it into pc.
// If the jump has a condition, but no one has mucked with _nIns and our _nSlot
// pointer is valid, stick the constant in the slot and emit a conditional
// load into pc.
// Otherwise, emit the conditional load into pc from a nearby constant,
// and emit a jump to skip over the constant in case the condition fails.
//
// NB: B_nochk depends on this not calling samepage() when _c == AL
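// For reference: the offset field of B(cc) is a signed 24-bit count of
// words, so a short branch reaches roughly PC±(2^23 * 4) bytes = ±32MB;
// this is what the isS24(offs>>2) checks below verify.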
void
Assembler::B_cond_chk(ConditionCode _c, NIns* _t, bool _chk)
{
int32_t offs = PC_OFFSET_FROM(_t,_nIns-1);
//nj_dprintf("B_cond_chk target: 0x%08x offset: %d @0x%08x\n", _t, offs, _nIns-1);
// optimistically check if this will fit in 24 bits
if (_chk && isS24(offs>>2) && (_t != 0)) {
underrunProtect(4);
// recalculate the offset, because underrunProtect may have
// moved _nIns to a new page
offs = PC_OFFSET_FROM(_t,_nIns-1);
}
// Emit one of the following patterns:
//
// --- Short branch. This can never be emitted if the branch target is not
// known.
// B(cc) ±32MB
//
// --- Long unconditional branch.
// LDR PC, #lit
// lit: #target
//
// --- Long conditional branch. Note that conditional branches will never
// be patched, so the nPatchBranch function doesn't need to know where
// the literal pool is located.
// LDRcc PC, #lit
// ; #lit is in the literal pool at _nSlot
//
// --- Long conditional branch (if the literal slot at _nSlot is out of LDR range of the instruction).
// LDRcc PC, #lit
// B skip ; Jump over the literal data.
// lit: #target
// skip: [...]
if (isS24(offs>>2) && (_t != 0)) {
// The underrunProtect for this was done above (if required by _chk).
*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) );
asm_output("b%s %p", _c == AL ? "" : condNames[_c], (void*)(_t));
} else if (_c == AL) {
if(_chk) underrunProtect(8);
*(--_nIns) = (NIns)(_t);
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 );
asm_output("b%s %p", _c == AL ? "" : condNames[_c], (void*)(_t));
} else if (PC_OFFSET_FROM(_nSlot, _nIns-1) > -0x1000) {
if(_chk) underrunProtect(8);
*(_nSlot++) = (NIns)(_t);
offs = PC_OFFSET_FROM(_nSlot-1,_nIns-1);
NanoAssert(offs < 0);
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFF) );
asm_output("ldr%s %s, [%s, #-%d]", condNames[_c], gpn(PC), gpn(PC), -offs);
NanoAssert(uintptr_t(_nIns)+8+offs == uintptr_t(_nSlot-1));
} else {
if(_chk) underrunProtect(12);
// Emit a pointer to the target as a literal in the instruction stream.
*(--_nIns) = (NIns)(_t);
// Emit a branch to skip over the literal. The PC reads as the address of
// the executing instruction plus 8, which is already the address just past
// the one-word literal, so the encoded branch offset is 8-8=0 bytes.
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | 0x0 );
// Emit the conditional branch.
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 );
asm_output("b%s %p", _c == AL ? "" : condNames[_c], (void*)(_t));
}
}
/*
* VFP
*/
void
Assembler::asm_i2f(LInsp ins)
{
Register rr = prepResultReg(ins, FpRegs);
Register srcr = findRegFor(ins->oprnd1(), GpRegs);
// todo: support int value in memory, as per x86
NanoAssert(isKnownReg(srcr));
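// Note that code is emitted backwards, so at run time the FMSR executes
// first (moving the integer into the single-precision scratch register)
// and the FSITOD executes second (converting it to a double in rr).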
FSITOD(rr, FpSingleScratch);
FMSR(FpSingleScratch, srcr);
}
void
Assembler::asm_u2f(LInsp ins)
{
Register rr = prepResultReg(ins, FpRegs);
Register sr = findRegFor(ins->oprnd1(), GpRegs);
// todo: support int value in memory, as per x86
NanoAssert(isKnownReg(sr));
FUITOD(rr, FpSingleScratch);
FMSR(FpSingleScratch, sr);
}
void
Assembler::asm_fneg(LInsp ins)
{
LInsp lhs = ins->oprnd1();
Register rr = prepResultReg(ins, FpRegs);
Register sr = ( lhs->isUnusedOrHasUnknownReg()
? findRegFor(lhs, FpRegs)
: lhs->getReg() );
FNEGD(rr, sr);
}
void
Assembler::asm_fop(LInsp ins)
{
LInsp lhs = ins->oprnd1();
LInsp rhs = ins->oprnd2();
LOpcode op = ins->opcode();
NanoAssert(op >= LIR_fadd && op <= LIR_fdiv);
// rr = ra OP rb
Register rr = prepResultReg(ins, FpRegs);
Register ra = findRegFor(lhs, FpRegs);
Register rb = (rhs == lhs) ? ra : findRegFor(rhs, FpRegs & ~rmask(ra));
// XXX special-case 1.0 and 0.0
switch (op)
{
case LIR_fadd: FADDD(rr,ra,rb); break;
case LIR_fsub: FSUBD(rr,ra,rb); break;
case LIR_fmul: FMULD(rr,ra,rb); break;
case LIR_fdiv: FDIVD(rr,ra,rb); break;
default: NanoAssert(0); break;
}
}
void
Assembler::asm_fcmp(LInsp ins)
{
LInsp lhs = ins->oprnd1();
LInsp rhs = ins->oprnd2();
LOpcode op = ins->opcode();
NanoAssert(op >= LIR_feq && op <= LIR_fge);
Register ra, rb;
findRegFor2(FpRegs, lhs, ra, rhs, rb);
int e_bit = (op != LIR_feq);
// do the comparison and get results loaded in ARM status register
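// (Emission is backwards: FCMPD executes first, then FMSTAT copies the
// VFP status flags into the ARM status register so that ordinary
// conditional instructions can test the result.)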
FMSTAT();
FCMPD(ra, rb, e_bit);
}
/* Call this with targ set to 0 if the target is not yet known and the branch
* will be patched up later.
*/
NIns*
Assembler::asm_branch(bool branchOnFalse, LInsp cond, NIns* targ)
{
LOpcode condop = cond->opcode();
NanoAssert(cond->isCond());
NanoAssert(ARM_VFP || ((condop < LIR_feq) || (condop > LIR_fge)));
// The old "never" condition code has special meaning on newer ARM cores,
// so use "always" as a sensible default code.
ConditionCode cc = AL;
// Detect whether or not this is a floating-point comparison.
bool fp_cond;
// Because MUL can't set the V flag, we use SMULL and CMP to set the Z flag
// to detect overflow on multiply. Thus, if cond points to a LIR_ov which
// in turn points to a LIR_mul, we must be conditional on !Z, not V.
if ((condop == LIR_ov) && (cond->oprnd1()->isop(LIR_mul))) {
condop = LIR_eq;
branchOnFalse = !branchOnFalse;
}
// Select the appropriate ARM condition code to match the LIR instruction.
switch (condop)
{
// Floating-point conditions. Note that the VFP LT/LE conditions
// require use of the unsigned condition codes, even though
// floating-point comparisons are always signed.
case LIR_feq: cc = EQ; fp_cond = true; break;
case LIR_flt: cc = LO; fp_cond = true; break;
case LIR_fle: cc = LS; fp_cond = true; break;
case LIR_fge: cc = GE; fp_cond = true; break;
case LIR_fgt: cc = GT; fp_cond = true; break;
// Standard signed and unsigned integer comparisons.
case LIR_eq: cc = EQ; fp_cond = false; break;
case LIR_ov: cc = VS; fp_cond = false; break;
case LIR_lt: cc = LT; fp_cond = false; break;
case LIR_le: cc = LE; fp_cond = false; break;
case LIR_gt: cc = GT; fp_cond = false; break;
case LIR_ge: cc = GE; fp_cond = false; break;
case LIR_ult: cc = LO; fp_cond = false; break;
case LIR_ule: cc = LS; fp_cond = false; break;
case LIR_ugt: cc = HI; fp_cond = false; break;
case LIR_uge: cc = HS; fp_cond = false; break;
// Default case for invalid or unexpected LIR instructions.
default: cc = AL; fp_cond = false; break;
}
// Invert the condition if required.
if (branchOnFalse)
cc = OppositeCond(cc);
// Ensure that we got a sensible condition code.
NanoAssert((cc != AL) && (cc != NV));
// Ensure that we don't hit floating-point LIR codes if VFP is disabled.
NanoAssert(ARM_VFP || !fp_cond);
// Emit a suitable branch instruction.
B_cond(cc, targ);
// Store the address of the branch instruction so that we can return it.
// asm_[f]cmp will move _nIns so we must do this now.
NIns *at = _nIns;
if (ARM_VFP && fp_cond)
asm_fcmp(cond);
else
asm_cmp(cond);
return at;
}
void
Assembler::asm_cmp(LIns *cond)
{
LOpcode condop = cond->opcode();
// LIR_ov recycles the flags set by arithmetic ops
if (condop == LIR_ov)
return;
LInsp lhs = cond->oprnd1();
LInsp rhs = cond->oprnd2();
// Not supported yet.
NanoAssert(!lhs->isQuad() && !rhs->isQuad());
// ready to issue the compare
if (rhs->isconst()) {
int c = rhs->imm32();
if (c == 0 && cond->isop(LIR_eq)) {
Register r = findRegFor(lhs, GpRegs);
TST(r,r);
// No 64-bit immediates, so 32-bit constants fall back to asm_cmpi below.
} else if (!rhs->isQuad()) {
Register r = getBaseReg(condop, lhs, c, GpRegs);
asm_cmpi(r, c);
} else {
NanoAssert(0);
}
} else {
Register ra, rb;
findRegFor2(GpRegs, lhs, ra, rhs, rb);
CMP(ra, rb);
}
}
void
Assembler::asm_cmpi(Register r, int32_t imm)
{
if (imm < 0) {
if (imm > -256) {
ALUi(AL, cmn, 1, 0, r, -imm);
} else {
underrunProtect(4 + LD32_size);
CMP(r, IP);
asm_ld_imm(IP, imm);
}
} else {
if (imm < 256) {
ALUi(AL, cmp, 1, 0, r, imm);
} else {
underrunProtect(4 + LD32_size);
CMP(r, IP);
asm_ld_imm(IP, imm);
}
}
}
void
Assembler::asm_fcond(LInsp ins)
{
// only want certain regs
Register r = prepResultReg(ins, AllowableFlagRegs);
switch (ins->opcode()) {
case LIR_feq: SETEQ(r); break;
case LIR_flt: SETLO(r); break; // } note: VFP LT/LE operations require use of
case LIR_fle: SETLS(r); break; // } unsigned LO/LS condition codes!
case LIR_fge: SETGE(r); break;
case LIR_fgt: SETGT(r); break;
default: NanoAssert(0); break;
}
asm_fcmp(ins);
}
void
Assembler::asm_cond(LInsp ins)
{
Register r = prepResultReg(ins, AllowableFlagRegs);
LOpcode op = ins->opcode();
switch(op)
{
case LIR_eq: SETEQ(r); break;
case LIR_lt: SETLT(r); break;
case LIR_le: SETLE(r); break;
case LIR_gt: SETGT(r); break;
case LIR_ge: SETGE(r); break;
case LIR_ult: SETLO(r); break;
case LIR_ule: SETLS(r); break;
case LIR_ugt: SETHI(r); break;
case LIR_uge: SETHS(r); break;
case LIR_ov:
// Because MUL can't set the V flag, we use SMULL and CMP to set
// the Z flag to detect overflow on multiply. Thus, if ins points
// to a LIR_ov which in turn points to a LIR_mul, we must be
// conditional on !Z, not V.
if (!ins->oprnd1()->isop(LIR_mul)) {
SETVS(r);
} else {
SETNE(r);
}
break;
default: NanoAssert(0); break;
}
asm_cmp(ins);
}
void
Assembler::asm_arith(LInsp ins)
{
LOpcode op = ins->opcode();
LInsp lhs = ins->oprnd1();
LInsp rhs = ins->oprnd2();
RegisterMask allow = GpRegs;
// We always need the result register and the first operand register.
Register rr = prepResultReg(ins, allow);
// If this is the last use of lhs in reg, we can re-use the result reg.
// Else, lhs already has a register assigned.
Register ra = ( lhs->isUnusedOrHasUnknownReg()
? findSpecificRegFor(lhs, rr)
: lhs->getReg() );
// Don't re-use the registers we've already allocated.
NanoAssert(isKnownReg(rr));
NanoAssert(isKnownReg(ra));
allow &= ~rmask(rr);
allow &= ~rmask(ra);
// If the rhs is constant, we can use the instruction-specific code to
// determine if the value can be encoded in an ARM instruction. If the
// value cannot be encoded, it will be loaded into a register.
//
// Note that the MUL instruction can never take an immediate argument so
// even if the argument is constant, we must allocate a register for it.
//
// Note: It is possible to use a combination of the barrel shifter and the
// basic arithmetic instructions to generate constant multiplications.
// However, LIR_mul is never invoked with a constant during
// trace-tests.js so it is very unlikely to be worthwhile implementing it.
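// For illustration only (not implemented here), such sequences would
// look like:
//   ADD rr, ra, ra, LSL #2 ; rr = ra * 5
//   RSB rr, ra, ra, LSL #3 ; rr = ra * 7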
if (rhs->isconst() && op != LIR_mul)
{
if ((op == LIR_add || op == LIR_iaddp) && lhs->isop(LIR_ialloc)) {
// Add alloc+const. The result should be the address of the
// allocated space plus a constant.
Register rs = prepResultReg(ins, allow);
int d = findMemFor(lhs) + rhs->imm32();
NanoAssert(isKnownReg(rs));
asm_add_imm(rs, FP, d);
// The add(alloc, const) case has been handled completely, so don't
// fall through and emit the generic immediate add as well.
return;
}
int32_t imm32 = rhs->imm32();
switch (op)
{
case LIR_iaddp: asm_add_imm(rr, ra, imm32); break;
case LIR_add: asm_add_imm(rr, ra, imm32, 1); break;
case LIR_sub: asm_sub_imm(rr, ra, imm32, 1); break;
case LIR_and: asm_and_imm(rr, ra, imm32); break;
case LIR_or: asm_orr_imm(rr, ra, imm32); break;
case LIR_xor: asm_eor_imm(rr, ra, imm32); break;
case LIR_lsh: LSLi(rr, ra, imm32); break;
case LIR_rsh: ASRi(rr, ra, imm32); break;
case LIR_ush: LSRi(rr, ra, imm32); break;
default:
NanoAssertMsg(0, "Unsupported");
break;
}
// We've already emitted an instruction, so return now.
return;
}
// The rhs is either a register or cannot be encoded as a constant.
Register rb;
if (lhs == rhs) {
rb = ra;
} else {
rb = asm_binop_rhs_reg(ins);
if (!isKnownReg(rb))
rb = findRegFor(rhs, allow);
allow &= ~rmask(rb);
}
NanoAssert(isKnownReg(rb));
switch (op)
{
case LIR_iaddp: ADDs(rr, ra, rb, 0); break;
case LIR_add: ADDs(rr, ra, rb, 1); break;
case LIR_sub: SUBs(rr, ra, rb, 1); break;
case LIR_and: ANDs(rr, ra, rb, 0); break;
case LIR_or: ORRs(rr, ra, rb, 0); break;
case LIR_xor: EORs(rr, ra, rb, 0); break;
case LIR_mul:
// ARMv5 and earlier cores cannot do a MUL where the first operand
// is also the result, so we need a special case to handle that.
//
// We try to use rb as the first operand by default because it is
// common for (rr == ra) and is thus likely to be the most
// efficient method.
if ((ARM_ARCH > 5) || (rr != rb)) {
// IP is used to temporarily store the high word of the result from
// SMULL, so we make use of this to perform an overflow check, as
// ARM's MUL instruction can't set the overflow flag by itself.
// We can check for overflow using the following:
// SMULL rr, ip, ra, rb
// CMP ip, rr, ASR #31
// An explanation can be found in bug 521161. This sets Z if we did
// _not_ overflow, and clears it if we did.
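// Worked example: 0x10000 * 0x10000 gives rr (lo) = 0 and IP (hi) = 1;
// rr ASR #31 is 0, so CMP 1, 0 clears Z and the overflow is detected.
// For 3 * 4, rr = 12, IP = 0 and rr ASR #31 is 0, so Z is set and no
// overflow is reported.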
ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);
SMULL(rr, IP, rb, ra);
} else {
// ARM_ARCH is ARMv5 (or below) and rr == rb, so we must
// find a different way to encode the instruction.
// If possible, swap the arguments to avoid the restriction.
if (rr != ra) {
// We know that rr == rb, so this will be something like
// rX = rY * rX.
// Other than swapping ra and rb, this works in the same way
// as the ARMv6+ case, above.
ALUr_shi(AL, cmp, 1, IP, IP, rr, ASR_imm, 31);
SMULL(rr, IP, ra, rb);
} else {
// We're trying to do rX = rX * rX, but we also need to
// check for overflow so we would need two extra registers
// on ARMv5 and below. We achieve this by observing the
// following:
// - abs(rX)*abs(rX) = rX*rX, so we force the input to be
// positive to simplify the detection logic.
// - Any argument greater than 0xffff will _always_
// overflow, and we can easily check that the top 16
// bits are zero.
// - Any argument lower than (or equal to) 0xffff that
// also overflows is guaranteed to set output bit 31.
//
// Thus, we know we have _not_ overflowed if:
// abs(rX)&0xffff0000 == 0 AND result[31] == 0
//
// The following instruction sequence will be emitted:
// MOVS IP, rX // Put abs(rX) into IP.
// RSBMI IP, IP, #0 // ...
// MUL rX, IP, IP // Do the actual multiplication.
// MOVS IP, IP, LSR #16 // Check that abs(arg)<=0xffff
// CMPEQ IP, rX, ASR #31 // Check that result[31] == 0
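// Worked example: for rX = 0x8000 the product is 0x40000000; the
// MOVS ..., LSR #16 yields 0 (Z set) and CMPEQ then compares 0 with
// 0x40000000 ASR #31 = 0, so Z stays set and no overflow is reported.
// For rX = 0x10000 the LSR #16 result is 1, Z is cleared, the CMPEQ is
// skipped, and an overflow is (correctly) reported.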
NanoAssert(rr != IP);
ALUr_shi(EQ, cmp, 1, IP, IP, rr, ASR_imm, 31);
ALUr_shi(AL, mov, 1, IP, IP, IP, LSR_imm, 16);
MUL(rr, IP, IP);
ALUi(MI, rsb, 0, IP, IP, 0);
ALUr(AL, mov, 1, IP, ra, ra);
}
}
break;
// The shift operations need a mask to match the JavaScript
// specification because the ARM architecture allows a greater shift
// range than JavaScript.
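// For example, JavaScript requires (1 << 33) == 2, but ARM's
// register-specified shifts use the bottom byte of rb, so a shift
// amount of 33 would produce 0 without the AND with 0x1f emitted below.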
case LIR_lsh:
LSL(rr, ra, IP);
ANDi(IP, rb, 0x1f);
break;
case LIR_rsh:
ASR(rr, ra, IP);
ANDi(IP, rb, 0x1f);
break;
case LIR_ush:
LSR(rr, ra, IP);
ANDi(IP, rb, 0x1f);
break;
default:
NanoAssertMsg(0, "Unsupported");
break;
}
}
void
Assembler::asm_neg_not(LInsp ins)
{
LOpcode op = ins->opcode();
Register rr = prepResultReg(ins, GpRegs);
LIns* lhs = ins->oprnd1();
// If this is the last use of lhs in reg, we can re-use result reg.
// Else, lhs already has a register assigned.
Register ra = ( lhs->isUnusedOrHasUnknownReg()
? findSpecificRegFor(lhs, rr)
: lhs->getReg() );
NanoAssert(isKnownReg(ra));
if (op == LIR_not)
MVN(rr, ra);
else
RSBS(rr, ra);
}
void
Assembler::asm_load32(LInsp ins)
{
LOpcode op = ins->opcode();
LIns* base = ins->oprnd1();
int d = ins->disp();
Register rr = prepResultReg(ins, GpRegs);
Register ra = getBaseReg(op, base, d, GpRegs);
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
LDRB(rr, ra, d);
return;
case LIR_ldzs:
case LIR_ldcs:
// these are expected to be 2 or 4-byte aligned
LDRH(rr, ra, d);
return;
case LIR_ld:
case LIR_ldc:
// these are expected to be 4-byte aligned
LDR(rr, ra, d);
return;
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
}
void
Assembler::asm_cmov(LInsp ins)
{
NanoAssert(ins->opcode() == LIR_cmov);
LIns* condval = ins->oprnd1();
LIns* iftrue = ins->oprnd2();
LIns* iffalse = ins->oprnd3();
NanoAssert(condval->isCmp());
NanoAssert(!iftrue->isQuad() && !iffalse->isQuad());
const Register rr = prepResultReg(ins, GpRegs);
// this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
// (This is true on Intel, is it true on all architectures?)
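// In executed order the logic is: do the compare, make sure iftrue's
// value is in rr, then let the MOV<inverted-cc> below replace it with
// iffalsereg when the condition is false. (findSpecificRegFor may
// satisfy the middle step without emitting an actual move.)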
const Register iffalsereg = findRegFor(iffalse, GpRegs & ~rmask(rr));
switch (condval->opcode()) {
// note that these are all opposites...
case LIR_eq: MOVNE(rr, iffalsereg); break;
case LIR_lt: MOVGE(rr, iffalsereg); break;
case LIR_le: MOVGT(rr, iffalsereg); break;
case LIR_gt: MOVLE(rr, iffalsereg); break;
case LIR_ge: MOVLT(rr, iffalsereg); break;
case LIR_ult: MOVHS(rr, iffalsereg); break;
case LIR_ule: MOVHI(rr, iffalsereg); break;
case LIR_ugt: MOVLS(rr, iffalsereg); break;
case LIR_uge: MOVLO(rr, iffalsereg); break;
case LIR_ov:
// Because MUL can't set the V flag, we use SMULL and CMP to set
// the Z flag to detect overflow on multiply. Thus, if ins points
// to a LIR_ov which in turn points to a LIR_mul, we must be
// conditional on !Z, not V.
if (!condval->oprnd1()->isop(LIR_mul)) {
MOVVC(rr, iffalsereg);
} else {
MOVEQ(rr, iffalsereg);
}
break;
default: debug_only( NanoAssert(0) ); break;
}
/*const Register iftruereg =*/ findSpecificRegFor(iftrue, rr);
asm_cmp(condval);
}
void
Assembler::asm_qhi(LInsp ins)
{
Register rr = prepResultReg(ins, GpRegs);
LIns *q = ins->oprnd1();
int d = findMemFor(q);
LDR(rr, FP, d+4);
}
void
Assembler::asm_qlo(LInsp ins)
{
Register rr = prepResultReg(ins, GpRegs);
LIns *q = ins->oprnd1();
int d = findMemFor(q);
LDR(rr, FP, d);
}
void
Assembler::asm_param(LInsp ins)
{
uint32_t a = ins->paramArg();
uint32_t kind = ins->paramKind();
if (kind == 0) {
// ordinary param
AbiKind abi = _thisfrag->lirbuf->abi;
uint32_t abi_regcount = abi == ABI_CDECL ? 4 : abi == ABI_FASTCALL ? 2 : abi == ABI_THISCALL ? 1 : 0;
if (a < abi_regcount) {
// incoming arg in register
prepResultReg(ins, rmask(argRegs[a]));
} else {
// incoming arg is on stack, and FP points nearby (see genPrologue)
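// For example, with ABI_CDECL (abi_regcount == 4), param 4 -- the fifth
// argument -- is loaded from [FP, #8] and param 5 from [FP, #12].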
Register r = prepResultReg(ins, GpRegs);
int d = (a - abi_regcount) * sizeof(intptr_t) + 8;
LDR(r, FP, d);
}
} else {
// saved param
prepResultReg(ins, rmask(savedRegs[a]));
}
}
void
Assembler::asm_int(LInsp ins)
{
Register rr = prepResultReg(ins, GpRegs);
asm_ld_imm(rr, ins->imm32());
}
void
Assembler::asm_ret(LIns *ins)
{
genEpilogue();
// NB: our contract with genEpilogue is actually that the return value
// we are intending for R0 is currently IP, not R0. This has to do with
// the strange dual-nature of the patchable jump in a side-exit. See
// nPatchBranch.
MOV(IP, R0);
// Pop the stack frame.
MOV(SP,FP);
assignSavedRegs();
LIns *value = ins->oprnd1();
if (ins->isop(LIR_ret)) {
findSpecificRegFor(value, R0);
}
else {
NanoAssert(ins->isop(LIR_fret));
if (ARM_VFP) {
Register reg = findRegFor(value, FpRegs);
FMRRD(R0, R1, reg);
} else {
NanoAssert(value->isop(LIR_qjoin));
findSpecificRegFor(value->oprnd1(), R0); // lo
findSpecificRegFor(value->oprnd2(), R1); // hi
}
}
}
void
Assembler::asm_promote(LIns *ins)
{
/* The LIR opcodes that result in a call to asm_promote are only generated
* if NANOJIT_64BIT is #define'd, which it never is for ARM.
*/
(void)ins;
NanoAssert(0);
}
void
Assembler::asm_jtbl(LIns* ins, NIns** table)
{
Register indexreg = findRegFor(ins->oprnd1(), GpRegs);
Register tmp = registerAllocTmp(GpRegs & ~rmask(indexreg));
LDR_scaled(PC, tmp, indexreg, 2); // LDR PC, [tmp + index*4]
asm_ld_imm(tmp, (int32_t)table); // tmp = #table
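// In executed order: tmp is first loaded with the table address, then the
// scaled LDR loads PC from [tmp + indexreg*4], jumping to table[index].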
}
void Assembler::swapCodeChunks() {
SWAP(NIns*, _nIns, _nExitIns);
SWAP(NIns*, _nSlot, _nExitSlot); // this one is ARM-specific
SWAP(NIns*, codeStart, exitStart);
SWAP(NIns*, codeEnd, exitEnd);
verbose_only( SWAP(size_t, codeBytes, exitBytes); )
}
}
#endif /* FEATURE_NANOJIT && NANOJIT_ARM */