This source file includes the following definitions.
- compile
- visit
- visit
- visit
- visit
- uses_hvx
- acquire_hvx_context
- is_dense_ramp
- visit
- sloppy_unpredicate_loads
- compile_func
- init_module
- define_hvx_intrinsic
- define_hvx_intrinsic
- create_bitcast
- call_intrin_cast
- call_intrin_cast
- interleave_vectors
- is_strided_ramp
- is_concat_or_slice
- shuffle_vectors
- vlut
- vlut
- type_suffix
- type_suffix
- type_suffix
- type_suffix
- call_intrin
- call_intrin
- mcpu
- mattrs
- use_soft_float_abi
- native_vector_bits
- visit
- visit
- maybe_scalar
- visit
- mulhi_shr
- sorted_avg
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
#include <iostream>
#include <sstream>
#include <mutex>
#include "LLVM_Headers.h"
#include "CodeGen_Hexagon.h"
#include "CodeGen_Internal.h"
#include "IROperator.h"
#include "IRMatch.h"
#include "IREquality.h"
#include "IRMutator.h"
#include "Target.h"
#include "Debug.h"
#include "Util.h"
#include "Simplify.h"
#include "IRPrinter.h"
#include "EliminateBoolVectors.h"
#include "HexagonOptimize.h"
#include "AlignLoads.h"
#include "CSE.h"
#include "LoopCarry.h"
namespace Halide {
namespace Internal {
using std::vector;
using std::string;
using namespace llvm;
#error "Hexagon target requires LLVM version 3.9 or later."
#define IPICK(is_128B, i64) (is_128B ? i64##_128B : i64)
#define IPICK(is_128B, i64) (is_128B ? Intrinsic::not_intrinsic : Intrinsic::not_intrinsic)
// Constructor: forwards the target to CodeGen_Posix and emits diagnostics about
// the build configuration through user_error / user_assert.
// NOTE(review): the conditions that guard the first two diagnostics are not
// visible here (the "<<" continuation below has no left-hand statement) --
// confirm against the upstream file before relying on this block.
CodeGen_Hexagon::CodeGen_Hexagon(Target t) : CodeGen_Posix(t) {
user_error << "hexagon not enabled for this build of Halide.\n";
<< "llvm 5.0 or later is required for Hexagon v62.\n";
// Checks that llvm_Hexagon_enabled is set before any Hexagon code generation.
user_assert(llvm_Hexagon_enabled) << "llvm build not configured with Hexagon target enabled.\n";
// Compiles a Halide Module by delegating to CodeGen_Posix::compile and returns
// the resulting llvm::Module. LLVM command-line option setup is wrapped in
// std::call_once so it is performed at most once.
// NOTE(review): this block looks truncated -- the contents of `options`, the
// end of the lambda, and the left-hand side of the HVX_128/HVX_64 check are
// not visible here.
std::unique_ptr<llvm::Module> CodeGen_Hexagon::compile(const Module &module) {
auto llvm_module = CodeGen_Posix::compile(module);
// Flag consumed by std::call_once below.
static std::once_flag set_options_once;
std::call_once(set_options_once, []() {
// Reads additional LLVM options from the HALIDE_LLVM_ARGS environment variable.
cl::ParseEnvironmentOptions("halide-hvx-be", "HALIDE_LLVM_ARGS",
"Halide HVX internal compiler\n");
std::vector<const char *> options = {
// Reports a user error when both HVX vector-length features are requested.
if ({Halide::Target::HVX_128, Halide::Target::HVX_64})) {
user_error << "Both HVX_64 and HVX_128 set at same time\n";
return llvm_module;
namespace {
class UsesHvx : public IRVisitor {
using IRVisitor::visit;
void visit(const Variable *op) {
uses_hvx = uses_hvx || op->type.is_vector();
void visit(const Ramp *op) {
uses_hvx = uses_hvx || op->type.is_vector();
void visit(const Broadcast *op) {
uses_hvx = uses_hvx || op->lanes > 1;
void visit(const Call *op) {
uses_hvx = uses_hvx || op->type.is_vector();
bool uses_hvx = false;
// Returns true if the UsesHvx visitor finds a vector-valued node in `s`.
bool uses_hvx(Stmt s) {
    UsesHvx uses;
    // Without this traversal the flag would always keep its initial value.
    s.accept(&uses);
    return uses.uses_hvx;
}
// Wraps `stmt` so that it first calls halide_qurt_hvx_lock (asserting that the
// call returned 0) and registers halide_qurt_hvx_unlock_as_destructor via the
// register_destructor intrinsic.
//
// stmt:   the statement to wrap.
// target: selects the mode argument passed to the lock call (128 when
//         Target::HVX_128 is set, otherwise 64).
// Returns the wrapped statement.
Stmt acquire_hvx_context(Stmt stmt, const Target &target) {
    // Mode argument for the lock call.
    Expr hvx_mode = target.has_feature(Target::HVX_128) ? 128 : 64;
    Expr hvx_lock = Call::make(Int(32), "halide_qurt_hvx_lock", {hvx_mode}, Call::Extern);

    // Bind the lock result to a uniquely named variable and assert it is zero;
    // the result itself is used as the assertion's error value.
    string hvx_lock_result_name = unique_name("hvx_lock_result");
    Expr hvx_lock_result_var = Variable::make(Int(32), hvx_lock_result_name);
    Stmt check_hvx_lock = LetStmt::make(hvx_lock_result_name, hvx_lock,
                                        AssertStmt::make(EQ::make(hvx_lock_result_var, 0), hvx_lock_result_var));

    // register_destructor is given a placeholder non-null handle as its object.
    Expr dummy_obj = reinterpret(Handle(), cast<uint64_t>(1));
    Expr hvx_unlock = Call::make(Int(32), Call::register_destructor,
                                 {Expr("halide_qurt_hvx_unlock_as_destructor"), dummy_obj}, Call::Intrinsic);

    // Resulting order: lock + check, destructor registration, original body.
    stmt = Block::make(Evaluate::make(hvx_unlock), stmt);
    stmt = Block::make(check_hvx_lock, stmt);
    return stmt;
}
// Returns true if `x` is a Ramp node whose stride is the constant one.
bool is_dense_ramp(Expr x) {
    const Ramp *r = x.as<Ramp>();
    if (!r) return false;

    return is_one(r->stride);
}
class SloppyUnpredicateLoads : public IRMutator {
void visit(const Load *op) {
if (is_one(op->predicate) || op-><Broadcast>() || !is_dense_ramp(op->index)) {
Expr predicate = mutate(op->predicate);
Expr index = mutate(op->index);
Expr condition = Shuffle::make({predicate}, {0});
for (int i = 1; i < op->type.lanes(); i++) {
condition = condition || Shuffle::make({predicate}, {i});
predicate = Broadcast::make(condition, predicate.type().lanes());
expr = Load::make(op->type, op->name, index, op->image, op->param, predicate);
using IRMutator::visit;
// Applies the SloppyUnpredicateLoads mutator to `s` and returns the result.
Stmt sloppy_unpredicate_loads(Stmt s) {
    return SloppyUnpredicateLoads().mutate(s);
}
void CodeGen_Hexagon::compile_func(const LoweredFunc &f,
const string &simple_name, const string &extern_name) {
CodeGen_Posix::begin_func(f.linkage, simple_name, extern_name, f.args);
Stmt body = f.body;
debug(1) << "Unpredicating loads and stores...\n";
body = sloppy_unpredicate_loads(body);
body = unpredicate_loads_stores(body);
debug(2) << "Lowering after unpredicating loads/stores:\n" << body << "\n\n";
debug(1) << "Optimizing shuffles...\n";
const int lut_alignment = 64;
body = optimize_hexagon_shuffles(body, lut_alignment);
debug(2) << "Lowering after optimizing shuffles:\n" << body << "\n\n";
debug(1) << "Aligning loads for HVX....\n";
body = align_loads(body, target.natural_vector_size(Int(8)));
body = common_subexpression_elimination(body);
debug(2) << "Lowering after aligning loads:\n" << body << "\n\n";
debug(1) << "Carrying values across loop iterations...\n";
body = loop_carry(body, 16);
body = simplify(body);
debug(2) << "Lowering after forwarding stores:\n" << body << "\n\n";
debug(1) << "Eliminating boolean vectors from Hexagon code...\n";
body = eliminate_bool_vectors(body);
debug(2) << "Lowering after eliminating boolean vectors: " << body << "\n\n";
debug(1) << "Optimizing Hexagon instructions...\n";
body = optimize_hexagon_instructions(body, target);
if (uses_hvx(body)) {
debug(1) << "Adding calls to qurt_hvx_lock...\n";
body = acquire_hvx_context(body, target);
debug(1) << "Hexagon function body:\n";
debug(1) << body << "\n";
// Builds a table describing HVX intrinsic wrappers (LLVM intrinsic id, Halide
// return type, wrapper name suffix, Halide argument types, flags) and defines
// one wrapper per entry through define_hvx_intrinsic.
void CodeGen_Hexagon::init_module() {
// Passed to IPICK to choose between the plain and _128B intrinsic ids.
bool is_128B = target.has_feature(Halide::Target::HVX_128);
// Scalar element types used in the table.
Type i8 = Int(8);
Type i16 = Int(16);
Type i32 = Int(32);
Type u8 = UInt(8);
Type u16 = UInt(16);
Type u32 = UInt(32);
// "v1" types: as many lanes as fit in native_vector_bits().
Type i8v1 = i8.with_lanes(native_vector_bits() / 8);
Type i16v1 = i16.with_lanes(native_vector_bits() / 16);
Type i32v1 = i32.with_lanes(native_vector_bits() / 32);
Type u8v1 = u8.with_lanes(native_vector_bits() / 8);
Type u16v1 = u16.with_lanes(native_vector_bits() / 16);
Type u32v1 = u32.with_lanes(native_vector_bits() / 32);
// "v2" types: twice the lanes of the matching v1 type.
Type i8v2 = i8v1.with_lanes(i8v1.lanes() * 2);
Type i16v2 = i16v1.with_lanes(i16v1.lanes() * 2);
Type i32v2 = i32v1.with_lanes(i32v1.lanes() * 2);
Type u8v2 = u8v1.with_lanes(u8v1.lanes() * 2);
Type u16v2 = u16v1.with_lanes(u16v1.lanes() * 2);
Type u32v2 = u32v1.with_lanes(u32v1.lanes() * 2);
// One row of the wrapper table.
struct HvxIntrinsic {
enum {
// Forwarded to define_hvx_intrinsic as its broadcast_scalar_word argument.
BroadcastScalarsToWords = 1 << 0,
Intrinsic::ID id;
Type ret_type;
const char *name;
vector<Type> arg_types;
// Rows that omit this field get 0 from aggregate initialization.
int flags;
// NOTE(review): the "#if LLVM_VERSION >= 50" lines inside this table have no
// visible matching "#endif" -- confirm the guards against the upstream file.
vector<HvxIntrinsic> intrinsic_wrappers = {
// Widening conversions (zero/sign extend, unpack).
{ IPICK(is_128B, Intrinsic::hexagon_V6_vzb), u16v2, "zxt.vub", {u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vzh), u32v2, "zxt.vuh", {u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsb), i16v2, "sxt.vb", {i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsh), i32v2, "sxt.vh", {i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vunpackub), u16v2, "unpack.vub", {u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vunpackuh), u32v2, "unpack.vuh", {u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vunpackb), i16v2, "unpack.vb", {i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vunpackh), i32v2, "unpack.vh", {i16v1} },
// Narrowing conversions (truncate, saturate, round, pack).
{ IPICK(is_128B, Intrinsic::hexagon_V6_vshuffeb), i8v1, "trunc.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vshufeh), i16v1, "trunc.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vshuffob), i8v1, "trunclo.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vshufoh), i16v1, "trunclo.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsathub), u8v1, "trunc_satub.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsatwh), i16v1, "trunc_sath.vw", {i32v2} },
#if LLVM_VERSION >= 50
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsatuwuh), u16v1, "trunc_satuh.vuw", {u32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vroundhub), u8v1, "trunc_satub_rnd.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vroundhb), i8v1, "trunc_satb_rnd.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vroundwuh), u16v1, "trunc_satuh_rnd.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vroundwh), i16v1, "trunc_sath_rnd.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackhub_sat), u8v1, "pack_satub.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackwuh_sat), u16v1, "pack_satuh.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackhb_sat), i8v1, "pack_satb.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackwh_sat), i16v1, "pack_sath.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackeb), i8v1, "pack.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackeh), i16v1, "pack.vw", {i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackob), i8v1, "packhi.vh", {i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpackoh), i16v1, "packhi.vw", {i32v2} },
// Addition and subtraction (plain, double-vector, widening).
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddb), i8v1, "add.vb.vb", {i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddh), i16v1, "add.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddw), i32v1, "add.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddb_dv), i8v2, "add.vb.vb.dv", {i8v2, i8v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddh_dv), i16v2, "add.vh.vh.dv", {i16v2, i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddw_dv), i32v2, "add.vw.vw.dv", {i32v2, i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddubh), u16v2, "add_vuh.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddhw), i32v2, "add_vw.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vadduhw), u32v2, "add_vuw.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubb), i8v1, "sub.vb.vb", {i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubh), i16v1, "sub.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubw), i32v1, "sub.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubb_dv), i8v2, "sub.vb.vb.dv", {i8v2, i8v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubh_dv), i16v2, "sub.vh.vh.dv", {i16v2, i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubw_dv), i32v2, "sub.vw.vw.dv", {i32v2, i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsububh), u16v2, "sub_vuh.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubhw), i32v2, "sub_vw.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubuhw), u32v2, "sub_vuw.vuh.vuh", {u16v1, u16v1} },
// Saturating addition and subtraction.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddubsat), u8v1, "satub_add.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vadduhsat), u16v1, "satuh_add.vuh.vuh", {u16v1, u16v1} },
#if LLVM_VERSION >= 50
{ IPICK(is_128B, Intrinsic::hexagon_V6_vadduwsat), u32v1, "satuw_add.vuw.vuw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddhsat), i16v1, "sath_add.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddwsat), i32v1, "satw_add.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddubsat_dv), u8v2, "satub_add.vub.vub.dv", {u8v2, u8v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vadduhsat_dv), u16v2, "satuh_add.vuh.vuh.dv", {u16v2, u16v2} },
#if LLVM_VERSION >= 50
{ IPICK(is_128B, Intrinsic::hexagon_V6_vadduwsat_dv), u32v2, "satuw_add.vuw.vuw.dv", {u32v2, u32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddhsat_dv), i16v2, "sath_add.vh.vh.dv", {i16v2, i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaddwsat_dv), i32v2, "satw_add.vw.vw.dv", {i32v2, i32v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsububsat), u8v1, "satub_sub.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubuhsat), u16v1, "satuh_sub.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubhsat), i16v1, "sath_sub.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubwsat), i32v1, "satw_sub.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsububsat_dv), u8v2, "satub_sub.vub.vub.dv", {u8v2, u8v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubuhsat_dv), u16v2, "satuh_sub.vuh.vuh.dv", {u16v2, u16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubhsat_dv), i16v2, "sath_sub.vh.vh.dv", {i16v2, i16v2} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vsubwsat_dv), i32v2, "satw_sub.vw.vw.dv", {i32v2, i32v2} },
// Absolute value and absolute difference.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vabsh), u16v1, "abs.vh", {i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vabsw), u32v1, "abs.vw", {i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vabsdiffub), u8v1, "absd.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vabsdiffuh), u16v1, "absd.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vabsdiffh), u16v1, "absd.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vabsdiffw), u32v1, "absd.vw.vw", {i32v1, i32v1} },
// Averages.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavgub), u8v1, "avg.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavguh), u16v1, "avg.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavgh), i16v1, "avg.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavgw), i32v1, "avg.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavgubrnd), u8v1, "avg_rnd.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavguhrnd), u16v1, "avg_rnd.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavghrnd), i16v1, "avg_rnd.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vavgwrnd), i32v1, "avg_rnd.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnavgub), i8v1, "navg.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnavgh), i16v1, "navg.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnavgw), i32v1, "navg.vw.vw", {i32v1, i32v1} },
// Non-widening multiplies; scalar operands use BroadcastScalarsToWords.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyih), i16v1, "mul.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyihb), i16v1, "mul.vh.b", {i16v1, i8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyiwh), i32v1, "mul.vw.h", {i32v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyiwb), i32v1, "mul.vw.b", {i32v1, i8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyih_acc), i16v1, "add_mul.vh.vh.vh", {i16v1, i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyihb_acc), i16v1, "add_mul.vh.vh.b", {i16v1, i16v1, i8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyiwh_acc), i32v1, "add_mul.vw.vw.h", {i32v1, i32v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyiwb_acc), i32v1, "add_mul.vw.vw.b", {i32v1, i32v1, i8}, HvxIntrinsic::BroadcastScalarsToWords },
// Widening multiplies and multiply-accumulates.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyubv), u16v2, "mpy.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyuhv), u32v2, "mpy.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpybv), i16v2, "mpy.vb.vb", {i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhv), i32v2, "mpy.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyubv_acc), u16v2, "add_mpy.vuh.vub.vub", {u16v2, u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyuhv_acc), u32v2, "add_mpy.vuw.vuh.vuh", {u32v2, u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpybv_acc), i16v2, "add_mpy.vh.vb.vb", {i16v2, i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhv_acc), i32v2, "add_mpy.vw.vh.vh", {i32v2, i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpybusv), i16v2, "mpy.vub.vb", {u8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhus), i32v2, "mpy.vh.vuh", {i16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpybusv_acc), i16v2, "add_mpy.vh.vub.vb", {i16v2, u8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhus_acc), i32v2, "add_mpy.vw.vh.vuh", {i32v2, i16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyub), u16v2, "mpy.vub.ub", {u8v1, u8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyuh), u32v2, "mpy.vuh.uh", {u16v1, u16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyh), i32v2, "mpy.vh.h", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpybus), i16v2, "mpy.vub.b", {u8v1, i8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyub_acc), u16v2, "add_mpy.vuh.vub.ub", {u16v2, u8v1, u8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyuh_acc), u32v2, "add_mpy.vuw.vuh.uh", {u32v2, u16v1, u16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpybus_acc), i16v2, "add_mpy.vh.vub.b", {i16v2, u8v1, i8}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhsat_acc), i32v2, "satw_add_mpy.vw.vh.h", {i32v2, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
// Reducing multiplies (add_4mpy / add_2mpy families).
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpyubv), u32v1, "add_4mpy.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpybv), i32v1, "add_4mpy.vb.vb", {i8v1, i8v1} },
// NOTE(review): the name says ".vub.vb" but the first operand is declared i8v1 -- confirm whether u8v1 was intended.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpybusv), i32v1, "add_4mpy.vub.vb", {i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpyubv_acc), u32v1, "acc_add_4mpy.vuw.vub.vub", {u32v1, u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpybv_acc), i32v1, "acc_add_4mpy.vw.vb.vb", {i32v1, i8v1, i8v1} },
// NOTE(review): same question as above for the ".vub" operand declared as i8v1.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpybusv_acc), i32v1, "acc_add_4mpy.vw.vub.vb", {i32v1, i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vdmpybus), i16v1, "add_2mpy.vub.b", {u8v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vdmpyhb), i32v1, "add_2mpy.vh.b", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vdmpybus_acc), i16v1, "acc_add_2mpy.vh.vub.b", {i16v1, u8v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vdmpyhb_acc), i32v1, "acc_add_2mpy.vw.vh.b", {i32v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpybus), i32v1, "add_4mpy.vub.b", {u8v1, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpyub), u32v1, "add_4mpy.vub.ub", {u8v1, u32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpybus_acc), i32v1, "acc_add_4mpy.vw.vub.b", {i32v1, u8v1, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vrmpyub_acc), u32v1, "acc_add_4mpy.vuw.vub.ub", {u32v1, u8v1, u32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhvsrs), i16v1, "trunc_satw_mpy2_rnd.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhss), i16v1, "trunc_satw_mpy2.vh.h", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmpyhsrs), i16v1, "trunc_satw_mpy2_rnd.vh.h", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords },
// Select and comparisons. The same intrinsic id is reused under several
// wrapper names with different element types.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmux), i8v1, "mux.vb.vb", {i8v1, i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmux), i16v1, "mux.vh.vh", {i16v1, i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmux), i32v1, "mux.vw.vw", {i32v1, i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_veqb), i8v1, "eq.vb.vb", {i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_veqh), i16v1, "eq.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_veqw), i32v1, "eq.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vgtub), i8v1, "gt.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vgtuh), i16v1, "gt.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vgtuw), i32v1, "gt.vuw.vuw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vgtb), i8v1, "gt.vb.vb", {i8v1, i8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vgth), i16v1, "gt.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vgtw), i32v1, "gt.vw.vw", {i32v1, i32v1} },
// Min / max.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmaxub), u8v1, "max.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmaxuh), u16v1, "max.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmaxh), i16v1, "max.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vmaxw), i32v1, "max.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vminub), u8v1, "min.vub.vub", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vminuh), u16v1, "min.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vminh), i16v1, "min.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vminw), i32v1, "min.vw.vw", {i32v1, i32v1} },
// Shifts by vector and by scalar amounts, including accumulating and narrowing forms.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vlsrhv), u16v1, "shr.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vlsrwv), u32v1, "shr.vuw.vuw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrhv), i16v1, "shr.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrwv), i32v1, "shr.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslhv), u16v1, "shl.vuh.vuh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslwv), u32v1, "shl.vuw.vuw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslhv), i16v1, "shl.vh.vh", {i16v1, i16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslwv), i32v1, "shl.vw.vw", {i32v1, i32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vlsrh), u16v1, "shr.vuh.uh", {u16v1, u16} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vlsrw), u32v1, "shr.vuw.uw", {u32v1, u32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrh), i16v1, "shr.vh.h", {i16v1, i16} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrw), i32v1, "shr.vw.w", {i32v1, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslh), u16v1, "shl.vuh.uh", {u16v1, u16} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslw), u32v1, "shl.vuw.uw", {u32v1, u32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslh), i16v1, "shl.vh.h", {i16v1, i16} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslw), i32v1, "shl.vw.w", {i32v1, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrw_acc), i32v1, "add_shr.vw.vw.w", {i32v1, i32v1, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vaslw_acc), i32v1, "add_shl.vw.vw.w", {i32v1, i32v1, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrwh), i16v1, "trunc_shr.vw.w", {i32v2, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrhubsat), u8v1, "trunc_satub_shr.vh.h", {i16v2, i16} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrwuhsat), u16v1, "trunc_satuh_shr.vw.w", {i32v2, i32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vasrwhsat), i16v1, "trunc_sath_shr.vw.w", {i32v2, i32} },
// Bitwise operations.
{ IPICK(is_128B, Intrinsic::hexagon_V6_vand), u8v1, "and.vb.vb", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vand), u16v1, "and.vh.vh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vand), u32v1, "and.vw.vw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vor), u8v1, "or.vb.vb", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vor), u16v1, "or.vh.vh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vor), u32v1, "or.vw.vw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vxor), u8v1, "xor.vb.vb", {u8v1, u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vxor), u16v1, "xor.vh.vh", {u16v1, u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vxor), u32v1, "xor.vw.vw", {u32v1, u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnot), u8v1, "not.vb", {u8v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnot), u16v1, "not.vh", {u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnot), u32v1, "not.vw", {u32v1} },
// Scalar splats and bit-counting operations.
#if LLVM_VERSION >= 50
{ IPICK(is_128B, Intrinsic::hexagon_V6_lvsplatb), u8v1, "splat_v62.b", {u8} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_lvsplath), u16v1, "splat_v62.h", {u16} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_lvsplatw), u32v1, "splat.w", {u32} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vcl0h), u16v1, "clz.vh", {u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vcl0w), u32v1, "clz.vw", {u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnormamth), u16v1, "cls.vh", {u16v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vnormamtw), u32v1, "cls.vw", {u32v1} },
{ IPICK(is_128B, Intrinsic::hexagon_V6_vpopcounth), u16v1, "popcount.vh", {u16v1} },
// Define one wrapper per table row.
// NOTE(review): the first and third arguments of the call below are missing;
// presumably they are the row's id and name -- confirm against upstream.
for (HvxIntrinsic &i : intrinsic_wrappers) {
define_hvx_intrinsic(, i.ret_type,, i.arg_types,
i.flags & HvxIntrinsic::BroadcastScalarsToWords);
// Convenience overload: looks up the declaration of LLVM intrinsic `id` in the
// current module and forwards to the llvm::Function* overload.
//
// id:                    LLVM intrinsic id; must not be Intrinsic::not_intrinsic.
// ret_ty, arg_types:     Halide-level signature of the wrapper.
// name:                  wrapper name suffix.
// broadcast_scalar_word: forwarded unchanged.
// Returns the function produced by the forwarded call.
llvm::Function *CodeGen_Hexagon::define_hvx_intrinsic(int id, Type ret_ty, const string &name,
                                                      const vector<Type> &arg_types, bool broadcast_scalar_word) {
    internal_assert(id != Intrinsic::not_intrinsic);
    llvm::Function *intrin = Intrinsic::getDeclaration(module.get(), (llvm::Intrinsic::ID)id);
    return define_hvx_intrinsic(intrin, ret_ty, name, arg_types, broadcast_scalar_word);
}
// Defines an internal-linkage wrapper function "halide.hexagon.<name>" with the
// Halide-level signature (ret_ty, arg_types) that adapts its arguments to the
// parameter types of `intrin`, calls it, and casts the result back.
//
// intrin:                the function to wrap; must be non-null.
// ret_ty:                Halide return type of the wrapper.
// name:                  suffix appended to "halide.hexagon.".
// arg_types:             Halide argument types of the wrapper (taken by value
//                        because it is edited when the first argument is split).
// broadcast_scalar_word: if true, scalar arguments whose type differs from the
//                        intrinsic's are passed through halide.hexagon.dup4.b
//                        (8-bit) or halide.hexagon.dup2.h (16-bit).
// Returns the newly created wrapper.
llvm::Function *CodeGen_Hexagon::define_hvx_intrinsic(llvm::Function *intrin, Type ret_ty, const string &name,
                                                      vector<Type> arg_types, bool broadcast_scalar_word) {
    internal_assert(intrin) << "Null definition for intrinsic '" << name << "'\n";
    llvm::FunctionType *intrin_ty = intrin->getFunctionType();

    // LLVM types of the wrapper's parameters.
    vector<llvm::Type *> llvm_arg_types;
    llvm_arg_types.reserve(arg_types.size());
    for (Type i : arg_types) {
        llvm_arg_types.push_back(llvm_type_of(i));
    }

    // Create the wrapper and an entry block to build its body in.
    llvm::FunctionType *wrapper_ty =
        llvm::FunctionType::get(llvm_type_of(ret_ty), llvm_arg_types, false);
    llvm::Function *wrapper =
        llvm::Function::Create(wrapper_ty, llvm::GlobalValue::InternalLinkage,
                               "halide.hexagon." + name, module.get());
    llvm::BasicBlock *block = llvm::BasicBlock::Create(module->getContext(), "entry", wrapper);

    // Remember where the builder was so it can be put back afterwards.
    IRBuilderBase::InsertPoint here = builder->saveIP();
    builder->SetInsertPoint(block);

    vector<Value *> args;
    for (Value &arg : wrapper->args()) {
        args.push_back(&arg);
    }

    if (args.size() + 1 == intrin_ty->getNumParams()) {
        // The intrinsic takes one more parameter than the wrapper: split the
        // first (double-vector) argument into its high and low halves.
        Value *dv = args[0];
        int vec_lanes = native_vector_bits()/arg_types[0].bits();
        Value *low = slice_vector(dv, 0, vec_lanes);
        Value *high = slice_vector(dv, vec_lanes, vec_lanes);

        args[0] = high;
        args.insert(args.begin() + 1, low);

        // Keep arg_types index-aligned with args.
        Type split_type = arg_types.front().with_lanes(arg_types.front().lanes() / 2);
        arg_types[0] = split_type;
        arg_types.insert(arg_types.begin() + 1, split_type);
    }

    // Convert each argument to the type the intrinsic expects.
    internal_assert(args.size() == intrin_ty->getNumParams());
    for (size_t i = 0; i < args.size(); i++) {
        llvm::Type *arg_ty = intrin_ty->getParamType(i);
        if (args[i]->getType() != arg_ty) {
            if (arg_ty->isVectorTy()) {
                args[i] = builder->CreateBitCast(args[i], arg_ty);
            } else {
                if (broadcast_scalar_word) {
                    llvm::Function *fn = nullptr;
                    unsigned bits = arg_types[i].bits();
                    // Each case must break: falling through would overwrite
                    // fn and then hit the error unconditionally.
                    switch(bits) {
                    case 8:
                        fn = module->getFunction("halide.hexagon.dup4.b");
                        break;
                    case 16:
                        fn = module->getFunction("halide.hexagon.dup2.h");
                        break;
                    default:
                        internal_error << "unhandled broadcast_scalar_word in define_hvx_intrinsic";
                    }
                    args[i] = builder->CreateCall(fn, { args[i] });
                } else if (args[i]->getType()->isIntegerTy()) {
                    // Sign- or zero-extend according to the Halide type.
                    args[i] = builder->CreateIntCast(args[i], arg_ty, arg_types[i].is_int());
                } else {
                    args[i] = builder->CreateBitCast(args[i], arg_ty);
                }
            }
        }
    }

    // Call the wrapped function and cast the result if needed.
    Value *ret = builder->CreateCall(intrin, args);
    if (ret->getType() != wrapper_ty->getReturnType()) {
        ret = builder->CreateBitCast(ret, wrapper_ty->getReturnType());
    }

    // Terminate the entry block.
    builder->CreateRet(ret);

    // The wrapper is only a thin shim around one call.
    wrapper->addFnAttr(llvm::Attribute::AlwaysInline);

    // Put the builder back where the caller left it.
    builder->restoreIP(here);

    return wrapper;
}
// Returns `v` reinterpreted as type `ty`, avoiding redundant casts:
//  - if `v` is itself a bitcast instruction, the cast is applied to its operand;
//  - if `v` is undef, an undef of type `ty` is returned;
//  - if `v` already has type `ty`, it is returned unchanged.
Value *CodeGen_Hexagon::create_bitcast(Value *v, llvm::Type *ty) {
    if (BitCastInst *c = dyn_cast<BitCastInst>(v)) {
        // Look through the existing cast instead of stacking another one.
        return create_bitcast(c->getOperand(0), ty);
    } else if (isa<UndefValue>(v)) {
        return UndefValue::get(ty);
    } else if (v->getType() != ty) {
        v = builder->CreateBitCast(v, ty);
    }
    return v;
}
// Calls `F` with `Ops`, bitcasting every operand to the matching parameter
// type of `F` and bitcasting the result to `ret_ty`.
//
// ret_ty: type the returned value should have.
// F:      function to call; its parameter count must equal Ops.size().
// Ops:    operands (taken by value; cast in place).
Value *CodeGen_Hexagon::call_intrin_cast(llvm::Type *ret_ty,
                                         llvm::Function *F,
                                         vector<Value *> Ops) {
    llvm::FunctionType *FType = F->getFunctionType();
    internal_assert(FType->getNumParams() == Ops.size());
    for (unsigned I = 0; I < FType->getNumParams(); ++I) {
        Ops[I] = create_bitcast(Ops[I], FType->getParamType(I));
    }
    Value *ret = builder->CreateCall(F, Ops);
    return create_bitcast(ret, ret_ty);
}
// Overload taking an LLVM intrinsic id: looks up its declaration in the current
// module and forwards to the llvm::Function* overload.
Value *CodeGen_Hexagon::call_intrin_cast(llvm::Type *ret_ty,
                                         int id,
                                         vector<Value *> Ops) {
    llvm::Function *intrin = Intrinsic::getDeclaration(module.get(), (llvm::Intrinsic::ID)id);
    return call_intrin_cast(ret_ty, intrin, Ops);
}
// Interleaves the vectors in `v` (all assumed to share the type of v[0]).
//  - Two vectors: uses vshuffb/vshuffh when the result is one native vector of
//    8- or 16-bit elements, otherwise shuffles native-sized chunks with
//    vshuffvdd and concatenates them.
//  - Three vectors of 8- or 16-bit elements: builds the interleave as a table
//    lookup via vlut.
//  - Anything else: defers to CodeGen_Posix::interleave_vectors.
Value *CodeGen_Hexagon::interleave_vectors(const vector<llvm::Value *> &v) {
    bool is_128B = target.has_feature(Halide::Target::HVX_128);
    llvm::Type *v_ty = v[0]->getType();
    llvm::Type *element_ty = v_ty->getVectorElementType();
    int element_bits = element_ty->getScalarSizeInBits();
    int native_elements = native_vector_bits()/element_ty->getScalarSizeInBits();
    int result_elements = v_ty->getVectorNumElements()*v.size();
    if (v.size() == 2) {
        Value *a = v[0];
        Value *b = v[1];

        if (result_elements == native_elements && (element_bits == 8 || element_bits == 16)) {
            // Two half-native vectors: a single vshuff on their concatenation.
            llvm::Type *native_ty = llvm::VectorType::get(element_ty, native_elements);
            Intrinsic::ID vshuff =
                element_bits == 8 ? IPICK(is_128B, Intrinsic::hexagon_V6_vshuffb) : IPICK(is_128B, Intrinsic::hexagon_V6_vshuffh);
            return call_intrin_cast(native_ty, vshuff, {concat_vectors({a, b})});
        } else {
            // Process one native vector from each input at a time.
            llvm::Type *native2_ty = llvm::VectorType::get(element_ty, native_elements*2);
            Value *bytes = codegen(-static_cast<int>(element_bits/8));
            vector<Value *> ret;
            for (int i = 0; i < result_elements/2; i += native_elements) {
                Value *a_i = slice_vector(a, i, native_elements);
                Value *b_i = slice_vector(b, i, native_elements);
                Value *ret_i = call_intrin_cast(native2_ty,
                                                IPICK(is_128B, Intrinsic::hexagon_V6_vshuffvdd),
                                                {b_i, a_i, bytes});
                if ((i + native_elements)*2 > result_elements) {
                    // Last chunk overshoots the result: keep only the
                    // elements that are still owed (2*i are already emitted).
                    ret_i = slice_vector(ret_i, 0, result_elements - i*2);
                }
                ret.push_back(ret_i);
            }
            return concat_vectors(ret);
        }
    } else if (v.size() == 3) {
        if (element_bits == 8 || element_bits == 16) {
            Value *lut = concat_vectors(v);

            // Element i of vector j lives at j*lanes + i in the concatenation.
            std::vector<int> indices;
            for (unsigned i = 0; i < v_ty->getVectorNumElements(); i++) {
                for (size_t j = 0; j < v.size(); j++) {
                    indices.push_back(j * v_ty->getVectorNumElements() + i);
                }
            }

            return vlut(lut, indices);
        }
    }
    return CodeGen_Posix::interleave_vectors(v);
}
namespace {
// Checks whether `indices` (where -1 marks an undefined entry) can be written
// as indices[i] == start + i*stride for every defined entry.
//
// indices: shuffle indices; -1 entries are ignored.
// start:   out-parameter, the ramp value at position 0.
// stride:  out-parameter, the ramp step.
// Returns true when such a ramp exists. With fewer than two defined entries
// any ramp fits, so stride is reported as 1. On a false return, start and
// stride hold the rejected candidate.
bool is_strided_ramp(const std::vector<int> &indices, int &start, int &stride) {
    int size = static_cast<int>(indices.size());

    // Locate the first two defined entries; they determine the candidate ramp.
    int x0 = -1;
    int x1 = -1;
    for (int i = 0; i < size; i++) {
        if (indices[i] != -1) {
            if (x0 == -1) {
                x0 = i;
            } else {
                x1 = i;
                break;
            }
        }
    }

    if (x1 == -1) {
        // Zero or one defined entry: choose a unit stride through it.
        stride = 1;
        start = x0 != -1 ? indices[x0] - x0 : 0;
        return true;
    }

    int dx = x1 - x0;
    int dy = indices[x1] - indices[x0];
    stride = dy/dx;
    start = indices[x0] - stride*x0;

    // Every defined entry must lie on the candidate ramp. This also rejects
    // the case where dy is not a multiple of dx.
    for (int i = 0; i < size; i++) {
        if (indices[i] != -1 && indices[i] != start + i*stride) {
            return false;
        }
    }
    return true;
}
// Returns true if `indices`, after dropping leading and trailing -1 (undefined)
// entries, is a run of consecutive increasing integers. An undefined entry in
// the interior makes the result false. An empty or all-undefined list is true.
bool is_concat_or_slice(const std::vector<int> &indices) {
    // Skip undefined entries at the front.
    size_t begin = 0;
    while (begin < indices.size() && indices[begin] == -1) {
        ++begin;
    }

    // Skip undefined entries at the back.
    size_t end = indices.size();
    while (end > 1 && indices[end - 1] == -1) {
        --end;
    }

    // What is left must increase by exactly one each step.
    for (size_t i = begin; i + 1 < end; i++) {
        if (indices[i] + 1 != indices[i + 1]) {
            return false;
        }
    }

    return true;
}
// Emits a shuffle that selects elements from `a` followed by `b` according to
// `indices`, using HVX intrinsics for the patterns recognised below and
// deferring to CodeGen_Posix::shuffle_vectors otherwise.
// `a` and `b` must have the same LLVM type and `indices` must be non-empty
// (both asserted). An index of -1 is skipped when scanning for the minimum
// and left unchanged when indices are re-based.
// NOTE(review): in this listing the closing braces and some continuation
// lines are absent (the trailing arguments of the two recursive calls in the
// "only reads a" rewrites, and whatever appends ret_i to ret), so the nesting
// described by the comments below is inferred -- confirm against the complete
// file.
Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
const vector<int> &indices) {
llvm::Type *a_ty = a->getType();
llvm::Type *b_ty = b->getType();
internal_assert(a_ty == b_ty);
// is_128B feeds IPICK, which chooses between the plain and _128B intrinsic IDs.
bool is_128B = target.has_feature(Halide::Target::HVX_128);
int a_elements = static_cast<int>(a_ty->getVectorNumElements());
int b_elements = static_cast<int>(b_ty->getVectorNumElements());
llvm::Type *element_ty = a->getType()->getVectorElementType();
int element_bits = element_ty->getScalarSizeInBits();
// native_ty holds native_vector_bits() worth of elements; native2_ty twice that.
int native_elements = native_vector_bits() / element_bits;
llvm::Type *native_ty = llvm::VectorType::get(element_ty, native_elements);
llvm::Type *native2_ty = llvm::VectorType::get(element_ty, native_elements*2);
int result_elements = static_cast<int>(indices.size());
internal_assert(result_elements > 0);
llvm::Type *result_ty = VectorType::get(element_ty, result_elements);
// Rewrite 1: if the smallest index is at or past a's lanes, only b is read;
// re-base the indices by a_elements and recurse with b as the first operand.
// NOTE(review): min is seeded from indices[0] without a -1 check, so when the
// first entry is -1 this rewrite cannot trigger -- confirm that is intended.
int min = indices[0];
for (size_t i = 1; i < indices.size(); i++) {
if (indices[i] != -1 && indices[i] < min) {
min = indices[i];
if (min >= a_elements) {
vector<int> shifted_indices(indices);
for (int &i : shifted_indices) {
if (i != -1) i -= a_elements;
return shuffle_vectors(b, UndefValue::get(b->getType()), shifted_indices);
// Rewrite 2: if the largest index is below a's lane count, only a is read;
// look through a to see whether it was produced by a vcombine or by an
// identity shufflevector.
int max = *std::max_element(indices.begin(), indices.end());
if (max < a_elements) {
BitCastInst *a_cast = dyn_cast<BitCastInst>(a);
CallInst *a_call = dyn_cast<CallInst>(a_cast ? a_cast->getOperand(0) : a);
llvm::Function *vcombine =
Intrinsic::getDeclaration(module.get(), IPICK(is_128B, Intrinsic::hexagon_V6_vcombine));
// a is a (possibly bitcast) vcombine call: shuffle its operands directly,
// with argument 1 passed first and argument 0 second.
if (a_call && a_call->getCalledFunction() == vcombine) {
return shuffle_vectors(
create_bitcast(a_call->getArgOperand(1), native_ty),
create_bitcast(a_call->getArgOperand(0), native_ty),
// a is a shufflevector whose mask maps each of the first a_elements lanes to
// itself or to undef (-1).
} else if (ShuffleVectorInst *a_shuffle = dyn_cast<ShuffleVectorInst>(a)) {
bool is_identity = true;
for (int i = 0; i < a_elements; i++) {
int mask_i = a_shuffle->getMaskValue(i);
is_identity = is_identity && (mask_i == i || mask_i == -1);
if (is_identity) {
return shuffle_vectors(
// Indices that are not a strided ramp: contiguous runs and elements wider
// than 16 bits use the generic lowering; otherwise, if every index is below
// 256, use a table lookup over the concatenation of a and b.
int start = 0, stride = 0;
if (!is_strided_ramp(indices, start, stride)) {
if (is_concat_or_slice(indices) || element_bits > 16) {
return CodeGen_Posix::shuffle_vectors(a, b, indices);
} else if (max < 256) {
return vlut(concat_vectors({a, b}), indices);
return CodeGen_Posix::shuffle_vectors(a, b, indices);
// Stride 1: concatenations and slices.
if (stride == 1) {
// Two native vectors joined into one double vector: a single vcombine, with b
// passed as the first operand.
if (result_ty == native2_ty && a_ty == native_ty && b_ty == native_ty) {
internal_assert(start == 0);
return call_intrin_cast(native2_ty, IPICK(is_128B, Intrinsic::hexagon_V6_vcombine), {b, a});
// One native vector taken from a double vector a: split a into its lo and hi
// halves and continue with those as the new a and b.
if (result_ty == native_ty && a_ty == native2_ty && max < a_elements) {
b = call_intrin_cast(native_ty, IPICK(is_128B, Intrinsic::hexagon_V6_hi), {a});
a = call_intrin_cast(native_ty, IPICK(is_128B, Intrinsic::hexagon_V6_lo), {a});
a_ty = a->getType();
b_ty = b->getType();
a_elements = a_ty->getVectorNumElements();
b_elements = b_ty->getVectorNumElements();
// The ramp is exactly a, or exactly b.
if (start == 0 && result_ty == a_ty) {
return a;
if (start == a_elements && result_ty == b_ty) {
return b;
// A native-sized window over a followed by b: use a byte-align intrinsic.
// When the byte offset, or the offset measured from the other end, is at most
// 7 the valignbi / vlalignbi forms are selected instead of valignb.
if (result_ty == native_ty && a_ty == native_ty && b_ty == native_ty) {
int bytes_off = start * (element_bits / 8);
int reverse_bytes = (native_vector_bits() / 8) - bytes_off;
Intrinsic::ID intrin_id = IPICK(is_128B, Intrinsic::hexagon_V6_valignb);
if (bytes_off <= 7) {
intrin_id = IPICK(is_128B, Intrinsic::hexagon_V6_valignbi);
} else if (reverse_bytes <= 7) {
intrin_id = IPICK(is_128B, Intrinsic::hexagon_V6_vlalignbi);
bytes_off = reverse_bytes;
return call_intrin_cast(native_ty, intrin_id, {b, a, codegen(bytes_off)});
return CodeGen_Posix::shuffle_vectors(a, b, indices);
// Stride 2 where the result has exactly half of the combined lanes: process
// the input one pair of native vectors at a time.
} else if (stride == 2 && result_elements*2 == a_elements + b_elements) {
internal_assert(start == 0 || start == 1);
Value *ab = max < a_elements ? a : concat_vectors({a, b});
vector<Value *> ret;
for (int i = 0; i < result_elements; i += native_elements) {
Value *ab_i0 = slice_vector(ab, i*2, native_elements);
Value *ab_i1 = slice_vector(ab, i*2 + native_elements, native_elements);
Value *ret_i;
// 8-bit elements: vpackeb for start 0, vpackob for start 1.
if (element_bits == 8) {
Intrinsic::ID intrin =
start == 0 ? IPICK(is_128B, Intrinsic::hexagon_V6_vpackeb) : IPICK(is_128B, Intrinsic::hexagon_V6_vpackob);
ret_i = call_intrin_cast(native_ty, intrin, {ab_i1, ab_i0});
// 16-bit elements: vpackeh for start 0, vpackoh for start 1.
} else if (element_bits == 16) {
Intrinsic::ID intrin =
start == 0 ? IPICK(is_128B, Intrinsic::hexagon_V6_vpackeh) : IPICK(is_128B, Intrinsic::hexagon_V6_vpackoh);
ret_i = call_intrin_cast(native_ty, intrin, {ab_i1, ab_i0});
// Other whole-byte widths: vdealvdd with -element_bytes as its third operand,
// then take the lo half for start 0 or the hi half for start 1.
} else if (element_bits%8 == 0) {
int element_bytes = element_bits / 8;
Value *packed = call_intrin_cast(native2_ty,
IPICK(is_128B, Intrinsic::hexagon_V6_vdealvdd),
{ab_i1, ab_i0, ConstantInt::get(i32_t, -element_bytes)});
Intrinsic::ID intrin =
start == 0 ? IPICK(is_128B, Intrinsic::hexagon_V6_lo) : IPICK(is_128B, Intrinsic::hexagon_V6_hi);
ret_i = call_intrin_cast(native_ty, intrin, {packed});
} else {
return CodeGen_Posix::shuffle_vectors(a, b, indices);
// The last piece overshoots the requested lane count, so it is sliced.
// NOTE(review): the third argument is the overshoot
// (i + native_elements - result_elements), not the remaining lane count
// (result_elements - i); if slice_vector's third parameter is a size this
// looks suspicious -- confirm.
if (i + native_elements > result_elements) {
ret_i = slice_vector(ret_i, 0, i + native_elements - result_elements);
return concat_vectors(ret);
// Any other strided ramp: table lookup when elements are at most 16 bits and
// all indices are below 256, generic lowering otherwise.
if (element_bits <= 16 && max < 256) {
return vlut(concat_vectors({a, b}), indices);
} else {
return CodeGen_Posix::shuffle_vectors(a, b, indices);
// Table lookup with runtime indices: looks up the 8-bit elements of `idx` in
// `lut` using the HVX vlut intrinsics and returns the concatenated per-slice
// results sliced to idx's element count.
// Asserted preconditions: idx elements are 8 bits wide, min_index >= 0 and
// max_index <= 255. In the code shown, min_index is used only by that assert.
// 8-bit LUT elements use the vlutvvb / vshuffb family; every other width goes
// down the vlutvwh / vshuffh path, where an element width of 32 bits or more
// raises internal_error.
// NOTE(review): closing braces and some statements (for example whatever
// appends to lut_slices and to result) are absent from this listing -- confirm
// against the complete file.
Value *CodeGen_Hexagon::vlut(Value *lut, Value *idx, int min_index, int max_index) {
bool is_128B = target.has_feature(Halide::Target::HVX_128);
llvm::Type *lut_ty = lut->getType();
llvm::Type *idx_ty = idx->getType();
internal_assert(idx_ty->getScalarSizeInBits() == 8);
internal_assert(min_index >= 0);
internal_assert(max_index <= 255);
// Intrinsic IDs are chosen by LUT element width: plain lookup, accumulating
// lookup, and the shuffle applied to each LUT slice.
Intrinsic::ID vlut_id = Intrinsic::not_intrinsic;
Intrinsic::ID vlut_acc_id = Intrinsic::not_intrinsic;
Intrinsic::ID vshuff_id = Intrinsic::not_intrinsic;
if (lut_ty->getScalarSizeInBits() == 8) {
vlut_id = IPICK(is_128B, Intrinsic::hexagon_V6_vlutvvb);
vlut_acc_id = IPICK(is_128B, Intrinsic::hexagon_V6_vlutvvb_oracc);
vshuff_id = IPICK(is_128B, Intrinsic::hexagon_V6_vshuffb);
} else {
int replicate = lut_ty->getScalarSizeInBits() / 16;
if (replicate > 1) {
internal_error << "LUT with greater than 16 bit entries not implemented.\n";
vlut_id = IPICK(is_128B, Intrinsic::hexagon_V6_vlutvwh);
vlut_acc_id = IPICK(is_128B, Intrinsic::hexagon_V6_vlutvwh_oracc);
vshuff_id = IPICK(is_128B, Intrinsic::hexagon_V6_vshuffh);
// Never look past the end of the table.
max_index = std::min(max_index, static_cast<int>(lut_ty->getVectorNumElements()) - 1);
int native_idx_elements = native_vector_bits()/8;
int native_lut_elements = native_vector_bits()/lut_ty->getScalarSizeInBits();
// Cut the LUT into native-vector-sized slices covering indices 0..max_index
// and pass each slice through the vshuff intrinsic.
vector<Value *> lut_slices;
for (int i = 0; i <= max_index; i += native_lut_elements) {
Value *lut_slice = slice_vector(lut, i, native_lut_elements);
lut_slice = call_intrin_cast(lut_slice->getType(), vshuff_id, {lut_slice});
llvm::Type *native_result_ty =
llvm::VectorType::get(lut_ty->getVectorElementType(), native_idx_elements);
int idx_elements = idx_ty->getVectorNumElements();
// Two lookup passes per LUT slice in 128-byte mode, one otherwise.
int lut_passes = is_128B ? 2 : 1;
vector<Value *> result;
// Process the index vector one native vector of 8-bit indices at a time.
for (int i = 0; i < idx_elements; i += native_idx_elements) {
Value *idx_i = slice_vector(idx, i, native_idx_elements);
// For 16-bit LUTs the index slice is also passed through vshuffb first.
if (lut_ty->getScalarSizeInBits() == 16) {
idx_i = call_intrin_cast(idx_i->getType(), IPICK(is_128B, Intrinsic::hexagon_V6_vshuffb), {idx_i});
Value *result_i = nullptr;
for (int j = 0; j < static_cast<int>(lut_slices.size()); j++) {
for (int k = 0; k < lut_passes; k++) {
// Each pass uses a pair of consecutive i32 selector constants:
// 2*pass_index and 2*pass_index + 1.
int pass_index = lut_passes * j + k;
Value *mask[2] = {
ConstantInt::get(i32_t, 2 * pass_index + 0),
ConstantInt::get(i32_t, 2 * pass_index + 1),
// First pass: start the result with the plain vlut for mask[0], then
// accumulate mask[1] into it.
if (result_i == nullptr) {
result_i = call_intrin_cast(native_result_ty, vlut_id,
{idx_i, lut_slices[j], mask[0]});
result_i = call_intrin_cast(native_result_ty, vlut_acc_id,
{result_i, idx_i, lut_slices[j], mask[1]});
// Later passes are emitted only when max_index reaches the part of the table
// this pass covers; both masks are accumulated into the running result.
} else if (max_index >= pass_index * native_lut_elements / lut_passes) {
for (int m = 0; m < 2; m++) {
result_i = call_intrin_cast(native_result_ty, vlut_acc_id,
{result_i, idx_i, lut_slices[j], mask[m]});
return slice_vector(concat_vectors(result), 0, idx_elements);
// Table lookup with compile-time indices: selects elements of `lut` according
// to the constant `indices`.
// If the largest defined index is at most 255, the indices are materialised
// as an i8 constant vector and passed to the runtime-index vlut overload.
// Otherwise the table is processed in windows starting every 256 entries and
// the per-window results are merged with halide.hexagon.mux. The largest
// index must be below the int16_t maximum (asserted).
// NOTE(review): closing braces and possibly some statements (e.g. handling of
// -1 entries when building llvm_indices) are absent from this listing --
// confirm against the complete file.
Value *CodeGen_Hexagon::vlut(Value *lut, const vector<int> &indices) {
vector<Constant *>llvm_indices;
// Track the range of the defined (non -1) indices while building the i8
// constants.
int min_index = lut->getType()->getVectorNumElements();
int max_index = 0;
for (int i : indices) {
if (i != -1) {
min_index = std::min(min_index, i);
max_index = std::max(max_index, i);
llvm_indices.push_back(ConstantInt::get(i8_t, i));
// A single lookup is enough when every index fits in 8 bits.
if (max_index <= 255) {
return vlut(lut, ConstantVector::get(llvm_indices), min_index, max_index);
llvm::Type *i8x_t = VectorType::get(i8_t, indices.size());
llvm::Type *i16x_t = VectorType::get(i16_t, indices.size());
internal_assert(max_index < std::numeric_limits<int16_t>::max())
<< "vlut of more than 32k elements not supported \n";
// One entry per window: { lookup result for the window, per-lane vector
// later passed to mux as its first argument }.
vector<std::pair<Value *, Value *>> ranges;
for (int min_index_i = 0; min_index_i < max_index; min_index_i += 256) {
// Indices re-based so this window starts at 0, held as i16 constants.
vector<Constant *> llvm_indices;
for (int i : indices) {
llvm_indices.push_back(ConstantInt::get(i16_t, i - min_index_i));
Value *llvm_index = ConstantVector::get(llvm_indices);
Value *minus_one = codegen(make_const(UInt(16, indices.size()), -1));
// NOTE(review): the intrinsic name passed here is an empty string in this
// listing; an unsigned 16-bit comparison of llvm_index against minus_one is
// presumably intended -- confirm the name against the complete file.
Value *use_index = call_intrin(i16x_t, "", {llvm_index, minus_one});
// Narrow both the indices and the per-lane vector to 8-bit lanes.
llvm_index = call_intrin(i8x_t, "halide.hexagon.pack.vh", {llvm_index});
use_index = call_intrin(i8x_t, "halide.hexagon.pack.vh", {use_index});
int range_extent_i = std::min(max_index - min_index_i, 255);
Value *range_i = vlut(slice_vector(lut, min_index_i, range_extent_i), llvm_index, 0, range_extent_i);
ranges.push_back({ range_i, use_index });
// Start from the first window's result and mux in each later window; the mux
// variant is picked by the result's element width (8, 16 or 32 bits).
Value *result = ranges[0].first;
llvm::Type *element_ty = result->getType()->getVectorElementType();
string mux = "halide.hexagon.mux";
switch (element_ty->getScalarSizeInBits()) {
case 8: mux += ".vb.vb"; break;
case 16: mux += ".vh.vh"; break;
case 32: mux += ".vw.vw"; break;
default: internal_error << "Cannot constant select vector of " << element_ty->getScalarSizeInBits() << "\n";
for (size_t i = 1; i < ranges.size(); i++) {
result = call_intrin(result->getType(), mux, {ranges[i].second, ranges[i].first, result});
return result;
namespace {
// Builds the type suffix appended to halide.hexagon.* intrinsic names for a
// single type.
// The suffix starts with ".v" for vector types and "." for scalars, followed
// by a width letter: "b" (8 bits), "h" (16 bits) or "w" (32 bits). Unsigned
// types get a "u" before the width letter ("ub", "uh", "uw") unless
// signed_variants is false, in which case every type uses the plain letters.
// Any other type or bit width reports an internal error and yields "".
string type_suffix(Type type, bool signed_variants = true) {
    string prefix = type.is_vector() ? ".v" : ".";
    if (type.is_int() || !signed_variants) {
        switch (type.bits()) {
        case 8: return prefix + "b";
        case 16: return prefix + "h";
        case 32: return prefix + "w";
        }
    } else if (type.is_uint()) {
        switch (type.bits()) {
        case 8: return prefix + "ub";
        case 16: return prefix + "uh";
        case 32: return prefix + "uw";
        }
    }
    internal_error << "Unsupported HVX type: " << type << "\n";
    return "";
}
// Type suffix for the type of a single expression; forwards to the Type
// overload of type_suffix.
string type_suffix(Expr a, bool signed_variants = true) {
    return type_suffix(a.type(), signed_variants);
}
// Type suffix for a two-operand intrinsic: the suffix of a followed by the
// suffix of b.
string type_suffix(Expr a, Expr b, bool signed_variants = true) {
    return type_suffix(a, signed_variants) + type_suffix(b, signed_variants);
}
// Type suffix for an arbitrary operand list: the per-operand suffixes
// concatenated in order, or "" for an empty list.
string type_suffix(const vector<Expr> &ops, bool signed_variants = true) {
    if (ops.empty()) return "";
    string suffix = type_suffix(ops.front(), signed_variants);
    for (size_t i = 1; i < ops.size(); i++) {
        suffix = suffix + type_suffix(ops[i], signed_variants);
    }
    return suffix;
}
// Calls the function `name` from the current module to produce a value of
// Halide type `result_type` from Expr arguments.
// When `maybe` is true and no such function exists, returns nullptr;
// otherwise a missing function trips the internal assert.
// If result_type has at least twice as many lanes as the function's return
// vector and a function named name + ".dv" exists, that function is used
// instead.
// NOTE(review): the final return statement is cut off in this listing, so the
// arguments forwarded to the inner call_intrin are not visible here.
Value *CodeGen_Hexagon::call_intrin(Type result_type, const string &name,
vector<Expr> args, bool maybe) {
llvm::Function *fn = module->getFunction(name);
if (maybe && !fn) return nullptr;
internal_assert(fn) << "Function '" << name << "' not found\n";
if (fn->getReturnType()->getVectorNumElements()*2 <= static_cast<unsigned>(result_type.lanes())) {
llvm::Function *fn2 = module->getFunction(name + ".dv");
if (fn2) {
fn = fn2;
return call_intrin(result_type,
// Same as the Expr-argument call_intrin overload, but for an LLVM result type
// and already-generated LLVM argument values.
// When `maybe` is true and no function called `name` exists in the module,
// returns nullptr; otherwise a missing function trips the internal assert.
// If result_type has at least twice as many elements as the function's return
// vector and a function named name + ".dv" exists, that function is used
// instead.
// NOTE(review): the final return statement is cut off in this listing, so the
// arguments forwarded to the inner call_intrin are not visible here.
Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name,
vector<Value *> args, bool maybe) {
llvm::Function *fn = module->getFunction(name);
if (maybe && !fn) return nullptr;
internal_assert(fn) << "Function '" << name << "' not found\n";
if (fn->getReturnType()->getVectorNumElements()*2 <= result_type->getVectorNumElements()) {
llvm::Function *fn2 = module->getFunction(name + ".dv");
if (fn2) {
fn = fn2;
return call_intrin(result_type,
// Returns the CPU name string for this target: "hexagonv62" when the HVX_v62
// feature is enabled, "hexagonv60" otherwise.
string CodeGen_Hexagon::mcpu() const {
    if (target.has_feature(Halide::Target::HVX_v62)) {
        return "hexagonv62";
    } else {
        return "hexagonv60";
    }
}
// Returns the comma-separated target attribute string: "+hvx-double" when the
// HVX_128 feature is enabled and "+hvx" otherwise, always followed by
// ",+long-calls".
string CodeGen_Hexagon::mattrs() const {
    std::stringstream attrs;
    if (target.has_feature(Halide::Target::HVX_128)) {
        attrs << "+hvx-double";
    } else {
        attrs << "+hvx";
    }
    attrs << ",+long-calls";
    return attrs.str();
}
// Reports whether the soft-float ABI is used; always false for this backend.
bool CodeGen_Hexagon::use_soft_float_abi() const {
    return false;
}
// Returns the width of one native vector in bits: 128*8 when the HVX_128
// feature is enabled, 64*8 otherwise.
int CodeGen_Hexagon::native_vector_bits() const {
    if (target.has_feature(Halide::Target::HVX_128)) {
        return 128*8;
    } else {
        return 64*8;
    }
}
// Code generation for Add nodes.
// Vector adds call the halide.hexagon.add intrinsic whose name suffix is
// built from both operand types with signed_variants == false.
// NOTE(review): the body of the else (non-vector) branch and the closing
// braces are absent from this listing.
void CodeGen_Hexagon::visit(const Add *op) {
if (op->type.is_vector()) {
value = call_intrin(op->type,
"halide.hexagon.add" + type_suffix(op->a, op->b, false),
{op->a, op->b});
} else {
// Code generation for Sub nodes.
// Vector subtractions call the halide.hexagon.sub intrinsic whose name suffix
// is built from both operand types with signed_variants == false.
// NOTE(review): the body of the else (non-vector) branch and the closing
// braces are absent from this listing.
void CodeGen_Hexagon::visit(const Sub *op) {
if (op->type.is_vector()) {
value = call_intrin(op->type,
"halide.hexagon.sub" + type_suffix(op->a, op->b, false),
{op->a, op->b});
} else {
namespace {
// If x is a Broadcast node, returns the value being broadcast; otherwise
// returns x unchanged.
Expr maybe_scalar(Expr x) {
    const Broadcast *xb = x.as<Broadcast>();
    if (xb) {
        return xb->value;
    } else {
        return x;
    }
}
// Code generation for Mul nodes.
// For vector multiplies, in order:
//   1. try halide.hexagon.mul<suffix> (optional lookup) and return if found;
//   2. otherwise try halide.hexagon.mpy<suffix> (optional lookup); when that
//      exists, its result is passed through halide.hexagon.trunc<suffix of
//      the double-width type> to get back to op->type;
//   3. otherwise report an internal error naming the operand types.
// NOTE(review): the arguments of the trunc call, the control flow around the
// internal_error, the else (non-vector) body and the closing braces are absent
// from this listing -- confirm against the complete file.
void CodeGen_Hexagon::visit(const Mul *op) {
if (op->type.is_vector()) {
value = call_intrin(op->type,
"halide.hexagon.mul" + type_suffix(op->a, op->b),
{op->a, op->b},
true );
if (value) return;
value = call_intrin(op->type,
"halide.hexagon.mpy" + type_suffix(op->a, op->b),
{op->a, op->b},
true );
if (value) {
// Type with the same lanes as op->type but twice the bits per element.
Type wide = op->type.with_bits(op->type.bits()*2);
value = call_intrin(llvm_type_of(op->type),
"halide.hexagon.trunc" + type_suffix(wide, false),
internal_error << "Unhandled HVX multiply "
<< op->a.type() << "*" << op->b.type() << "\n"
<< Expr(op) << "\n";
} else {
// Builds the expression for a multiply-high followed by a right shift.
// For 8- and 16-bit vector types this is expressed with Hexagon intrinsics:
// a halide.hexagon.mpy call producing the double-width product, then a
// halide.hexagon.trunclo call narrowing it back to a's type (presumably
// keeping the high half of each product -- confirm against the intrinsic's
// definition), then `>> shr` when shr is non-zero.
// All other types defer to CodeGen_Posix::mulhi_shr.
Expr CodeGen_Hexagon::mulhi_shr(Expr a, Expr b, int shr) {
    Type ty = a.type();
    if (ty.is_vector() && (ty.bits() == 8 || ty.bits() == 16)) {
        Type wide_ty = ty.with_bits(ty.bits() * 2);

        // Product in the double-width type.
        Expr p_wide = Call::make(wide_ty, "halide.hexagon.mpy" + type_suffix(a, b),
                                 {a, b}, Call::PureExtern);

        // Narrow back to the original element width.
        Expr p = Call::make(ty, "halide.hexagon.trunclo" + type_suffix(p_wide, false),
                            {p_wide}, Call::PureExtern);

        if (shr != 0) {
            p = p >> shr;
        }
        return p;
    } else {
        return CodeGen_Posix::mulhi_shr(a, b, shr);
    }
}
// Builds the expression for the average of a and b.
// Vector types that are unsigned 8/16-bit or signed 16/32-bit become a
// PureExtern call to halide.hexagon.avg<suffix>; every other type defers to
// CodeGen_Posix::sorted_avg.
Expr CodeGen_Hexagon::sorted_avg(Expr a, Expr b) {
    Type ty = a.type();
    if (ty.is_vector() && ((ty.is_uint() && (ty.bits() == 8 || ty.bits() == 16)) ||
                           (ty.is_int() && (ty.bits() == 16 || ty.bits() == 32)))) {
        return Call::make(ty, "halide.hexagon.avg" + type_suffix(a, b),
                          {a, b}, Call::PureExtern);
    } else {
        return CodeGen_Posix::sorted_avg(a, b);
    }
}
// Code generation for Div nodes.
// NOTE(review): the body of this function is absent from this listing.
void CodeGen_Hexagon::visit(const Div *op) {
// Code generation for Cast nodes.
// NOTE(review): the body of this function is absent from this listing.
void CodeGen_Hexagon::visit(const Cast *op) {
// Code generation for Call nodes.
// Only Extern, Intrinsic, PureExtern and PureIntrinsic call types are
// accepted (asserted below).
// NOTE(review): closing braces and several statements (returns, fallbacks to
// the base class, some call arguments) are absent from this listing, so the
// branch structure described below is inferred -- confirm against the complete
// file.
void CodeGen_Hexagon::visit(const Call *op) {
internal_assert(op->call_type == Call::Extern ||
op->call_type == Call::Intrinsic ||
op->call_type == Call::PureExtern ||
op->call_type == Call::PureIntrinsic)
<< "Can only codegen extern calls and intrinsics\n";
// Maps a Halide intrinsic name to { halide.hexagon.* base name, flag }. The
// flag is forwarded as type_suffix's signed_variants argument when the full
// intrinsic name is built below.
static std::map<string, std::pair<string, bool>> functions = {
{ Call::abs, { "halide.hexagon.abs", true } },
{ Call::absd, { "halide.hexagon.absd", true } },
{ Call::bitwise_and, { "halide.hexagon.and", false } },
{ Call::bitwise_or, { "halide.hexagon.or", false } },
{ Call::bitwise_xor, { "halide.hexagon.xor", false } },
{ Call::bitwise_not, { "halide.hexagon.not", false } },
{ Call::count_leading_zeros, { "halide.hexagon.clz", false } },
{ Call::popcount, { "halide.hexagon.popcount", false } },
// Native interleaves and deinterleaves must have a lane count that is a
// multiple of native_vector_bits() * 2 / bits.
if (is_native_interleave(op) || is_native_deinterleave(op)) {
user_assert(op->type.lanes() % (native_vector_bits() * 2 / op->type.bits()) == 0)
<< "Interleave or deinterleave will result in miscompilation, "
<< "see\n" << Expr(op) << "\n";
// Calls already named halide.hexagon.* are emitted directly by name.
if (starts_with(op->name, "halide.hexagon.")) {
value = call_intrin(op->type, op->name, op->args);
if (op->type.is_vector()) {
// Table-driven intrinsics: build the name from the table entry plus the
// operand type suffix; the lookup is optional, so a missing function falls
// through.
auto i = functions.find(op->name);
if (i != functions.end()) {
string intrin =
i->second.first + type_suffix(op->args, i->second.second);
value = call_intrin(op->type, intrin, op->args, true );
if (value) return;
// Shifts: a Broadcast shift amount is replaced by the value it broadcasts
// (see maybe_scalar) before the suffix is computed.
} else if (op->is_intrinsic(Call::shift_left) ||
op->is_intrinsic(Call::shift_right)) {
internal_assert(op->args.size() == 2);
string instr = op->is_intrinsic(Call::shift_left) ? "halide.hexagon.shl" : "halide.hexagon.shr";
Expr b = maybe_scalar(op->args[1]);
value = call_intrin(op->type,
instr + type_suffix(op->args[0], b),
{op->args[0], b});
// dynamic_shuffle(lut, idx, min_index, max_index): the two bounds must be
// integer constants; lowered with the runtime-index vlut.
} else if (op->is_intrinsic("dynamic_shuffle")) {
internal_assert(op->args.size() == 4);
const int64_t *min_index = as_const_int(op->args[2]);
const int64_t *max_index = as_const_int(op->args[3]);
internal_assert(min_index && max_index);
Value *lut = codegen(op->args[0]);
Value *idx = codegen(op->args[1]);
value = vlut(lut, idx, *min_index, *max_index);
// select_mask: halide.hexagon.mux with the suffix taken from args[1] and
// args[2] (signed_variants == false).
} else if (op->is_intrinsic(Call::select_mask)) {
internal_assert(op->args.size() == 3);
value = call_intrin(op->type,
"halide.hexagon.mux" +
type_suffix(op->args[1], op->args[2], false),
} else if (op->is_intrinsic(Call::cast_mask)) {
internal_error << "cast_mask should already have been handled in HexagonOptimize\n";
// bool_to_mask: for a scalar argument the equivalent expression is the
// negation of the argument cast to op->type. The vector case body is not
// shown in this listing.
if (op->is_intrinsic(Call::bool_to_mask)) {
internal_assert(op->args.size() == 1);
if (op->args[0].type().is_vector()) {
} else {
Expr equiv = -Cast::make(op->type, op->args[0]);
// extract_mask_element: extract the element at args[1] from args[0] and cast
// it to Bool. NOTE(review): `index` is dereferenced without a visible null
// check here.
} else if (op->is_intrinsic(Call::extract_mask_element)) {
internal_assert(op->args.size() == 2);
const int64_t *index = as_const_int(op->args[1]);
value = codegen(Cast::make(Bool(), Shuffle::make_extract_element(op->args[0], *index)));
// prefetch: exactly 4 or 6 arguments are accepted. The pointer argument is
// formed from args[0] and args[1]; _halide_prefetch is called for the
// 4-argument form and _halide_prefetch_2d for the 6-argument form, with the
// pointer bitcast to the callee's first parameter type.
// NOTE(review): the statements that append the remaining call arguments
// (presumably derived from extent_0_bytes / stride_1_bytes) are not shown.
if (op->is_intrinsic(Call::prefetch)) {
internal_assert((op->args.size() == 4) || (op->args.size() == 6))
<< "Hexagon only supports 1D or 2D prefetch\n";
vector<llvm::Value *> args;
args.push_back(codegen_buffer_pointer(codegen(op->args[0]), op->type, op->args[1]));
Expr extent_0_bytes = op->args[2] * op->args[3] * op->type.bytes();
llvm::Function *prefetch_fn = nullptr;
if (op->args.size() == 4) {
prefetch_fn = module->getFunction("_halide_prefetch");
} else {
prefetch_fn = module->getFunction("_halide_prefetch_2d");
Expr stride_1_bytes = op->args[5] * op->type.bytes();
llvm::Type *ptr_type = prefetch_fn->getFunctionType()->params()[0];
args[0] = builder->CreateBitCast(args[0], ptr_type);
value = builder->CreateCall(prefetch_fn, args);
// Code generation for Broadcast nodes.
// Broadcasts whose total width (lanes * bits) is at most 32 bits take the
// first branch, whose body is not shown in this listing. Wider broadcasts call
// a halide.hexagon.splat intrinsic; on HVX_v62 targets with an 8- or 16-bit
// value the "_v62" variant of the name is used.
// NOTE(review): the argument list of the call_intrin call and the closing
// braces are absent from this listing.
void CodeGen_Hexagon::visit(const Broadcast *op) {
if (op->lanes * op->type.bits() <= 32) {
} else {
string v62_suffix = "";
if (target.has_feature(Target::HVX_v62) && (op->value.type().bits() == 8 || op->value.type().bits() == 16))
v62_suffix = "_v62";
value = call_intrin(op->type,
"halide.hexagon.splat" + v62_suffix + type_suffix(op->value, false),
// Code generation for Max nodes.
// Vector max first tries halide.hexagon.max<suffix> as an optional lookup. If
// no such intrinsic exists, it generates select_mask(a > b, a, b) instead,
// after common subexpression elimination.
// NOTE(review): the body of the else (non-vector) branch and the closing
// braces are absent from this listing.
void CodeGen_Hexagon::visit(const Max *op) {
if (op->type.is_vector()) {
value = call_intrin(op->type,
"halide.hexagon.max" + type_suffix(op->a, op->b),
{op->a, op->b},
true );
if (!value) {
Expr equiv =
Call::make(op->type, Call::select_mask, {op->a > op->b, op->a, op->b}, Call::PureIntrinsic);
equiv = common_subexpression_elimination(equiv);
value = codegen(equiv);
} else {
// Code generation for Min nodes.
// Vector min first tries halide.hexagon.min<suffix> as an optional lookup. If
// no such intrinsic exists, it generates select_mask(a > b, b, a) instead,
// after common subexpression elimination.
// NOTE(review): the body of the else (non-vector) branch and the closing
// braces are absent from this listing.
void CodeGen_Hexagon::visit(const Min *op) {
if (op->type.is_vector()) {
value = call_intrin(op->type,
"halide.hexagon.min" + type_suffix(op->a, op->b),
{op->a, op->b},
true );
if (!value) {
Expr equiv =
Call::make(op->type, Call::select_mask, {op->a > op->b, op->b, op->a}, Call::PureIntrinsic);
equiv = common_subexpression_elimination(equiv);
value = codegen(equiv);
} else {
// Code generation for Select nodes.
// The condition must be scalar (asserted). A select with a vector result is
// generated as an if_then_else call on the same condition and values.
// NOTE(review): the final argument of Call::make, the body of the else branch
// and the closing braces are absent from this listing.
void CodeGen_Hexagon::visit(const Select *op) {
internal_assert(op->condition.type().is_scalar()) << Expr(op) << "\n";
if (op->type.is_vector()) {
value = codegen(Call::make(op->type, Call::if_then_else,
{op->condition, op->true_value, op->false_value},
} else {
// Code generation for GT (a > b) nodes.
// Vector comparisons call an intrinsic whose result type comes from
// eliminated_bool_type(op->type, op->a.type()).
// NOTE(review): the intrinsic base name is an empty string in this listing,
// so the looked-up name would be just the type suffix; by analogy with the EQ
// visitor ("halide.hexagon.eq") it is presumably "halide.hexagon.gt" --
// confirm against the complete file. The else (non-vector) body and closing
// braces are also absent.
void CodeGen_Hexagon::visit(const GT *op) {
if (op->type.is_vector()) {
value = call_intrin(eliminated_bool_type(op->type, op->a.type()),
"" + type_suffix(op->a, op->b),
{op->a, op->b});
} else {
// Code generation for EQ (a == b) nodes.
// Vector comparisons call halide.hexagon.eq<suffix>, with the suffix built
// with signed_variants == false and the result type taken from
// eliminated_bool_type(op->type, op->a.type()).
// NOTE(review): the body of the else (non-vector) branch and the closing
// braces are absent from this listing.
void CodeGen_Hexagon::visit(const EQ *op) {
if (op->type.is_vector()) {
value = call_intrin(eliminated_bool_type(op->type, op->a.type()),
"halide.hexagon.eq" + type_suffix(op->a, op->b, false),
{op->a, op->b});
} else {
// Code generation for GE (a >= b) nodes.
// The vector case is rewritten as !(b > a) and passed through
// eliminate_bool_vectors.
// NOTE(review): the statement that generates code for `ge`, the else
// (non-vector) body and the closing braces are absent from this listing.
void CodeGen_Hexagon::visit(const GE *op) {
if (op->type.is_vector()) {
Expr ge = Not::make(GT::make(op->b, op->a));
ge = eliminate_bool_vectors(ge);
} else {
// Code generation for LE (a <= b) nodes.
// The vector case is rewritten as !(a > b) and passed through
// eliminate_bool_vectors.
// NOTE(review): the statement that generates code for `le`, the else
// (non-vector) body and the closing braces are absent from this listing.
void CodeGen_Hexagon::visit(const LE *op) {
if (op->type.is_vector()) {
Expr le = Not::make(GT::make(op->a, op->b));
le = eliminate_bool_vectors(le);
} else {
// Code generation for LT (a < b) nodes.
// The vector case is rewritten as b > a.
// NOTE(review): the statement that generates code for `lt`, the else
// (non-vector) body and the closing braces are absent from this listing.
void CodeGen_Hexagon::visit(const LT *op) {
if (op->type.is_vector()) {
Expr lt = GT::make(op->b, op->a);
} else {
// Code generation for NE (a != b) nodes.
// The vector case is rewritten as !(a == b) and passed through
// eliminate_bool_vectors.
// NOTE(review): the statement that generates code for `eq`, the else
// (non-vector) body and the closing braces are not visible in this listing.
void CodeGen_Hexagon::visit(const NE *op) {
if (op->type.is_vector()) {
Expr eq = Not::make(EQ::make(op->a, op->b));
eq = eliminate_bool_vectors(eq);
} else {