This source file includes the following definitions:
- add_kernel
- init_module
- simt_intrinsic
- visit (For)
- visit (Allocate)
- visit (Free)
- visit (AssertStmt)
- march
- mcpu
- mattrs
- use_soft_float_abi
- compile_to_src
- native_vector_bits
- get_current_kernel_name
- dump
- print_gpu_name
#include "CodeGen_PTX_Dev.h"
#include "CodeGen_Internal.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "Debug.h"
#include "Target.h"
#include "LLVM_Headers.h"
#include "LLVM_Runtime_Linker.h"
#ifdef WITH_PTX
#if LLVM_VERSION >= 39
namespace llvm { FunctionPass *createNVVMReflectPass(const StringMap<int>& Mapping); }
#else
namespace llvm { ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping); }
#endif
#endif
namespace Halide {
namespace Internal {
using std::vector;
using std::string;
using namespace llvm;
CodeGen_PTX_Dev::CodeGen_PTX_Dev(Target host) : CodeGen_LLVM(host) {
#if !(WITH_PTX)
user_error << "ptx not enabled for this build of Halide.\n";
#endif
user_assert(llvm_NVPTX_enabled) << "llvm build not configured with nvptx target enabled.\n";
context = new llvm::LLVMContext();
}
CodeGen_PTX_Dev::~CodeGen_PTX_Dev() {
module.reset();
delete context;
}
void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
const std::string &name,
const std::vector<DeviceArgument> &args) {
internal_assert(module != nullptr);
debug(2) << "In CodeGen_PTX_Dev::add_kernel\n";
vector<llvm::Type *> arg_types(args.size());
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
arg_types[i] = llvm_type_of(UInt(8))->getPointerTo();
} else {
arg_types[i] = llvm_type_of(args[i].type);
}
}
FunctionType *func_t = FunctionType::get(void_t, arg_types, false);
function = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
set_function_attributes_for_target(function, target);
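// Buffer arguments never alias one another, so mark them noalias for the optimizer.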
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
function->setDoesNotAlias(i+1);
}
}
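// Create the entry block (which will also hold any allocas) and bind each
// LLVM argument to its Halide name in the symbol table.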
entry_block = BasicBlock::Create(*context, "entry", function);
builder->SetInsertPoint(entry_block);
vector<string> arg_sym_names;
{
size_t i = 0;
for (auto &fn_arg : function->args()) {
string arg_sym_name = args[i].name;
sym_push(arg_sym_name, &fn_arg);
fn_arg.setName(arg_sym_name);
arg_sym_names.push_back(arg_sym_name);
i++;
}
}
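// Generate the kernel body in its own block. Once codegen is done, the entry
// block branches to it, so allocas placed in the entry block dominate all uses.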
BasicBlock *body_block = BasicBlock::Create(*context, "body", function);
builder->SetInsertPoint(body_block);
debug(1) << "Generating llvm bitcode for kernel...\n";
stmt.accept(this);
builder->CreateRetVoid();
builder->SetInsertPoint(entry_block);
builder->CreateBr(body_block);
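// Tag the function as a CUDA kernel entry point via the nvvm.annotations metadata.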
llvm::Metadata *md_args[] = {
llvm::ValueAsMetadata::get(function),
MDString::get(*context, "kernel"),
llvm::ValueAsMetadata::get(ConstantInt::get(i32_t, 1))
};
MDNode *md_node = MDNode::get(*context, md_args);
module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(md_node);
verifyFunction(*function);
verifyModule(*module);
debug(2) << "Done generating llvm bitcode for PTX\n";
for (size_t i = 0; i < arg_sym_names.size(); i++) {
sym_pop(arg_sym_names[i]);
}
}
void CodeGen_PTX_Dev::init_module() {
init_context();
#ifdef WITH_PTX
module = get_initial_module_for_ptx_device(target, context);
#endif
}
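// Map a Halide GPU loop variable name onto the NVVM special-register intrinsic
// that reads the corresponding thread or block index.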
string CodeGen_PTX_Dev::simt_intrinsic(const string &name) {
if (ends_with(name, ".__thread_id_x")) {
return "llvm.nvvm.read.ptx.sreg.tid.x";
} else if (ends_with(name, ".__thread_id_y")) {
return "llvm.nvvm.read.ptx.sreg.tid.y";
} else if (ends_with(name, ".__thread_id_z")) {
return "llvm.nvvm.read.ptx.sreg.tid.z";
} else if (ends_with(name, ".__thread_id_w")) {
return "llvm.nvvm.read.ptx.sreg.tid.w";
} else if (ends_with(name, ".__block_id_x")) {
return "llvm.nvvm.read.ptx.sreg.ctaid.x";
} else if (ends_with(name, ".__block_id_y")) {
return "llvm.nvvm.read.ptx.sreg.ctaid.y";
} else if (ends_with(name, ".__block_id_z")) {
return "llvm.nvvm.read.ptx.sreg.ctaid.z";
} else if (ends_with(name, ".__block_id_w")) {
return "llvm.nvvm.read.ptx.sreg.ctaid.w";
}
internal_error << "simt_intrinsic called on bad variable name\n";
return "";
}
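// GPU loop variables do not become actual loops: each one is replaced by a
// read of the matching SIMT index intrinsic.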
void CodeGen_PTX_Dev::visit(const For *loop) {
if (is_gpu_var(loop->name)) {
Expr simt_idx = Call::make(Int(32), simt_intrinsic(loop->name), std::vector<Expr>(), Call::Extern);
internal_assert(is_zero(loop->min));
sym_push(loop->name, codegen(simt_idx));
codegen(loop->body);
sym_pop(loop->name);
} else {
CodeGen_LLVM::visit(loop);
}
}
void CodeGen_PTX_Dev::visit(const Allocate *alloc) {
user_assert(!alloc->new_expr.defined()) << "Allocate node inside PTX kernel has custom new expression.\n" <<
"(Memoization is not supported inside GPU kernels at present.)\n";
if (alloc->name == "__shared") {
Value *shared_base = Constant::getNullValue(PointerType::get(i8_t, 3));
sym_push(alloc->name, shared_base);
} else {
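// All other allocations must have a constant size; they are lowered to a
// fixed-size alloca in the entry block so the allocation dominates every use.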
debug(2) << "Allocate " << alloc->name << " on device\n";
string allocation_name = alloc->name;
debug(3) << "Pushing allocation called " << allocation_name << " onto the symbol table\n";
int32_t size = alloc->constant_allocation_size();
user_assert(size > 0)
<< "Allocation " << alloc->name << " has a dynamic size. "
<< "Only fixed-size allocations are supported on the gpu. "
<< "Try storing into shared memory instead.";
BasicBlock *here = builder->GetInsertBlock();
builder->SetInsertPoint(entry_block);
Value *ptr = builder->CreateAlloca(llvm_type_of(alloc->type), ConstantInt::get(i32_t, size));
builder->SetInsertPoint(here);
sym_push(allocation_name, ptr);
}
codegen(alloc->body);
}
void CodeGen_PTX_Dev::visit(const Free *f) {
sym_pop(f->name);
}
void CodeGen_PTX_Dev::visit(const AssertStmt *op) {
Expr trap = Call::make(Int(32), "halide_ptx_trap", {}, Call::Extern);
codegen(IfThenElse::make(!op->condition, Evaluate::make(trap)));
}
string CodeGen_PTX_Dev::march() const {
return "nvptx64";
}
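// Select the PTX SM target from the CUDA capability feature flags, defaulting to sm_20.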
string CodeGen_PTX_Dev::mcpu() const {
if (target.has_feature(Target::CUDACapability50)) {
return "sm_50";
} else if (target.has_feature(Target::CUDACapability35)) {
return "sm_35";
} else if (target.has_feature(Target::CUDACapability32)) {
return "sm_32";
} else if (target.has_feature(Target::CUDACapability30)) {
return "sm_30";
} else {
return "sm_20";
}
}
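// sm_32 and sm_50 require at least PTX ISA 4.0; other targets use the default ISA version.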
string CodeGen_PTX_Dev::mattrs() const {
if (target.features_any_of({Target::CUDACapability32,
Target::CUDACapability50})) {
return "+ptx40";
} else {
return "";
}
}
bool CodeGen_PTX_Dev::use_soft_float_abi() const {
return false;
}
vector<char> CodeGen_PTX_Dev::compile_to_src() {
#ifdef WITH_PTX
debug(2) << "In CodeGen_PTX_Dev::compile_to_src";
llvm::Triple triple(module->getTargetTriple());
std::string err_str;
const llvm::Target *target = TargetRegistry::lookupTarget(triple.str(), err_str);
internal_assert(target) << err_str << "\n";
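// Relax floating-point constraints for device code: allow fused multiply-adds
// and unsafe math, and assume no NaNs or infinities.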
TargetOptions options;
#if LLVM_VERSION < 50
options.LessPreciseFPMADOption = true;
#endif
options.PrintMachineCode = false;
options.AllowFPOpFusion = FPOpFusion::Fast;
options.UnsafeFPMath = true;
options.NoInfsFPMath = true;
options.NoNaNsFPMath = true;
options.HonorSignDependentRoundingFPMathOption = false;
options.NoZerosInBSS = false;
options.GuaranteedTailCallOpt = false;
options.StackAlignmentOverride = 0;
std::unique_ptr<TargetMachine>
target_machine(target->createTargetMachine(triple.str(),
mcpu(), mattrs(), options,
llvm::Reloc::PIC_,
llvm::CodeModel::Default,
CodeGenOpt::Aggressive));
internal_assert(target_machine.get()) << "Could not allocate target machine!";
llvm::SmallString<8> outstr;
raw_svector_ostream ostream(outstr);
ostream.SetUnbuffered();
legacy::FunctionPassManager function_pass_manager(module.get());
legacy::PassManager module_pass_manager;
module_pass_manager.add(createTargetTransformInfoWrapperPass(target_machine->getTargetIRAnalysis()));
function_pass_manager.add(createTargetTransformInfoWrapperPass(target_machine->getTargetIRAnalysis()));
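// Flush single-precision denormals to zero. With LLVM <= 4.0 this is
// communicated to the NVVMReflect pass via __CUDA_FTZ; newer LLVM uses the
// nvvm-reflect-ftz module flag plus a per-function nvptx-f32ftz attribute.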
#define kDefaultDenorms 0
#define kFTZDenorms 1
#if LLVM_VERSION <= 40
StringMap<int> reflect_mapping;
reflect_mapping[StringRef("__CUDA_FTZ")] = kFTZDenorms;
module_pass_manager.add(createNVVMReflectPass(reflect_mapping));
#else
module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
kFTZDenorms);
if (kFTZDenorms) {
for (llvm::Function &fn : *module) {
fn.addFnAttr("nvptx-f32ftz", "true");
}
}
#endif
PassManagerBuilder b;
b.OptLevel = 3;
#if LLVM_VERSION >= 50
b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
#else
b.Inliner = createFunctionInliningPass(b.OptLevel, 0);
#endif
b.LoopVectorize = true;
b.SLPVectorize = true;
#if LLVM_VERSION > 40
target_machine->adjustPassManager(b);
#endif
b.populateFunctionPassManager(function_pass_manager);
b.populateModulePassManager(module_pass_manager);
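// Ask the target machine to emit verbose PTX assembly into the in-memory stream.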
target_machine->Options.MCOptions.AsmVerbose = true;
bool fail = target_machine->addPassesToEmitFile(module_pass_manager, ostream,
TargetMachine::CGFT_AssemblyFile,
true);
if (fail) {
internal_error << "Failed to set up passes to emit PTX source\n";
}
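// Run the function passes over every function, then the module passes
// (which include the codegen passes that actually print the PTX).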
function_pass_manager.doInitialization();
for (llvm::Module::iterator i = module->begin(); i != module->end(); i++) {
function_pass_manager.run(*i);
}
function_pass_manager.doFinalization();
module_pass_manager.run(*module);
#if LLVM_VERSION < 38
ostream.flush();
#endif
if (debug::debug_level() >= 2) {
dump();
}
debug(2) << "Done with CodeGen_PTX_Dev::compile_to_src";
debug(1) << "PTX kernel:\n" << outstr.c_str() << "\n";
vector<char> buffer(outstr.begin(), outstr.end());
buffer.push_back(0);
return buffer;
#else
return vector<char>();
#endif
}
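// PTX doesn't really do vectorization; the widest native scalar type is 64 bits (a double).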
int CodeGen_PTX_Dev::native_vector_bits() const {
return 64;
}
string CodeGen_PTX_Dev::get_current_kernel_name() {
return function->getName();
}
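// LLVM 5.0 only provides Module::dump() in builds with LLVM_ENABLE_DUMP,
// so print the IR explicitly there.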
void CodeGen_PTX_Dev::dump() {
#if LLVM_VERSION >= 50
module->print(dbgs(), nullptr, false, true);
#else
module->dump();
#endif
}
std::string CodeGen_PTX_Dev::print_gpu_name(const std::string &name) {
return name;
}
}  // namespace Internal
}  // namespace Halide