This source file includes the following definitions.
- found_shared
- visit
- visit
- visit
- compile_func
- visit
- get_module_state
#include <sstream>
#include "CodeGen_GPU_Host.h"
#include "CodeGen_PTX_Dev.h"
#include "CodeGen_OpenCL_Dev.h"
#include "CodeGen_Metal_Dev.h"
#include "CodeGen_OpenGL_Dev.h"
#include "CodeGen_OpenGLCompute_Dev.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "Debug.h"
#include "CodeGen_Internal.h"
#include "Util.h"
#include "ExprUsesVar.h"
#include "Simplify.h"
#include "VaryingAttributes.h"
namespace Halide {
namespace Internal {
using std::vector;
using std::string;
using std::map;
using std::pair;
using namespace llvm;
class ExtractBounds : public IRVisitor {
public:
Expr num_threads[4];
Expr num_blocks[4];
Expr shared_mem_size;
ExtractBounds() : shared_mem_size(0), found_shared(false) {
for (int i = 0; i < 4; i++) {
num_threads[i] = num_blocks[i] = 1;
}
}
private:
bool found_shared;
using IRVisitor::visit;
void visit(const For *op) {
if (CodeGen_GPU_Dev::is_gpu_var(op->name)) {
internal_assert(is_zero(op->min));
}
if (ends_with(op->name, ".__thread_id_x")) {
num_threads[0] = op->extent;
} else if (ends_with(op->name, ".__thread_id_y")) {
num_threads[1] = op->extent;
} else if (ends_with(op->name, ".__thread_id_z")) {
num_threads[2] = op->extent;
} else if (ends_with(op->name, ".__thread_id_w")) {
num_threads[3] = op->extent;
} else if (ends_with(op->name, ".__block_id_x")) {
num_blocks[0] = op->extent;
} else if (ends_with(op->name, ".__block_id_y")) {
num_blocks[1] = op->extent;
} else if (ends_with(op->name, ".__block_id_z")) {
num_blocks[2] = op->extent;
} else if (ends_with(op->name, ".__block_id_w")) {
num_blocks[3] = op->extent;
}
op->body.accept(this);
}
void visit(const LetStmt *op) {
if (expr_uses_var(shared_mem_size, op->name)) {
shared_mem_size = Let::make(op->name, op->value, shared_mem_size);
}
op->body.accept(this);
}
void visit(const Allocate *allocate) {
user_assert(!allocate->new_expr.defined()) << "Allocate node inside GPU kernel has custom new expression.\n" <<
"(Memoization is not supported inside GPU kernels at present.)\n";
if (allocate->name == "__shared") {
internal_assert(allocate->type == UInt(8) && allocate->extents.size() == 1);
shared_mem_size = allocate->extents[0];
found_shared = true;
}
allocate->body.accept(this);
}
};
template<typename CodeGen_CPU>
CodeGen_GPU_Host<CodeGen_CPU>::CodeGen_GPU_Host(Target target) : CodeGen_CPU(target) {
    // Build one device code generator per GPU API enabled in the target.
    // Several APIs may coexist in a single pipeline, so each gets its own
    // entry in the cgdev map.
    if (target.has_feature(Target::OpenGL)) {
        debug(1) << "Constructing OpenGL device codegen\n";
        cgdev.emplace(DeviceAPI::GLSL, new CodeGen_OpenGL_Dev(target));
    }
    if (target.has_feature(Target::OpenGLCompute)) {
        debug(1) << "Constructing OpenGL Compute device codegen\n";
        cgdev.emplace(DeviceAPI::OpenGLCompute, new CodeGen_OpenGLCompute_Dev(target));
    }
    if (target.has_feature(Target::CUDA)) {
        debug(1) << "Constructing CUDA device codegen\n";
        cgdev.emplace(DeviceAPI::CUDA, new CodeGen_PTX_Dev(target));
    }
    if (target.has_feature(Target::OpenCL)) {
        debug(1) << "Constructing OpenCL device codegen\n";
        cgdev.emplace(DeviceAPI::OpenCL, new CodeGen_OpenCL_Dev(target));
    }
    if (target.has_feature(Target::Metal)) {
        debug(1) << "Constructing Metal device codegen\n";
        cgdev.emplace(DeviceAPI::Metal, new CodeGen_Metal_Dev(target));
    }

    // Being constructed with no GPU feature set is a logic error upstream.
    if (cgdev.empty()) {
        internal_error << "Requested unknown GPU target: " << target.to_string() << "\n";
    }
}
template<typename CodeGen_CPU>
CodeGen_GPU_Host<CodeGen_CPU>::~CodeGen_GPU_Host() {
    // This object owns the per-API device code generators; release them.
    for (auto &entry : cgdev) {
        delete entry.second;
    }
}
template<typename CodeGen_CPU>
// Compile one lowered function. After the CPU backend has generated the
// host-side IR (with kernel launches emitted by visit(const For *)), splice
// an "init_kernels" block after the entry block that registers each device
// API's compiled kernel source with the Halide runtime.
void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
                                                 const std::string &simple_name,
                                                 const std::string &extern_name) {
    function_name = simple_name;

    // Start a fresh device module per API so kernels from this function
    // don't accumulate into a previous function's module.
    for (pair<const DeviceAPI, CodeGen_GPU_Dev *> &i : cgdev) {
        i.second->init_module();
    }

    CodeGen_CPU::compile_func(f, simple_name, extern_name);

    // Split the entry block just before its terminator; everything after the
    // split becomes post_entry, and init_kernels_bb is inserted in between.
    BasicBlock *entry = &function->getEntryBlock();
    llvm::Instruction *terminator = entry->getTerminator();
    internal_assert(terminator);
    BasicBlock *post_entry = entry->splitBasicBlock(terminator);

    BasicBlock *init_kernels_bb = BasicBlock::Create(*context, "init_kernels",
                                                     function, post_entry);
    // splitBasicBlock left an unconditional branch to post_entry in the
    // entry block; replace it with a branch through init_kernels_bb.
    entry->getTerminator()->eraseFromParent();
    builder->SetInsertPoint(entry);
    builder->CreateBr(init_kernels_bb);
    builder->SetInsertPoint(init_kernels_bb);

    for (pair<const DeviceAPI, CodeGen_GPU_Dev *> &i : cgdev) {
        CodeGen_GPU_Dev *gpu_codegen = i.second;
        std::string api_unique_name = gpu_codegen->api_unique_name();

        // No module state global means no kernels were generated for this
        // API in this function, so there is nothing to initialize.
        llvm::Value *module_state = get_module_state(api_unique_name, false);
        if (!module_state) {
            continue;
        }

        debug(2) << "Generating init_kernels for " << api_unique_name << "\n";

        // Embed the device code (source or binary) in the host object.
        std::vector<char> kernel_src = gpu_codegen->compile_to_src();
        Value *kernel_src_ptr =
            CodeGen_CPU::create_binary_blob(kernel_src,
                                            "halide_" + function_name + "_" + api_unique_name + "_kernel_src");

        // Guard against an empty argument list before probing args[0] for
        // __user_context (the unchecked access was potentially out of bounds).
        if (!f.args.empty() && f.args[0].name == "__user_context") {
            sym_push("__user_context", iterator_to_pointer(function->arg_begin()));
        }

        Value *user_context = get_user_context();
        Value *kernel_size = ConstantInt::get(i32_t, kernel_src.size());
        std::string init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels";
        Value *init = module->getFunction(init_kernels_name);
        internal_assert(init) << "Could not find function " + init_kernels_name + " in initial module\n";
        vector<Value *> init_kernels_args = {user_context, module_state, kernel_src_ptr, kernel_size};
        Value *result = builder->CreateCall(init, init_kernels_args);

        // The runtime returns zero on success; abort the pipeline otherwise.
        Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0));
        CodeGen_CPU::create_assertion(did_succeed, Expr(), result);
    }

    // Fall through into the original function body.
    builder->CreateBr(post_entry);

    function_name = "";
}
template<typename CodeGen_CPU>
// Lower a loop. A GPU-marked loop nest is replaced by a call into the device
// runtime that launches the corresponding kernel; any other loop falls
// through to the CPU backend.
void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) {
debug(2) << "Kernel launch: " << loop->name << "\n";
internal_assert(loop->device_api != DeviceAPI::Default_GPU)
<< "A concrete device API should have been selected before codegen.";
// Walk the nest to recover block/thread extents and the shared memory size.
ExtractBounds bounds;
loop->accept(&bounds);
debug(2) << "Kernel bounds: ("
<< bounds.num_threads[0] << ", "
<< bounds.num_threads[1] << ", "
<< bounds.num_threads[2] << ", "
<< bounds.num_threads[3] << ") threads, ("
<< bounds.num_blocks[0] << ", "
<< bounds.num_blocks[1] << ", "
<< bounds.num_blocks[2] << ", "
<< bounds.num_blocks[3] << ") blocks\n";
// Derive a unique, identifier-safe kernel name from the loop name.
string kernel_name = unique_name("kernel_" + loop->name);
for (size_t i = 0; i < kernel_name.size(); i++) {
if (!isalnum(kernel_name[i])) {
kernel_name[i] = '_';
}
}
// Extra launch arguments that only GLSL uses; the other APIs pass
// zero/null placeholders.
Value *null_float_ptr = ConstantPointerNull::get(CodeGen_LLVM::f32_t->getPointerTo());
Value *zero_int32 = codegen(Expr(cast<int>(0)));
Value *gpu_num_padded_attributes = zero_int32;
Value *gpu_vertex_buffer = null_float_ptr;
Value *gpu_num_coords_dim0 = zero_int32;
Value *gpu_num_coords_dim1 = zero_int32;
if (loop->device_api == DeviceAPI::GLSL) {
// These variables are presumably bound by the earlier varying-attribute
// lowering pass (see VaryingAttributes.h) — confirm against that pass.
gpu_num_padded_attributes = codegen(Variable::make(Int(32), "glsl.num_padded_attributes"));
gpu_num_coords_dim0 = codegen(Variable::make(Int(32), "glsl.num_coords_dim0"));
gpu_num_coords_dim1 = codegen(Variable::make(Int(32), "glsl.num_coords_dim1"));
gpu_vertex_buffer = codegen(Variable::make(type_of<float *>(), "glsl.vertex_buffer"));
gpu_vertex_buffer = builder->CreatePointerCast(gpu_vertex_buffer,
CodeGen_LLVM::f32_t->getPointerTo());
}
// Compute the closure of values the kernel body reads; these become the
// kernel's arguments.
HostClosure c(loop->body, loop->name);
vector<DeviceArgument> closure_args = c.arguments();
if (loop->device_api == DeviceAPI::GLSL) {
// GLSL packs scalar arguments into slots; assign each non-buffer
// argument an index within its category.
int num_uniform_floats = 0;
// Varying floats start at 2; slots 0 and 1 are presumably reserved for
// the spatial coordinates — confirm against the GLSL backend.
int num_varying_floats = 2;
int num_uniform_ints = 0;
for (size_t i = 0; i < closure_args.size(); i++) {
if (closure_args[i].is_buffer) {
continue;
} else if (ends_with(closure_args[i].name, ".varying")) {
closure_args[i].packed_index = num_varying_floats++;
} else if (closure_args[i].type.is_float()) {
closure_args[i].packed_index = num_uniform_floats++;
} else if (closure_args[i].type.is_int()) {
closure_args[i].packed_index = num_uniform_ints++;
}
}
}
// Record constant sizes for buffer arguments that have host allocations.
for (size_t i = 0; i < closure_args.size(); i++) {
if (closure_args[i].is_buffer && allocations.contains(closure_args[i].name)) {
closure_args[i].size = allocations.get(closure_args[i].name).constant_bytes;
}
}
// Note: operator[] default-inserts nullptr for an API with no codegen;
// the assert below catches that case.
CodeGen_GPU_Dev *gpu_codegen = cgdev[loop->device_api];
user_assert(gpu_codegen != nullptr)
<< "Loop is scheduled on device " << loop->device_api
<< " which does not appear in target " << target.to_string() << "\n";
// Compile the loop body into the device module. The device codegen may
// mangle the kernel name, so re-query the final name afterwards.
gpu_codegen->add_kernel(loop, kernel_name, closure_args);
kernel_name = gpu_codegen->get_current_kernel_name();
debug(2) << "Compiled launch to kernel \"" << kernel_name << "\"\n";
Value *entry_name_str = builder->CreateGlobalStringPtr(kernel_name, "entry_name");
llvm::Type *target_size_t_type = (target.bits == 32) ? i32_t : i64_t;
// Marshal the kernel arguments into three parallel stack arrays, each
// with one extra terminator slot: pointers to the argument values, their
// sizes in bytes, and is-buffer flags.
llvm::PointerType *arg_t = i8_t->getPointerTo();
int num_args = (int)closure_args.size();
llvm::Type *gpu_args_arr_type = ArrayType::get(arg_t, num_args+1);
Value *gpu_args_arr =
create_alloca_at_entry(
gpu_args_arr_type,
num_args+1, false,
kernel_name + "_args");
llvm::Type *gpu_arg_sizes_arr_type = ArrayType::get(target_size_t_type,
num_args+1);
Value *gpu_arg_sizes_arr =
create_alloca_at_entry(
gpu_arg_sizes_arr_type,
num_args+1, false,
kernel_name + "_arg_sizes");
llvm::Type *gpu_arg_is_buffer_arr_type = ArrayType::get(i8_t, num_args+1);
Value *gpu_arg_is_buffer_arr =
create_alloca_at_entry(
gpu_arg_is_buffer_arr_type,
num_args+1, false,
kernel_name + "_arg_is_buffer");
for (int i = 0; i < num_args; i++) {
string name = closure_args[i].name;
Value *val;
if (closure_args[i].is_buffer) {
// Buffers are passed by their 64-bit device handle.
Expr buf = Variable::make(type_of<buffer_t *>(), name + ".buffer");
Expr get_dev = Call::make(UInt(64), Call::buffer_get_device, {buf}, Call::Extern);
val = codegen(get_dev);
} else if (ends_with(name, ".varying")) {
// Varying attributes are interpolated on the device; pass a dummy.
val = ConstantInt::get(target_size_t_type, 1);
} else {
// Scalar arguments come from the symbol table.
val = sym_get(name);
}
// Spill the value to a stack slot so its address can go in the array.
Value *ptr = create_alloca_at_entry(val->getType(), 1, false, name+".stack");
builder->CreateStore(val, ptr);
Value *bits = builder->CreateBitCast(ptr, arg_t);
builder->CreateStore(bits,
builder->CreateConstGEP2_32(
gpu_args_arr_type,
gpu_args_arr,
0,
i));
// Device handles are 8 bytes; scalars use their natural size.
int size_bytes = (closure_args[i].is_buffer) ? 8 : closure_args[i].type.bytes();
builder->CreateStore(ConstantInt::get(target_size_t_type, size_bytes),
builder->CreateConstGEP2_32(
gpu_arg_sizes_arr_type,
gpu_arg_sizes_arr,
0,
i));
builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer),
builder->CreateConstGEP2_32(
gpu_arg_is_buffer_arr_type,
gpu_arg_is_buffer_arr,
0,
i));
}
// Terminate each array (null pointer / zero size / zero flag) so the
// runtime can detect the end of the argument list.
builder->CreateStore(ConstantPointerNull::get(arg_t),
builder->CreateConstGEP2_32(
gpu_args_arr_type,
gpu_args_arr,
0,
num_args));
builder->CreateStore(ConstantInt::get(target_size_t_type, 0),
builder->CreateConstGEP2_32(
gpu_arg_sizes_arr_type,
gpu_arg_sizes_arr,
0,
num_args));
builder->CreateStore(ConstantInt::get(i8_t, 0),
builder->CreateConstGEP2_32(
gpu_arg_is_buffer_arr_type,
gpu_arg_is_buffer_arr,
0,
num_args));
std::string api_unique_name = gpu_codegen->api_unique_name();
// Only three launch dimensions are supported; the fourth must be trivial.
internal_assert(is_one(bounds.num_threads[3]) && is_one(bounds.num_blocks[3]))
<< bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n";
debug(4) << "CodeGen_GPU_Host get_user_context returned " << get_user_context() << "\n";
debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n";
debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n";
debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n";
debug(3) << "bounds.num_threads[0] = " << bounds.num_threads[0] << "\n";
debug(3) << "bounds.num_threads[1] = " << bounds.num_threads[1] << "\n";
debug(3) << "bounds.num_threads[2] = " << bounds.num_threads[2] << "\n";
// Emit the call to halide_<api>_run with the launch configuration, the
// marshalled argument arrays, and the GLSL-only extras.
Value *launch_args[] = {
get_user_context(),
builder->CreateLoad(get_module_state(api_unique_name)),
entry_name_str,
codegen(bounds.num_blocks[0]), codegen(bounds.num_blocks[1]), codegen(bounds.num_blocks[2]),
codegen(bounds.num_threads[0]), codegen(bounds.num_threads[1]), codegen(bounds.num_threads[2]),
codegen(bounds.shared_mem_size),
builder->CreateConstGEP2_32(
gpu_arg_sizes_arr_type,
gpu_arg_sizes_arr,
0,
0,
"gpu_arg_sizes_ar_ref" + api_unique_name),
builder->CreateConstGEP2_32(
gpu_args_arr_type,
gpu_args_arr,
0,
0,
"gpu_args_arr_ref" + api_unique_name),
builder->CreateConstGEP2_32(
gpu_arg_is_buffer_arr_type,
gpu_arg_is_buffer_arr,
0,
0,
"gpu_arg_is_buffer_ref" + api_unique_name),
gpu_num_padded_attributes,
gpu_vertex_buffer,
gpu_num_coords_dim0,
gpu_num_coords_dim1,
};
std::string run_fn_name = "halide_" + api_unique_name + "_run";
llvm::Function *dev_run_fn = module->getFunction(run_fn_name);
internal_assert(dev_run_fn) << "Could not find " << run_fn_name << " in module\n";
Value *result = builder->CreateCall(dev_run_fn, launch_args);
// Non-zero return from the runtime aborts with device_run_failed.
Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0));
CodeGen_CPU::create_assertion(did_succeed,
halide_error_code_device_run_failed,
result);
} else {
// Not a GPU loop: let the CPU backend lower it normally.
CodeGen_CPU::visit(loop);
}
}
template<typename CodeGen_CPU>
// Look up (and optionally create) the global variable that holds the opaque
// runtime module handle for one (function, API) pair. Returns nullptr when
// the global does not exist and create is false.
Value *CodeGen_GPU_Host<CodeGen_CPU>::get_module_state(const std::string &api_unique_name,
                                                       bool create) {
    std::string name = "module_state_" + function_name + "_" + api_unique_name;
    GlobalVariable *state = module->getGlobalVariable(name, true);
    if (state == nullptr && create) {
        // Lazily emit a null-initialized, internal-linkage pointer global;
        // the runtime fills it in on first use.
        PointerType *void_ptr_type = llvm::Type::getInt8PtrTy(*context);
        state = new GlobalVariable(*module, void_ptr_type,
                                   false, GlobalVariable::InternalLinkage,
                                   ConstantPointerNull::get(void_ptr_type),
                                   name);
        debug(4) << "Created device module state global variable\n";
    }
    return state;
}
// Explicitly instantiate the GPU host codegen for each CPU backend that this
// build of the library was configured with.
#ifdef WITH_X86
template class CodeGen_GPU_Host<CodeGen_X86>;
#endif
#if defined(WITH_ARM) || defined(WITH_AARCH64)
template class CodeGen_GPU_Host<CodeGen_ARM>;
#endif
#ifdef WITH_MIPS
template class CodeGen_GPU_Host<CodeGen_MIPS>;
#endif
#ifdef WITH_POWERPC
template class CodeGen_GPU_Host<CodeGen_PowerPC>;
#endif
}}