This source file includes following definitions.
- print_type
- print_reinterpret
- simt_intrinsic
- visit
- visit
- visit
- is_ramp1
- get_memory_space
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- visit
- add_kernel
- add_kernel
- init_module
- compile_to_src
- get_current_kernel_name
- dump
- print_gpu_name
#include <sstream>
#include <algorithm>
#include "CodeGen_OpenCL_Dev.h"
#include "CodeGen_Internal.h"
#include "Debug.h"
#include "IROperator.h"
#include "IRMutator.h"
#include "EliminateBoolVectors.h"
namespace Halide {
namespace Internal {
using std::ostringstream;
using std::string;
using std::vector;
using std::sort;
CodeGen_OpenCL_Dev::CodeGen_OpenCL_Dev(Target t) :
clc(src_stream, t) {
}
string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_type(Type type, AppendSpaceIfNeeded space) {
ostringstream oss;
if (type.is_float()) {
if (type.bits() == 16) {
oss << "half";
} else if (type.bits() == 32) {
oss << "float";
} else if (type.bits() == 64) {
oss << "double";
} else {
user_error << "Can't represent a float with this many bits in OpenCL C: " << type << "\n";
}
} else {
if (type.is_uint() && type.bits() > 1) oss << 'u';
switch (type.bits()) {
case 1:
internal_assert(type.lanes() == 1) << "Encountered vector of bool\n";
oss << "bool";
break;
case 8:
oss << "char";
break;
case 16:
oss << "short";
break;
case 32:
oss << "int";
break;
case 64:
oss << "long";
break;
default:
user_error << "Can't represent an integer with this many bits in OpenCL C: " << type << "\n";
}
}
if (type.lanes() != 1) {
switch (type.lanes()) {
case 2:
case 3:
case 4:
case 8:
case 16:
oss << type.lanes();
break;
default:
user_error << "Unsupported vector width in OpenCL C: " << type << "\n";
}
}
if (space == AppendSpace) {
oss << ' ';
}
return oss.str();
}
string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_reinterpret(Type type, Expr e) {
ostringstream oss;
oss << "as_" << print_type(type) << "(" << print_expr(e) << ")";
return oss.str();
}
namespace {
string simt_intrinsic(const string &name) {
if (ends_with(name, ".__thread_id_x")) {
return "get_local_id(0)";
} else if (ends_with(name, ".__thread_id_y")) {
return "get_local_id(1)";
} else if (ends_with(name, ".__thread_id_z")) {
return "get_local_id(2)";
} else if (ends_with(name, ".__thread_id_w")) {
return "get_local_id(3)";
} else if (ends_with(name, ".__block_id_x")) {
return "get_group_id(0)";
} else if (ends_with(name, ".__block_id_y")) {
return "get_group_id(1)";
} else if (ends_with(name, ".__block_id_z")) {
return "get_group_id(2)";
} else if (ends_with(name, ".__block_id_w")) {
return "get_group_id(3)";
}
internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
return "";
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) {
if (is_gpu_var(loop->name)) {
internal_assert((loop->for_type == ForType::GPUBlock) ||
(loop->for_type == ForType::GPUThread))
<< "kernel loop must be either gpu block or gpu thread\n";
internal_assert(is_zero(loop->min));
do_indent();
stream << print_type(Int(32)) << " " << print_name(loop->name)
<< " = " << simt_intrinsic(loop->name) << ";\n";
loop->body.accept(this);
} else {
user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n";
CodeGen_C::visit(loop);
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Ramp *op) {
string id_base = print_expr(op->base);
string id_stride = print_expr(op->stride);
ostringstream rhs;
rhs << id_base << " + " << id_stride << " * ("
<< print_type(op->type.with_lanes(op->lanes)) << ")(0";
for (int i = 1; i < op->lanes; ++i) {
rhs << ", " << i;
}
rhs << ")";
print_assignment(op->type.with_lanes(op->lanes), rhs.str());
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Broadcast *op) {
string id_value = print_expr(op->value);
print_assignment(op->type.with_lanes(op->lanes), id_value);
}
namespace {
const char * vector_elements = "0123456789ABCDEF";
Expr is_ramp1(Expr e) {
const Ramp *r = e.as<Ramp>();
if (r == nullptr) {
return Expr();
}
const IntImm *i = r->stride.as<IntImm>();
if (i != nullptr && i->value == 1) {
return r->base;
}
return Expr();
}
}
string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::get_memory_space(const string &buf) {
return "__address_space_" + print_name(buf);
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
if (op->is_intrinsic(Call::bool_to_mask)) {
if (op->args[0].type().is_vector()) {
op->args[0].accept(this);
} else {
Expr equiv = -Cast::make(op->type, op->args[0]);
equiv.accept(this);
}
} else if (op->is_intrinsic(Call::cast_mask)) {
Expr equiv = Cast::make(op->type, op->args[0]);
equiv.accept(this);
} else if (op->is_intrinsic(Call::select_mask)) {
internal_assert(op->args.size() == 3);
string cond = print_expr(op->args[0]);
string true_val = print_expr(op->args[1]);
string false_val = print_expr(op->args[2]);
ostringstream rhs;
rhs << "select(" << false_val << ", " << true_val << ", " << cond << ")";
print_assignment(op->type, rhs.str());
} else if (op->is_intrinsic(Call::abs)) {
if (op->type.is_float()) {
ostringstream rhs;
rhs << "abs_f" << op->type.bits() << "(" << print_expr(op->args[0]) << ")";
print_assignment(op->type, rhs.str());
} else {
ostringstream rhs;
rhs << "abs(" << print_expr(op->args[0]) << ")";
print_assignment(op->type, rhs.str());
}
} else if (op->is_intrinsic(Call::absd)) {
ostringstream rhs;
rhs << "abs_diff(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")";
print_assignment(op->type, rhs.str());
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Load *op) {
user_assert(is_one(op->predicate)) << "Predicated load is not supported inside OpenCL kernel.\n";
Expr ramp_base = is_ramp1(op->index);
if (ramp_base.defined()) {
internal_assert(op->type.is_vector());
string id_ramp_base = print_expr(ramp_base);
ostringstream rhs;
rhs << "vload" << op->type.lanes()
<< "(0, (" << get_memory_space(op->name) << " "
<< print_type(op->type.element_of()) << "*)"
<< print_name(op->name) << " + " << id_ramp_base << ")";
print_assignment(op->type, rhs.str());
return;
}
string id_index = print_expr(op->index);
bool type_cast_needed = !(allocations.contains(op->name) &&
allocations.get(op->name).type == op->type);
ostringstream rhs;
if (type_cast_needed) {
rhs << "((" << get_memory_space(op->name) << " "
<< print_type(op->type) << " *)"
<< print_name(op->name)
<< ")";
} else {
rhs << print_name(op->name);
}
rhs << "[" << id_index << "]";
std::map<string, string>::iterator cached = cache.find(rhs.str());
if (cached != cache.end()) {
id = cached->second;
return;
}
if (op->index.type().is_vector()) {
internal_assert(op->type.is_vector());
id = "_" + unique_name('V');
cache[rhs.str()] = id;
do_indent();
stream << print_type(op->type)
<< " " << id << ";\n";
for (int i = 0; i < op->type.lanes(); ++i) {
do_indent();
stream
<< id << ".s" << vector_elements[i]
<< " = ((" << get_memory_space(op->name) << " "
<< print_type(op->type.element_of()) << "*)"
<< print_name(op->name) << ")"
<< "[" << id_index << ".s" << vector_elements[i] << "];\n";
}
} else {
print_assignment(op->type, rhs.str());
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Store *op) {
user_assert(is_one(op->predicate)) << "Predicated store is not supported inside OpenCL kernel.\n";
string id_value = print_expr(op->value);
Type t = op->value.type();
Expr ramp_base = is_ramp1(op->index);
if (ramp_base.defined()) {
internal_assert(op->value.type().is_vector());
string id_ramp_base = print_expr(ramp_base);
do_indent();
stream << "vstore" << t.lanes() << "("
<< id_value << ","
<< 0 << ", (" << get_memory_space(op->name) << " "
<< print_type(t.element_of()) << "*)"
<< print_name(op->name) << " + " << id_ramp_base
<< ");\n";
} else if (op->index.type().is_vector()) {
internal_assert(t.is_vector());
string id_index = print_expr(op->index);
for (int i = 0; i < t.lanes(); ++i) {
do_indent();
stream << "((" << get_memory_space(op->name) << " "
<< print_type(t.element_of()) << " *)"
<< print_name(op->name)
<< ")["
<< id_index << ".s" << vector_elements[i] << "] = "
<< id_value << ".s" << vector_elements[i] << ";\n";
}
} else {
bool type_cast_needed = !(allocations.contains(op->name) &&
allocations.get(op->name).type == t);
string id_index = print_expr(op->index);
string id_value = print_expr(op->value);
do_indent();
if (type_cast_needed) {
stream << "(("
<< get_memory_space(op->name) << " "
<< print_type(t)
<< " *)"
<< print_name(op->name)
<< ")";
} else {
stream << print_name(op->name);
}
stream << "[" << id_index << "] = "
<< id_value << ";\n";
}
cache.clear();
}
namespace {
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const EQ *op) {
visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "==");
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const NE *op) {
visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "!=");
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const LT *op) {
visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "<");
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const LE *op) {
visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, "<=");
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const GT *op) {
visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, ">");
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const GE *op) {
visit_binop(eliminated_bool_type(op->type, op->a.type()), op->a, op->b, ">=");
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Cast *op) {
if (op->type.is_vector()) {
print_assignment(op->type, "convert_" + print_type(op->type) + "(" + print_expr(op->value) + ")");
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Select *op) {
internal_assert(op->condition.type().is_scalar());
CodeGen_C::visit(op);
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Allocate *op) {
user_assert(!op->new_expr.defined()) << "Allocate node inside OpenCL kernel has custom new expression.\n" <<
"(Memoization is not supported inside GPU kernels at present.)\n";
if (op->name == "__shared") {
op->body.accept(this);
} else {
open_scope();
debug(2) << "Allocate " << op->name << " on device\n";
debug(3) << "Pushing allocation called " << op->name << " onto the symbol table\n";
int32_t size = op->constant_allocation_size();
user_assert(size > 0)
<< "Allocation " << op->name << " has a dynamic size. "
<< "Only fixed-size allocations are supported on the gpu. "
<< "Try storing into shared memory instead.";
do_indent();
stream << print_type(op->type) << ' '
<< print_name(op->name) << "[" << size << "];\n";
do_indent();
stream << "#define " << get_memory_space(op->name) << " __private\n";
Allocation alloc;
alloc.type = op->type;
allocations.push(op->name, alloc);
op->body.accept(this);
internal_assert(!allocations.contains(op->name));
close_scope("alloc " + print_name(op->name));
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Free *op) {
if (op->name == "__shared") {
return;
} else {
internal_assert(allocations.contains(op->name));
allocations.pop(op->name);
do_indent();
stream << "#undef " << get_memory_space(op->name) << "\n";
}
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const AssertStmt *op) {
user_warning << "Ignoring assertion inside OpenCL kernel: " << op->condition << "\n";
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
if (op->is_interleave()) {
int op_lanes = op->type.lanes();
internal_assert(!op->vectors.empty());
int arg_lanes = op->vectors[0].type().lanes();
if (op->vectors.size() == 1) {
internal_assert(op_lanes == arg_lanes);
print_assignment(op->type, print_expr(op->vectors[0]));
} else if (op->vectors.size() == 2) {
internal_assert(op->vectors[1].type().lanes() == arg_lanes);
internal_assert(op_lanes / 2 == arg_lanes);
string a1 = print_expr(op->vectors[0]);
string a2 = print_expr(op->vectors[1]);
id = unique_name('_');
do_indent();
stream << print_type(op->type) << " " << id << ";\n";
do_indent();
stream << id << ".even = " << a1 << ";\n";
do_indent();
stream << id << ".odd = " << a2 << ";\n";
} else {
int dest_lanes = op->type.lanes();
internal_assert(dest_lanes <= 16);
int num_vectors = op->vectors.size();
vector<string> arg_exprs(num_vectors);
for (int i = 0; i < num_vectors; i++) {
internal_assert(op->vectors[i].type().lanes() == arg_lanes);
arg_exprs[i] = print_expr(op->vectors[i]);
}
internal_assert(num_vectors * arg_lanes >= dest_lanes);
id = unique_name('_');
do_indent();
stream << print_type(op->type) << " " << id;
stream << " = (" << print_type(op->type) << ")(";
for (int i = 0; i < dest_lanes; i++) {
int arg = i % num_vectors;
int arg_idx = i / num_vectors;
internal_assert(arg_idx <= arg_lanes);
stream << arg_exprs[arg] << ".s" << vector_elements[arg_idx];
if (i != dest_lanes - 1) {
stream << ", ";
}
}
stream << ");\n";
}
} else {
internal_error << "Shuffle not implemented.\n";
}
}
void CodeGen_OpenCL_Dev::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {
debug(2) << "CodeGen_OpenCL_Dev::compile " << name << "\n";
cur_kernel_name = name;
clc.add_kernel(s, name, args);
}
namespace {
struct BufferSize {
string name;
size_t size;
BufferSize() : size(0) {}
BufferSize(string name, size_t size) : name(name), size(size) {}
bool operator < (const BufferSize &r) const {
return size < r.size;
}
};
}
void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {
debug(2) << "Adding OpenCL kernel " << name << "\n";
debug(2) << "Eliminating bool vectors\n";
s = eliminate_bool_vectors(s);
debug(2) << "After eliminating bool vectors:\n" << s << "\n";
vector<BufferSize> constants;
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer &&
CodeGen_GPU_Dev::is_buffer_constant(s, args[i].name) &&
args[i].size > 0) {
constants.push_back(BufferSize(args[i].name, args[i].size));
}
}
sort(constants.begin(), constants.end());
for (size_t i = 1; i < constants.size(); i++) {
constants[i].size += constants[i - 1].size;
}
stream << "// Address spaces for " << name << "\n";
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
vector<BufferSize>::iterator constant = constants.begin();
while (constant != constants.end() &&
constant->name != args[i].name) {
constant++;
}
if (constant != constants.end()) {
stream << "#if " << constant->size << " <= MAX_CONSTANT_BUFFER_SIZE && "
<< constant - constants.begin() << " < MAX_CONSTANT_ARGS\n";
stream << "#define " << get_memory_space(args[i].name) << " __constant\n";
stream << "#else\n";
stream << "#define " << get_memory_space(args[i].name) << " __global\n";
stream << "#endif\n";
} else {
stream << "#define " << get_memory_space(args[i].name) << " __global\n";
}
}
}
stream << "__kernel void " << name << "(\n";
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
stream << " " << get_memory_space(args[i].name) << " ";
if (!args[i].write) stream << "const ";
stream << print_type(args[i].type) << " *"
<< "restrict "
<< print_name(args[i].name);
Allocation alloc;
alloc.type = args[i].type;
allocations.push(args[i].name, alloc);
} else {
Type t = args[i].type;
t = t.with_bits(t.bytes() * 8);
stream << " const "
<< print_type(t)
<< " "
<< print_name(args[i].name);
}
if (i < args.size()-1) stream << ",\n";
}
stream << ",\n" << " __address_space___shared int16* __shared";
stream << ")\n";
open_scope();
print(s);
close_scope("kernel " + name);
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
allocations.pop(args[i].name);
}
}
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer) {
stream << "#undef " << get_memory_space(args[i].name) << "\n";
}
}
}
void CodeGen_OpenCL_Dev::init_module() {
debug(2) << "OpenCL device codegen init_module\n";
src_stream.str("");
src_stream.clear();
src_stream << "/*OpenCL C*/\n";
src_stream << "#pragma OPENCL FP_CONTRACT ON\n";
src_stream << "float float_from_bits(unsigned int x) {return as_float(x);}\n"
<< "float nan_f32() { return NAN; }\n"
<< "float neg_inf_f32() { return -INFINITY; }\n"
<< "float inf_f32() { return INFINITY; }\n"
<< "#define sqrt_f32 sqrt \n"
<< "#define sin_f32 sin \n"
<< "#define cos_f32 cos \n"
<< "#define exp_f32 exp \n"
<< "#define log_f32 log \n"
<< "#define abs_f32 fabs \n"
<< "#define floor_f32 floor \n"
<< "#define ceil_f32 ceil \n"
<< "#define round_f32 round \n"
<< "#define trunc_f32 trunc \n"
<< "#define pow_f32 pow\n"
<< "#define asin_f32 asin \n"
<< "#define acos_f32 acos \n"
<< "#define tan_f32 tan \n"
<< "#define atan_f32 atan \n"
<< "#define atan2_f32 atan2\n"
<< "#define sinh_f32 sinh \n"
<< "#define asinh_f32 asinh \n"
<< "#define cosh_f32 cosh \n"
<< "#define acosh_f32 acosh \n"
<< "#define tanh_f32 tanh \n"
<< "#define atanh_f32 atanh \n"
<< "#define fast_inverse_f32 native_recip \n"
<< "#define fast_inverse_sqrt_f32 native_rsqrt \n"
<< "int halide_gpu_thread_barrier() {\n"
<< " barrier(CLK_LOCAL_MEM_FENCE);\n"
<< " return 0;\n"
<< "}\n";
src_stream << "#define __address_space___shared __local\n";
if (target.has_feature(Target::CLDoubles)) {
src_stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
<< "bool is_nan_f64(double x) {return x != x; }\n"
<< "#define sqrt_f64 sqrt\n"
<< "#define sin_f64 sin\n"
<< "#define cos_f64 cos\n"
<< "#define exp_f64 exp\n"
<< "#define log_f64 log\n"
<< "#define abs_f64 fabs\n"
<< "#define floor_f64 floor\n"
<< "#define ceil_f64 ceil\n"
<< "#define round_f64 round\n"
<< "#define trunc_f64 trunc\n"
<< "#define pow_f64 pow\n"
<< "#define asin_f64 asin\n"
<< "#define acos_f64 acos\n"
<< "#define tan_f64 tan\n"
<< "#define atan_f64 atan\n"
<< "#define atan2_f64 atan2\n"
<< "#define sinh_f64 sinh\n"
<< "#define asinh_f64 asinh\n"
<< "#define cosh_f64 cosh\n"
<< "#define acosh_f64 acosh\n"
<< "#define tanh_f64 tanh\n"
<< "#define atanh_f64 atanh\n";
}
src_stream << '\n';
src_stream << "__kernel void _at_least_one_kernel(int x) { }\n";
cur_kernel_name = "";
}
vector<char> CodeGen_OpenCL_Dev::compile_to_src() {
string str = src_stream.str();
debug(1) << "OpenCL kernel:\n" << str << "\n";
vector<char> buffer(str.begin(), str.end());
buffer.push_back(0);
return buffer;
}
string CodeGen_OpenCL_Dev::get_current_kernel_name() {
return cur_kernel_name;
}
void CodeGen_OpenCL_Dev::dump() {
std::cerr << src_stream.str() << std::endl;
}
std::string CodeGen_OpenCL_Dev::print_gpu_name(const std::string &name) {
return name;
}
}}