This source file includes following definitions.
- ubits
- less_than_zero
- is_negative_one
- is_negative_one
- is_negative_one
- is_negative_one
- is_negative_one
- is_negative_one
- maximum
- minimum
- init
- mul
- div_mod
- f_mod
- test_mul
- test_div_mod
- main
#include <stdio.h>
#include "Halide.h"
#include <math.h>
#include <algorithm>
#include <future>
#ifdef _MSC_VER
#pragma warning(disable:4146)
#endif
using namespace Halide;
using Halide::Internal::Call;
#define WIDTH 1024
#define HEIGHT 1024
#define SALTRATE 50
# define SWIDTH 32
# define SHEIGHT HEIGHT
uint64_t ubits(int unique, int i, int j) {
uint64_t bits, mi, mj, mk, ml, mu;
mi = 982451653;
mj = 776531491;
mk = 573259391;
ml = 373587883;
mu = 275604541;
bits = ((unique * mu + i) * mi + j) * mj;
bits = (bits ^ (bits >> 32)) * mk;
bits = (bits ^ (bits >> 32)) * ml;
bits = (bits ^ (bits >> 32)) * mi;
bits = (bits ^ (bits >> 32)) * mu;
return bits;
}
template <typename T>
bool less_than_zero(T val) {
return (val < 0);
}
template <>
bool less_than_zero<unsigned long long>(unsigned long long val) {
return false;
}
template <>
bool less_than_zero<unsigned long>(unsigned long val) {
return false;
}
template <>
bool less_than_zero<unsigned int>(unsigned int val) {
return false;
}
template <>
bool less_than_zero<unsigned short>(unsigned short val) {
return false;
}
template <>
bool less_than_zero<unsigned char>(unsigned char val) {
return false;
}
template<typename T>
bool is_negative_one(T val) {
return (val == -1);
}
template<>
bool is_negative_one(unsigned long long val) {
return false;
}
template<>
bool is_negative_one(unsigned long val) {
return false;
}
template<>
bool is_negative_one(unsigned int val) {
return false;
}
template<>
bool is_negative_one(unsigned short val) {
return false;
}
template<>
bool is_negative_one(unsigned char val) {
return false;
}
template<typename T, typename BIG>
BIG maximum() {
Type t = type_of<T>();
if (t.is_float()) {
return (BIG) 1.0;
}
if (t.is_uint()) {
uint64_t max;
max = 0;
max = ~max;
if (t.bits() < 64)
max = (((uint64_t) 1) << t.bits()) - 1;
return (BIG) max;
}
if (t.is_int()) {
uint64_t umax;
umax = (((uint64_t) 1) << (t.bits() - 1)) - 1;
return (BIG) umax;
}
assert(0);
return (BIG) 1;
}
template<typename T, typename BIG>
BIG minimum() {
Type t = type_of<T>();
if (t.is_float()) {
return (BIG) 0.0;
}
if (t.is_uint()) {
return (BIG) 0;
}
if (t.is_int()) {
uint64_t umax;
BIG min;
umax = (((uint64_t) 1) << (t.bits() - 1)) - 1;
min = umax;
min = -min - 1;
return min;
}
assert(0);
return (BIG) 0;
}
template<typename T, typename BIG>
Buffer<T> init(Type t, int unique, int width, int height) {
if (width < 2) width = 2;
if (height < 2) height = 2;
Buffer<T> result(width, height);
if (t.is_int()) {
int64_t max, min, neg, v, vsalt;
max = maximum<T, int64_t>();
min = minimum<T, int64_t>();
neg = (~((int64_t) 0)) ^ max;
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
v = (int64_t) (ubits(unique,i,j));
if (v < 0)
v |= neg;
else
v &= max;
vsalt = (int64_t) (ubits(unique|0x100,i,j));
if (vsalt % SALTRATE == 0) {
if (vsalt & 0x1000000) {
v = max;
} else {
v = min;
}
}
result(i,j) = (T)v;
}
}
result(0,0) = (T)min;
result(1,0) = (T)max;
result(0,1) = (T)((unique & 1) ? min : max);
result(1,1) = (T)((unique & 1) ? max : min);
}
else if (t.is_uint()) {
uint64_t max, v, vsalt;
max = maximum<T, BIG>();
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
v = ubits(unique,i,j) & max;
vsalt = (int64_t) (ubits(unique|0x100,i,j));
if (vsalt % SALTRATE == 0) {
if (vsalt & 0x1000000) {
v = max;
} else {
v = 0;
}
}
result(i,j) = (T)v;
}
}
result(0,0) = (T)0;
result(1,0) = (T)max;
result(0,1) = (T)((unique & 1) ? 0 : max);
result(1,1) = (T)((unique & 1) ? max : 0);
}
else if (t.is_float()) {
uint64_t uv, vsalt;
uint64_t max = (uint64_t)(-1);
double v;
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
uv = ubits(unique,i,j);
v = (((double) uv) / ((double) (max))) * 2.0 - 1.0;
vsalt = (int64_t) (ubits(unique|0x100,i,j));
if (vsalt % SALTRATE == 0) {
if (vsalt & 0x1000000) {
v = 1.0;
} else {
v = 0.0;
}
}
result(i,j) = (T)v;
}
}
result(0,0) = (T)(0.0);
result(1,0) = (T)(1.0);
result(0,1) = (T)((unique & 1) ? 0.0 : 1.0);
result(1,1) = (T)((unique & 1) ? 1.0 : 0.0);
}
else {
printf ("Unknown data type in init.\n");
}
return result;
}
enum ScheduleVariant {
CPU,
TiledGPU,
Hexagon
};
template<typename T1, typename T2, typename RT, typename BIG>
bool mul(int vector_width, ScheduleVariant scheduling, const Target &target) {
int i, j;
Type t1 = type_of<T1>();
Type t2 = type_of<T2>();
Type rt = type_of<RT>();
bool success = true;
Buffer<T1> a = init<T1, BIG>(t1, 1, WIDTH, HEIGHT);
Buffer<T2> b = init<T2, BIG>(t2, 2, WIDTH, HEIGHT);
Func f;
Var x, y, xi, yi;
f(x, y) = cast(rt, a(x, y)) * cast(rt, b(x, y));
if (vector_width > 1) {
f.vectorize(x, vector_width);
}
switch (scheduling) {
case CPU:
break;
case TiledGPU:
f.compute_root().gpu_tile(x, y, xi, yi, 16, 16);
break;
case Hexagon:
f.compute_root().hexagon();
break;
};
Buffer<RT> r = f.realize(WIDTH, HEIGHT, target);
int ecount = 0;
for (i = 0; i < WIDTH; i++) {
for (j = 0; j < HEIGHT; j++) {
T1 ai = a(i, j);
T2 bi = b(i, j);
RT ri = r(i, j);
RT correct = RT(ai)*RT(bi);
if (correct != ri && (ecount++) < 10) {
std::cerr << ai << "*" << bi << " -> " << ri << " != " << correct << "\n";
success = false;
}
if (i < SWIDTH && j < SHEIGHT) {
Expr ae = cast<RT>(Expr(ai));
Expr be = cast<RT>(Expr(bi));
Expr re = simplify(ae*be);
if (re.as<Call>() && re.as<Call>()->is_intrinsic(Call::signed_integer_overflow)) {
} else {
if (!Internal::equal(re, Expr(ri)) && (ecount++) < 10) {
std::cerr << "Compiled a*b != simplified a*b: " << (int64_t)ai
<< "*" << (int64_t)bi
<< " = " << (int64_t)ri
<< " != " << re << "\n";
success = false;
}
}
}
}
}
return success;
}
template<typename T, typename BIG>
bool div_mod(int vector_width, ScheduleVariant scheduling, const Target &target) {
int i, j;
Type t = type_of<T>();
BIG minval = minimum<T, BIG>();
bool success = true;
Buffer<T> a = init<T, BIG>(t, 1, WIDTH, HEIGHT);
Buffer<T> b = init<T, BIG>(t, 2, WIDTH, HEIGHT);
for (i = 0; i < WIDTH; i++) {
for (j = 0; j < HEIGHT; j++) {
if (b(i,j) == 0) {
b(i,j) = 1;
}
if (a(i,j) == minval && less_than_zero(minval) && is_negative_one(b(i,j))) {
a(i,j) = a(i,j) + 1;
}
}
}
Func f;
Var x, y, xi, yi;
f(x, y) = Tuple(a(x, y) / b(x, y), a(x, y) % b(x, y));
if (vector_width > 1) {
f.vectorize(x, vector_width);
}
switch (scheduling) {
case CPU:
break;
case TiledGPU:
f.compute_root().gpu_tile(x, y, xi, yi, 16, 16);
break;
case Hexagon:
f.compute_root().hexagon();
break;
};
Realization R = f.realize(WIDTH, HEIGHT, target);
Buffer<T> q(R[0]);
Buffer<T> r(R[1]);
int ecount = 0;
for (i = 0; i < WIDTH; i++) {
for (j = 0; j < HEIGHT; j++) {
T ai = a(i, j);
T bi = b(i, j);
T qi = q(i, j);
T ri = r(i, j);
if (qi*bi + ri != ai && (ecount++) < 10) {
std::cerr << "(a/b)*b + a%b != a; a, b = " << (int64_t)ai
<< ", " << (int64_t)bi
<< "; q, r = " << (int64_t)qi
<< ", " << (int64_t)ri << "\n";
success = false;
} else if (!(0 <= ri &&
(t.is_min((int64_t)bi) || ri < (T)std::abs((int64_t)bi))) &&
(ecount++) < 10) {
std::cerr << "ri is not in the range [0, |b|); a, b = " << (int64_t)ai
<< ", " << (int64_t)bi
<< "; q, r = " << (int64_t)qi
<< ", " << (int64_t)ri << "\n";
success = false;
}
if (i < SWIDTH && j < SHEIGHT) {
Expr ae = Expr(ai);
Expr be = Expr(bi);
Expr qe = simplify(ae/be);
Expr re = simplify(ae%be);
if (!Internal::equal(qe, Expr(qi)) && (ecount++) < 10) {
std::cerr << "Compiled a/b != simplified a/b: " << (int64_t)ai
<< "/" << (int64_t)bi
<< " = " << (int64_t)qi
<< " != " << qe << "\n";
success = false;
} else if (!Internal::equal(re, Expr(ri)) && (ecount++) < 10) {
std::cerr << "Compiled a%b != simplified a%b: " << (int64_t)ai
<< "/" << (int64_t)bi
<< " = " << (int64_t)ri
<< " != " << re << "\n";
success = false;
}
}
}
}
return success;
}
template<typename T, typename BIG>
bool f_mod() {
int i, j;
Type t = type_of<T>();
bool success = true;
Buffer<T> a = init<T, BIG>(t, 1, WIDTH, HEIGHT);
Buffer<T> b = init<T, BIG>(t, 2, WIDTH, HEIGHT);
Buffer<T> out(WIDTH,HEIGHT);
for (i = 0; i < WIDTH; i++) {
for (j = 0; j < HEIGHT; j++) {
if (b(i,j) == 0.0) {
b(i,j) = 1.0;
}
}
}
Func f;
f(_) = a(_) % b(_);
f.realize(out);
int ecount = 0;
for (i = 0; i < std::min(SWIDTH,WIDTH); i++) {
for (j = 0; j < std::min(SHEIGHT, HEIGHT); j++) {
T arg_a = a(i,j);
T arg_b = b(i,j);
T v = out(i,j);
Expr in_e = cast<T>((float) arg_a) % cast<T>((float) arg_b);
Expr e = simplify(in_e);
Expr eout = cast<T>((float) v);
if (! Internal::equal(e, eout) && (ecount++) < 10) {
Expr diff = simplify(e - eout);
Expr smalldiff = simplify(diff < (float) (0.000001) && diff > (float) (-0.000001));
if (! Internal::is_one(smalldiff)) {
std::cerr << "simplify(" << in_e << ") yielded " << e << "; expected " << eout << "\n";
std::cerr << " difference=" << diff << "\n";
success = false;
}
}
}
}
return success;
}
bool test_mul(int vector_width, ScheduleVariant scheduling, Target target) {
bool success = true;
success &= mul<uint8_t, uint8_t, uint8_t, uint64_t>(vector_width, scheduling, target);
success &= mul<uint16_t, uint16_t, uint16_t, uint64_t>(vector_width, scheduling, target);
success &= mul<uint32_t, uint32_t, uint32_t, uint64_t>(vector_width, scheduling, target);
success &= mul<int8_t, int8_t, int8_t, int64_t>(vector_width, scheduling, target);
success &= mul<int16_t, int16_t, int16_t, int64_t>(vector_width, scheduling, target);
success &= mul<int32_t, int32_t, int32_t, int64_t>(vector_width, scheduling, target);
success &= mul<uint8_t, uint8_t, uint16_t, uint64_t>(vector_width, scheduling, target);
success &= mul<uint16_t, uint16_t, uint32_t, uint64_t>(vector_width, scheduling, target);
success &= mul<int8_t, int8_t, int16_t, int64_t>(vector_width, scheduling, target);
success &= mul<int16_t, int16_t, int32_t, int64_t>(vector_width, scheduling, target);
success &= mul<uint16_t, uint32_t, uint32_t, uint64_t>(vector_width, scheduling, target);
success &= mul<int16_t, int32_t, int32_t, uint64_t>(vector_width, scheduling, target);
success &= mul<uint16_t, int32_t, int32_t, uint64_t>(vector_width, scheduling, target);
return success;
}
bool test_div_mod(int vector_width, ScheduleVariant scheduling, Target target) {
bool success = true;
success &= div_mod<uint8_t, uint64_t>(vector_width, scheduling, target);
success &= div_mod<uint16_t, uint64_t>(vector_width, scheduling, target);
success &= div_mod<uint32_t, uint64_t>(vector_width, scheduling, target);
success &= div_mod<int8_t, int64_t>(vector_width, scheduling, target);
success &= div_mod<int16_t, int64_t>(vector_width, scheduling, target);
success &= div_mod<int32_t, int64_t>(vector_width, scheduling, target);
return success;
}
int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();
ScheduleVariant scheduling = CPU;
if (target.has_gpu_feature()) {
scheduling = TiledGPU;
} else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
scheduling = Hexagon;
}
std::vector<int> mul_vector_widths = { 1 };
if (target.has_feature(Target::Metal)) {
for (int i = 2; i <= 4; i *= 2) {
mul_vector_widths.push_back(i);
}
} else if (target.has_feature(Target::HVX_64)) {
mul_vector_widths.push_back(64);
} else if (target.has_feature(Target::HVX_128)) {
mul_vector_widths.push_back(128);
} else {
for (int i = 2; i <= 16; i *= 2) {
mul_vector_widths.push_back(i);
}
}
std::vector<int> div_vector_widths = mul_vector_widths;
if (scheduling == Hexagon) {
div_vector_widths.clear();
div_vector_widths.push_back(1);
}
Halide::Internal::ThreadPool<bool> pool;
std::vector<std::future<bool>> futures;
for (int vector_width : mul_vector_widths) {
std::cout << "Testing mul vector_width: " << vector_width << "\n";
auto f = pool.async(test_mul, vector_width, scheduling, target);
futures.push_back(std::move(f));
}
for (int vector_width : div_vector_widths) {
std::cout << "Testing div_mod vector_width: " << vector_width << "\n";
auto f = pool.async(test_div_mod, vector_width, scheduling, target);
futures.push_back(std::move(f));
}
futures.push_back(pool.async(f_mod<float, double>));
bool success = true;
for (auto &f : futures) {
success &= f.get();
}
if (!success) {
printf ("Failure!\n");
return -1;
}
printf("Success!\n");
return 0;
}