This source file includes the following definitions.
- build
- build_wrap
- main
#include "Halide.h"
#include "halide_benchmark.h"
using namespace Halide;
using namespace Halide::Tools;
// Builds a ten-stage GPU stencil pipeline whose stages read a root-computed
// input through an explicit intermediate Func ("staged").
//
// use_shared: when true, `staged` is additionally scheduled per output tile
//             (compute_at the xo loop of the output, unrolled by 2 in x and
//             y, mapped to GPU threads). Presumably this is what places it
//             in GPU shared memory, per the parameter name -- confirm
//             against Halide's GPU scheduling rules. When false, `staged`
//             is given no schedule of its own.
//
// Returns the last stage, scheduled compute_root and gpu_tile'd with 8x8
// tiles.
Func build(bool use_shared) {
    // Input: the ramp x + y, computed at root.
    Func host;
    Var x, y;
    host(x, y) = x + y;
    host.compute_root();

    // Plain copy of the input; every stage reads the input through this Func.
    Func staged;
    staged(x, y) = host(x, y);

    const int stages = 10;
    Func f[stages];
    for (int s = 0; s < stages; ++s) {
        // Previous stage's value at (x, y); the first stage uses constant 0.
        Expr prev = Expr(0);
        if (s > 0) {
            prev = Expr(f[s - 1](x, y));
        }

        // When the previous value is positive, every tap reads the centre
        // sample; otherwise the taps read the 3x3 neighbourhood.
        Expr use_centre = prev > 0;

        // Walk the nine taps row by row (dy is the slow index, dx the fast).
        Expr stencil = 0;
        for (int tap = 0; tap < 9; ++tap) {
            const int dx = tap % 3 - 1;
            const int dy = tap / 3 - 1;
            stencil += staged(select(use_centre, x, x + dx),
                              select(use_centre, y, y + dy));
        }

        // Every stage after the first accumulates on top of its predecessor.
        Expr value = stencil;
        if (s > 0) {
            value = f[s - 1](x, y) + stencil;
        }
        f[s](x, y) = value;
    }

    Func output = f[stages - 1];

    // Output: computed at root, tiled 8x8 with gpu_tile.
    Var xo, yo, xi, yi;
    output.compute_root();
    output.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // All earlier stages are computed at the output's xo loop.
    for (int s = 0; s + 1 < stages; ++s) {
        Func &stage = f[s];
        stage.compute_at(output, xo);
        stage.gpu_threads(x, y);
    }

    if (use_shared) {
        staged.compute_at(output, xo);
        staged.unroll(x, 2);
        staged.unroll(y, 2);
        staged.gpu_threads(x, y);
    }

    return output;
}
// Builds the same ten-stage GPU stencil pipeline as build(), except that the
// stages call `host` directly and the Func returned by host.in() is what
// gets scheduled per output tile (see Halide's Func::in documentation for
// the wrapper semantics).
//
// Returns the last stage, scheduled compute_root and gpu_tile'd with 8x8
// tiles.
Func build_wrap() {
    // Input: the ramp x + y, computed at root.
    Func host;
    Var x, y;
    host(x, y) = x + y;
    host.compute_root();

    const int stages = 10;
    Func f[stages];
    for (int s = 0; s < stages; ++s) {
        // Previous stage's value at (x, y); the first stage uses constant 0.
        Expr prev = Expr(0);
        if (s > 0) {
            prev = Expr(f[s - 1](x, y));
        }

        // When the previous value is positive, every tap reads the centre
        // sample; otherwise the taps read the 3x3 neighbourhood.
        Expr use_centre = prev > 0;

        // Walk the nine taps row by row (dy is the slow index, dx the fast).
        Expr stencil = 0;
        for (int tap = 0; tap < 9; ++tap) {
            const int dx = tap % 3 - 1;
            const int dy = tap / 3 - 1;
            stencil += host(select(use_centre, x, x + dx),
                            select(use_centre, y, y + dy));
        }

        // Every stage after the first accumulates on top of its predecessor.
        Expr value = stencil;
        if (s > 0) {
            value = f[s - 1](x, y) + stencil;
        }
        f[s](x, y) = value;
    }

    Func output = f[stages - 1];

    // Output: computed at root, tiled 8x8 with gpu_tile.
    Var xo, yo, xi, yi;
    output.compute_root();
    output.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // All earlier stages are computed at the output's xo loop.
    for (int s = 0; s + 1 < stages; ++s) {
        Func &stage = f[s];
        stage.compute_at(output, xo);
        stage.gpu_threads(x, y);
    }

    // Schedule the wrapper of host the same way build(true) schedules its
    // explicit staging Func.
    Func wrapper = host.in();
    wrapper.compute_at(output, xo);
    wrapper.unroll(x, 2);
    wrapper.unroll(y, 2);
    wrapper.gpu_threads(x, y);

    return output;
}
int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();
if (!target.has_gpu_feature()) {
printf("Not running test because no gpu target enabled\n");
return 0;
}
Func use_shared = build(true);
Func use_l1 = build(false);
Func use_wrap_for_shared = build_wrap();
use_shared.compile_jit();
use_l1.compile_jit();
use_wrap_for_shared.compile_jit();
Buffer<int> out1(1000, 1000);
Buffer<int> out2(1000, 1000);
Buffer<int> out3(1000, 1000);
double shared_time = benchmark(5, 5, [&]() {
use_shared.realize(out1);
out1.device_sync();
});
double l1_time = benchmark(5, 5, [&]() {
use_l1.realize(out2);
out2.device_sync();
});
double wrap_time = benchmark(5, 5, [&]() {
use_wrap_for_shared.realize(out3);
out3.device_sync();
});
for (int y = 0; y < out3.height(); y++) {
for (int x = 0; x < out3.width(); x++) {
if (out3(x, y) != out1(x, y)) {
printf("wrapper(%d, %d) = %d instead of %d\n",
x, y, out3(x, y), out1(x, y));
return -1;
}
}
}
for (int y = 0; y < out3.height(); y++) {
for (int x = 0; x < out3.width(); x++) {
if (out3(x, y) != out2(x, y)) {
printf("wrapper(%d, %d) = %d instead of %d\n",
x, y, out3(x, y), out2(x, y));
return -1;
}
}
}
printf("using shared: %f\n"
"using l1: %f\n"
"using wrap for shared: %f\n",
shared_time, l1_time, wrap_time);
return 0;
}