This source file includes following definitions.
- main
#include <stdio.h>
#include <stdlib.h>
#include "Halide.h"
using namespace Halide;
// Exercises parallel and vectorized schedules on Halide reductions and checks
// the realized results against values computed in plain C++.
//
// Returns 0 after printing "Success!" when both cases match; returns -1 after
// printing the first mismatching value otherwise. argc/argv are not used.
int main(int argc, char **argv) {
    {
        // Case 1: the sum of the integers 0..255, computed as 16 partial
        // sums of 16 terms each.
        RDom r1(0, 16), r2(0, 16);
        Var i;
        Func f1, f2;

        // f1(i) adds the 16 consecutive integers starting at i*16.
        f1(i) = sum(i * 16 + r1);
        // f2 adds f1 over r2, i.e. f1(0) through f1(15).
        f2() = sum(f1(r2));

        // Compute the partial sums at root, vectorized by 4 and parallel
        // over i.
        f1.compute_root().vectorize(i, 4).parallel(i);

        Buffer<int> im = f2.realize();

        // Closed form for 0 + 1 + ... + 255.
        int correct = (256 * 255) / 2;
        if (im(0) != correct) {
            printf("im(0) = %d instead of %d\n", im(0), correct);
            return -1;
        }
    }

    {
        // Case 2: an inclusive prefix sum over `total` values, computed
        // chunk by chunk.
        Var i, j;

        // The input is treated as chunk_size chunks of chunk_size elements.
        // sum_cols below is only filled in for chunk_size chunks, so the
        // element count has to be chunk_size squared.
        const int chunk_size = 16;
        const int total = chunk_size * chunk_size;

        // Fill the input with values in [0, 16) and build the reference
        // prefix sum serially. The loop counter is named k so that it does
        // not shadow the Halide Var i above.
        Buffer<int> input(total), correct(total);
        for (int k = 0; k < total; k++) {
            input(k) = rand() % 16;
            correct(k) = input(k);
            if (k > 0) correct(k) += correct(k - 1);
        }

        RDom r1(0, chunk_size);

        // sum_rows(x, j) is the running sum of the first x+1 elements of
        // chunk j. The pure definition provides the zero read at x = -1.
        Func sum_rows;
        sum_rows(i, j) = 0;
        sum_rows(r1, j) = sum_rows(r1 - 1, j) + input(r1 + j * chunk_size);

        // sum_cols(j) is the running total of the full-chunk sums (the last
        // entry of each sum_rows row) for chunks 0..j. The pure value is 0,
        // so the += accumulates onto zero; sum_cols(-1) stays 0.
        Func sum_cols;
        sum_cols(j) = 0;
        sum_cols(r1) += sum_cols(r1 - 1) + sum_rows(chunk_size - 1, r1);

        // Element i of the scan is its within-chunk running sum plus the
        // total of every earlier chunk.
        Func out;
        Expr x = i % chunk_size, y = i / chunk_size;
        out(i) = sum_rows(x, y) + sum_cols(y - 1);

        // Schedule: the output is split into chunks, vectorized within a
        // chunk and parallel across chunks.
        Var ii, io;
        out.split(i, io, ii, chunk_size).vectorize(ii, 4).parallel(io);
        // The chunks (j) of sum_rows are independent, so both its pure and
        // update stages run parallel over j.
        sum_rows.compute_root().vectorize(i, 4).parallel(j);
        sum_rows.update().parallel(j);
        // The sum_cols update reads sum_cols(r1 - 1), so no scheduling
        // directive is applied to that stage.
        sum_cols.compute_root().vectorize(j, 4);
        sum_cols.update();

        out.output_buffer().dim(0).set_bounds(0, total);
        Buffer<int> result = out.realize(total);

        for (int k = 0; k < total; k++) {
            if (result(k) != correct(k)) {
                printf("result(%d) = %d instead of %d\n", k, result(k), correct(k));
                return -1;
            }
        }
    }

    printf("Success!\n");
    return 0;
}