This source file includes the following definitions.
- generate
- schedule
#include "Halide.h"
using namespace Halide;
/// Halide generator for a 3x3 convolution over a 2-D, 8-bit image.
///
/// Every output pixel is the sum of the 3x3 input neighbourhood weighted by
/// the signed 8-bit `mask`, shifted right by 4 bits and clamped to [0, 255]
/// before being narrowed back to uint8.
class Conv3x3 : public Generator<Conv3x3> {
public:
    /// Type the running sum is initialised with. The individual taps are
    /// always widened to int16 before being multiplied (see generate()).
    GeneratorParam<Type> accumulator_type{"accumulator_type", Int(16)};

    /// 2-D source image.
    Input<Buffer<uint8_t>> input{"input", 2};
    /// 2-D filter weights; generate() reads indices 0..2 in both dimensions.
    Input<Buffer<int8_t>> mask{"mask", 2};
    /// 2-D filtered image.
    Output<Buffer<uint8_t>> output{"output", 2};

    /// Defines the algorithm: boundary-handled input, 3x3 weighted sum,
    /// then scale, clamp and narrow.
    void generate() {
        // Wrap the input with the repeat_edge boundary condition so that the
        // +/-1 neighbourhood accesses below go through `bounded_input`.
        bounded_input(x, y) = BoundaryConditions::repeat_edge(input)(x, y);

        Expr acc = cast(accumulator_type, 0);
        // Visit the taps row by row (dy outer, dx inner); mask indices are
        // the offsets shifted by +1 so that they start at zero.
        for (int dy = -1; dy <= 1; ++dy) {
            for (int dx = -1; dx <= 1; ++dx) {
                Expr pixel = cast<int16_t>(bounded_input(x + dx, y + dy));
                Expr weight = cast<int16_t>(mask(dx + 1, dy + 1));
                acc += pixel * weight;
            }
        }

        Expr scaled = acc >> 4;
        output(x, y) = cast<uint8_t>(clamp(scaled, 0, 255));
    }

    /// Applies the schedule: a plain vectorised/parallel schedule by default,
    /// or a tiled Hexagon schedule when the target has an HVX feature.
    void schedule() {
        Var xi{"xi"}, yi{"yi"};

        // Both buffers are required to start at coordinate 0 in each dimension.
        for (int d = 0; d < 2; ++d) {
            input.dim(d).set_min(0);
            output.dim(d).set_min(0);
        }

        const Target target = get_target();
        const bool use_hvx = target.features_any_of({Target::HVX_64, Target::HVX_128});

        if (!use_hvx) {
            // Non-HVX targets: vectorise along x, parallelise over y.
            const int lanes = natural_vector_size<uint8_t>();
            output.vectorize(x, lanes).parallel(y, 16);
            return;
        }

        // HVX vector width in bytes: 128 when HVX_128 is present, else 64.
        int lanes = 64;
        if (target.has_feature(Target::HVX_128)) {
            lanes = 128;
        }

        // Express the row stride of both buffers as a whole multiple of the
        // vector width.
        input.dim(1).set_stride((input.dim(1).stride() / lanes) * lanes);
        output.dim(1).set_stride((output.dim(1).stride() / lanes) * lanes);

        bounded_input.compute_at(Func(output), y)
            .align_storage(x, 128)
            .vectorize(x, lanes, TailStrategy::RoundUp);

        // Tiles of `lanes` x 4: vectorise across the tile width and unroll
        // the four rows.
        output.hexagon()
            .tile(x, y, xi, yi, lanes, 4, TailStrategy::RoundUp)
            .vectorize(xi)
            .unroll(yi);
    }

private:
    Var x{"x"}, y{"y"};
    Func bounded_input{"input_bounded"};
};
// Register the Conv3x3 generator under the name "conv3x3".
HALIDE_REGISTER_GENERATOR(Conv3x3, "conv3x3");