#ifndef HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
#define HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
#include "HalideRuntime.h"
#include "device_interface.h"
#include "printer.h"
namespace Halide { namespace Runtime { namespace Internal {
// A host <-> dev copy should be done with as few contiguous copies as
// possible to minimize driver overhead. If our
// halide_buffer_t has strides larger than its extents (e.g. because
// it represents a sub-region of a larger halide_buffer_t) we can't
// safely copy it back and forth using a single contiguous copy,
// because we'd clobber in-between values that another thread might be
// using. In the best case we can do a single contiguous copy, but in
// the worst case we need to individually copy over every pixel.
//
// This problem is made extra difficult by the fact that the ordering
// of the dimensions in a halide_buffer_t doesn't relate to memory layout at
// all, so the strides could be in any order.
//
// We solve it by representing a copy job we need to perform as a
// device_copy struct. It describes a multi-dimensional array of
// copies to perform. Initially it describes copying over a single
// pixel at a time. We then try to discover contiguous groups of
// copies that can be coalesced into a single larger copy.
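//
// For example (illustrative numbers only): a 16x16 crop of a 256x256
// int32 image has a row stride of 256 pixels, so its rows are not
// adjacent in memory. The copy built below ends up as 16 tasks of 64
// contiguous bytes each, spaced 1024 bytes apart; copying the crop as
// one contiguous block would also overwrite the 240 pixels sitting
// between consecutive rows.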
// The struct that describes a host <-> dev copy to perform.
#define MAX_COPY_DIMS 16
struct device_copy {
uint64_t src, dst;
// The multidimensional array of contiguous copy tasks that need to be done.
uint64_t extent[MAX_COPY_DIMS];
// The strides (in bytes) that separate adjacent copy tasks in each dimension.
uint64_t stride_bytes[MAX_COPY_DIMS];
// How many contiguous bytes to copy per task
uint64_t chunk_size;
};
WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t off) {
// Skip size-1 dimensions
while (d >= 0 && copy.extent[d] == 1) d--;
if (d == -1) {
const void *from = (void *)(copy.src + off);
void *to = (void *)(copy.dst + off);
memcpy(to, from, copy.chunk_size);
} else {
for (uint64_t i = 0; i < copy.extent[d]; i++) {
copy_memory_helper(copy, d - 1, off);
off += copy.stride_bytes[d];
}
}
}
WEAK void copy_memory(const device_copy &copy, void *user_context) {
// If this is a zero copy buffer, these pointers will be the same.
if (copy.src != copy.dst) {
copy_memory_helper(copy, MAX_COPY_DIMS-1, 0);
} else {
debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n";
}
}
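// Note that copy_memory performs the copy with plain memcpy, so it is only
// usable when both copy.src and copy.dst are addressable from the host
// (e.g. the zero-copy case mentioned above). Discrete devices would
// typically issue one driver transfer per chunk instead (see the sketch at
// the end of this file).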
WEAK device_copy make_host_to_device_copy(const halide_buffer_t *buf) {
// Make a copy job representing copying the first pixel only.
device_copy c;
c.src = (uint64_t)buf->host;
c.dst = buf->device;
c.chunk_size = buf->type.bytes();
for (int i = 0; i < MAX_COPY_DIMS; i++) {
c.extent[i] = 1;
c.stride_bytes[i] = 0;
}
if (buf->dimensions > MAX_COPY_DIMS) {
// This condition should also be checked by the caller.
device_copy zero = {0};
return zero;
}
if (buf->type.bits == 0) {
// This buffer apparently represents no memory. Return a zero'd copy
// task.
device_copy zero = {0};
return zero;
}
// Now expand it to copy all the pixels (one at a time) by taking
// the extents and strides from the halide_buffer_t. Each dimension
// is inserted into the copy so that the strides end up in ascending
// order.
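//
// For example, a densely packed but transposed 100x100 int32 buffer might
// have dim[0].stride = 100 and dim[1].stride = 1; the insertion below
// reorders it so stride_bytes = {4, 400}, which lets the folding step
// further down collapse the whole thing into a single 40000-byte copy.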
for (int i = 0; i < buf->dimensions; i++) {
// TODO: deal with negative strides.
uint64_t stride_bytes = buf->dim[i].stride * buf->type.bytes();
// Insert the dimension into the copy, keeping the strides in ascending order.
int insert;
for (insert = 0; insert < i; insert++) {
// If the stride is 0, we put it at the end because it can't be
// folded.
if (stride_bytes < c.stride_bytes[insert] && stride_bytes != 0) {
break;
}
}
for (int j = i; j > insert; j--) {
c.extent[j] = c.extent[j - 1];
c.stride_bytes[j] = c.stride_bytes[j - 1];
}
// If the stride is 0, only copy it once.
c.extent[insert] = stride_bytes != 0 ? buf->dim[i].extent : 1;
debug(NULL) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
c.stride_bytes[insert] = stride_bytes;
}
// Attempt to fold contiguous dimensions into the chunk size. Since the
// dimensions are sorted by stride, and the strides must be greater than
// or equal to the chunk size, this means we can just delete the innermost
// dimension as long as its stride is equal to the chunk size.
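// E.g. with chunk_size = 4, stride_bytes = {4, 1024, 0, ...} and
// extent = {16, 16, 1, ...}, the first iteration folds the innermost
// dimension into chunk_size = 64 and shifts the rest down; the loop
// then stops because 64 != 1024, leaving 16 copies of 64 bytes each.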
while (c.chunk_size == c.stride_bytes[0]) {
// Fold the innermost dimension's extent into the chunk_size.
c.chunk_size *= c.extent[0];
// Erase the innermost dimension from the list of dimensions to
// iterate over.
for (int j = 1; j < MAX_COPY_DIMS; j++) {
c.extent[j-1] = c.extent[j];
c.stride_bytes[j-1] = c.stride_bytes[j];
}
c.extent[MAX_COPY_DIMS-1] = 1;
c.stride_bytes[MAX_COPY_DIMS-1] = 0;
}
return c;
}
WEAK device_copy make_device_to_host_copy(const halide_buffer_t *buf) {
// Just make a host-to-device copy and swap src and dst.
device_copy c = make_host_to_device_copy(buf);
uint64_t tmp = c.src;
c.src = c.dst;
c.dst = tmp;
return c;
}
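// A device backend's copy-to-device path would typically build a device_copy
// with make_host_to_device_copy and then walk its extents, issuing one driver
// transfer of chunk_size bytes per innermost task. A minimal sketch, assuming
// a hypothetical driver call my_driver_memcpy_h2d(dst, src, bytes); for
// brevity it only walks the first two dimensions, whereas a real backend
// would iterate over all MAX_COPY_DIMS (as copy_memory_helper does above):
//
//     int copy_to_device_sketch(void *user_context, halide_buffer_t *buf) {
//         device_copy c = make_host_to_device_copy(buf);
//         if (c.chunk_size == 0) {
//             return halide_error_code_generic_error;
//         }
//         for (uint64_t i = 0; i < c.extent[1]; i++) {
//             for (uint64_t j = 0; j < c.extent[0]; j++) {
//                 uint64_t off = i * c.stride_bytes[1] + j * c.stride_bytes[0];
//                 my_driver_memcpy_h2d(c.dst + off, c.src + off, c.chunk_size);
//             }
//         }
//         return 0;
//     }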
}}} // namespace Halide::Runtime::Internal
#endif // HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H