This source file includes following definitions.
- integral_sum_cols
- integral_sum_rows
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#ifndef LOCAL_SUM_SIZE
#define LOCAL_SUM_SIZE 16
#endif
#define LOCAL_SUM_STRIDE (LOCAL_SUM_SIZE + 1)
kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,
__global uchar *buf_ptr, int buf_step, int buf_offset
#ifdef SUM_SQUARE
,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset
#endif
)
{
__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#ifdef SUM_SQUARE
__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#endif
int lid = get_local_id(0);
int gid = get_group_id(0);
int x = get_global_id(0);
int src_index = x + src_offset;
sumT accum = 0;
#ifdef SUM_SQUARE
sumSQT accum_sq = 0;
#endif
for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)
{
int lsum_index = lid;
#pragma unroll
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)
{
if ((x < cols) && (y + yin < rows))
{
__global const uchar *src = src_ptr + src_index;
accum += src[0];
#ifdef SUM_SQUARE
sumSQT temp = src[0] * src[0];
accum_sq += temp;
#endif
}
lm_sum[lsum_index] = accum;
#ifdef SUM_SQUARE
lm_sum_sq[lsum_index] = accum_sq;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));
#ifdef SUM_SQUARE
int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));
#endif
lsum_index = LOCAL_SUM_STRIDE * lid;
#pragma unroll
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)
{
__global sumT *buf = (__global sumT *)(buf_ptr + buf_index);
buf[0] = lm_sum[lsum_index];
buf_index += buf_step;
#ifdef SUM_SQUARE
__global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);
bufsq[0] = lm_sum_sq[lsum_index];
buf_sq_index += buf_sq_step;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,
#ifdef SUM_SQUARE
__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,
#endif
__global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols
#ifdef SUM_SQUARE
,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset
#endif
)
{
__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#ifdef SUM_SQUARE
__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
#endif
int lid = get_local_id(0);
int gid = get_group_id(0);
int gs = get_global_size(0);
int x = get_global_id(0);
__global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);
for (int xin = x; xin < cols; xin += gs)
{
dst[xin] = 0;
}
dst_offset += dst_step;
if (x < rows - 1)
{
dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));
dst[0] = 0;
}
int buf_index = mad24((int)sizeof(sumT), x, buf_offset);
sumT accum = 0;
#ifdef SUM_SQUARE
__global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);
for (int xin = x; xin < cols; xin += gs)
{
dst_sq[xin] = 0;
}
dst_sq_offset += dst_sq_step;
if (x < rows - 1)
{
dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));
dst_sq[0] = 0;
}
int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);
sumSQT accum_sq = 0;
#endif
for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)
{
int lsum_index = lid;
#pragma unroll
for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)
{
__global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);
accum += buf[0];
lm_sum[lsum_index] = accum;
buf_index += buf_step;
#ifdef SUM_SQUARE
__global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);
accum_sq += buf_sq[0];
lm_sum_sq[lsum_index] = accum_sq;
buf_sq_index += buf_sq_step;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
if (y + lid < cols)
{
int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));
#ifdef SUM_SQUARE
int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));
#endif
lsum_index = LOCAL_SUM_STRIDE * lid;
int yin_max = min(rows - 1 - LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);
#pragma unroll
for (int yin = 0; yin < yin_max; yin++, lsum_index++)
{
dst = (__global sumT *)(dst_ptr + dst_index);
dst[0] = lm_sum[lsum_index];
dst_index += dst_step;
#ifdef SUM_SQUARE
dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);
dst_sq[0] = lm_sum_sq[lsum_index];
dst_sq_index += dst_sq_step;
#endif
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}