This source file includes the following definitions:
- horizontal_pass
- horizontal_pass
- horizontal_pass
- int_to_uchar4
- horisontal_pass_8u_shfl_kernel
- horisontal_pass_8u_shfl
- vertical_pass
- vertical_pass
- integral
- integral
- integral
#pragma once
#ifndef __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__
#include "../../common.hpp"
#include "../../warp/shuffle.hpp"
#include "../../block/scan.hpp"
#include "../../ptr2d/glob.hpp"
namespace cv { namespace cudev {
namespace integral_detail
{
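// Row-wise inclusive scan: one thread block per image row. The row is consumed
// in chunks of NUM_SCAN_THREADS elements, each chunk is scanned with a
// block-wide inclusive scan, and carryElem accumulates the totals of the
// previous chunks so the partial sums carry across the whole row.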
template <int NUM_SCAN_THREADS, class SrcPtr, typename D>
__global__ void horizontal_pass(const SrcPtr src, GlobPtr<D> dst, const int cols)
{
__shared__ D smem[NUM_SCAN_THREADS * 2];
__shared__ D carryElem;
carryElem = 0;
__syncthreads();
D* dst_row = dst.row(blockIdx.x);
int numBuckets = divUp(cols, NUM_SCAN_THREADS);
int offsetX = 0;
while (numBuckets--)
{
const int curElemOffs = offsetX + threadIdx.x;
D curElem = 0.0f;
if (curElemOffs < cols)
curElem = src(blockIdx.x, curElemOffs);
const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);
if (curElemOffs < cols)
dst_row[curElemOffs] = carryElem + curScanElem;
offsetX += NUM_SCAN_THREADS;
__syncthreads();
if (threadIdx.x == NUM_SCAN_THREADS - 1)
{
carryElem += curScanElem;
}
__syncthreads();
}
}
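// Same algorithm, specialized for GlobPtr sources so the source row pointer is
// computed once per block instead of once per element.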
template <int NUM_SCAN_THREADS, typename T, typename D>
__global__ void horizontal_pass(const GlobPtr<T> src, GlobPtr<D> dst, const int cols)
{
__shared__ D smem[NUM_SCAN_THREADS * 2];
__shared__ D carryElem;
carryElem = 0;
__syncthreads();
const T* src_row = src.row(blockIdx.x);
D* dst_row = dst.row(blockIdx.x);
int numBuckets = divUp(cols, NUM_SCAN_THREADS);
int offsetX = 0;
while (numBuckets--)
{
const int curElemOffs = offsetX + threadIdx.x;
D curElem = 0.0f;
if (curElemOffs < cols)
curElem = src_row[curElemOffs];
const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);
if (curElemOffs < cols)
dst_row[curElemOffs] = carryElem + curScanElem;
offsetX += NUM_SCAN_THREADS;
__syncthreads();
if (threadIdx.x == NUM_SCAN_THREADS - 1)
{
carryElem += curScanElem;
}
__syncthreads();
}
}
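// Host-side launcher for the row scan: one 256-thread block per image row.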
template <class SrcPtr, typename D>
__host__ void horizontal_pass(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
{
const int NUM_SCAN_THREADS = 256;
const dim3 block(NUM_SCAN_THREADS);
const dim3 grid(rows);
horizontal_pass<NUM_SCAN_THREADS><<<grid, block, 0, stream>>>(src, dst, cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
}
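// Unpacks a 32-bit word into its four bytes (least significant byte first),
// i.e. four consecutive 8-bit pixels that were loaded as one uint.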
__device__ static uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
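// Shuffle-based horizontal pass for 8-bit input (sm_30 and newer). One block
// processes one row: each thread loads 16 consecutive pixels as a single uint4,
// forms their serial prefix sum in registers, then the per-thread totals are
// scanned with warp shuffles and combined across warps through sums[]. The
// final shfl_xor exchanges redistribute the 16 partial results between
// neighbouring lanes so the row can be written back with coalesced uint4 stores.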
__global__ static void horisontal_pass_8u_shfl_kernel(const GlobPtr<uint4> img, GlobPtr<uint4> integral)
{
#if CV_CUDEV_ARCH >= 300
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int k = 0; k < 16; ++k)
result[k] += n;
sum += n;
}
}
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int k = 0; k < 16; ++k)
result[k] += blockSum;
}
result[4] = shfl_xor(result[4] , 1, 32);
result[5] = shfl_xor(result[5] , 1, 32);
result[6] = shfl_xor(result[6] , 1, 32);
result[7] = shfl_xor(result[7] , 1, 32);
result[8] = shfl_xor(result[8] , 2, 32);
result[9] = shfl_xor(result[9] , 2, 32);
result[10] = shfl_xor(result[10], 2, 32);
result[11] = shfl_xor(result[11], 2, 32);
result[12] = shfl_xor(result[12], 3, 32);
result[13] = shfl_xor(result[13], 3, 32);
result[14] = shfl_xor(result[14], 3, 32);
result[15] = shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.row(blockIdx.x);
uint4 output;
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
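// Launches the shuffle-based horizontal pass with cols / 16 threads per block
// and one block per row. The caller (see the 8-bit integral() overload below)
// guarantees that cols is a multiple of 16 and that both buffers are 32-byte
// aligned, so the uchar/uint data can be reinterpreted as uint4.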
__host__ static void horisontal_pass_8u_shfl(const GlobPtr<uchar> src, GlobPtr<uint> integral, int rows, int cols, cudaStream_t stream)
{
const int block = cols / 16;
const int grid = rows;
CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(horisontal_pass_8u_shfl_kernel, cudaFuncCachePreferL1) );
GlobPtr<uint4> src4 = globPtr((uint4*) src.data, src.step);
GlobPtr<uint4> integral4 = globPtr((uint4*) integral.data, integral.step);
horisontal_pass_8u_shfl_kernel<<<grid, block, 0, stream>>>(src4, integral4);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
}
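// Column-wise pass, run in place on the output of the horizontal pass. On
// sm_30 and newer each 32x8 block scans a 32-column stripe with warp shuffles,
// carrying the column totals between row stripes in stepSum; older devices use
// the shared-memory fallback below.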
template <typename T>
__global__ void vertical_pass(GlobPtr<T> integral, const int rows, const int cols)
{
#if CV_CUDEV_ARCH >= 300
__shared__ T sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
T stepSum = 0;
int numBuckets = divUp(rows, blockDim.y);
int y = threadIdx.y;
while (numBuckets--)
{
T* p = integral.row(y) + tidx;
T sum = (tidx < cols) && (y < rows) ? *p : 0;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
T partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
T n = shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
if ((tidx < cols) && (y < rows))
{
*p = sum;
}
y += blockDim.y;
}
#else
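// Fallback for devices without warp shuffle: the stripe is processed in 32x32
// tiles. Each tile is staged in shared memory, every warp scans four
// 32-element columns of the tile with warpScanInclusive, and prevVals[] carries
// the running per-column totals from one tile of rows to the next.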
__shared__ T smem[32][32];
__shared__ T prevVals[32];
volatile T* smem_row = &smem[0][0] + 64 * threadIdx.y;
if (threadIdx.y == 0)
prevVals[threadIdx.x] = 0;
__syncthreads();
const int x = blockIdx.x * blockDim.x + threadIdx.x;
int numBuckets = divUp(rows, 8 * 4);
int offsetY = 0;
while (numBuckets--)
{
const int curRowOffs = offsetY + threadIdx.y;
T curElems[4];
T temp[4];
smem[threadIdx.y + 0][threadIdx.x] = 0.0f;
smem[threadIdx.y + 8][threadIdx.x] = 0.0f;
smem[threadIdx.y + 16][threadIdx.x] = 0.0f;
smem[threadIdx.y + 24][threadIdx.x] = 0.0f;
if (x < cols)
{
for (int i = 0; i < 4; ++i)
{
if (curRowOffs + i * 8 < rows)
smem[threadIdx.y + i * 8][threadIdx.x] = integral(curRowOffs + i * 8, x);
}
}
__syncthreads();
curElems[0] = smem[threadIdx.x][threadIdx.y ];
curElems[1] = smem[threadIdx.x][threadIdx.y + 8];
curElems[2] = smem[threadIdx.x][threadIdx.y + 16];
curElems[3] = smem[threadIdx.x][threadIdx.y + 24];
__syncthreads();
temp[0] = curElems[0] = warpScanInclusive(curElems[0], smem_row, threadIdx.x);
temp[1] = curElems[1] = warpScanInclusive(curElems[1], smem_row, threadIdx.x);
temp[2] = curElems[2] = warpScanInclusive(curElems[2], smem_row, threadIdx.x);
temp[3] = curElems[3] = warpScanInclusive(curElems[3], smem_row, threadIdx.x);
curElems[0] += prevVals[threadIdx.y ];
curElems[1] += prevVals[threadIdx.y + 8];
curElems[2] += prevVals[threadIdx.y + 16];
curElems[3] += prevVals[threadIdx.y + 24];
__syncthreads();
if (threadIdx.x == 31)
{
prevVals[threadIdx.y ] += temp[0];
prevVals[threadIdx.y + 8] += temp[1];
prevVals[threadIdx.y + 16] += temp[2];
prevVals[threadIdx.y + 24] += temp[3];
}
smem[threadIdx.y ][threadIdx.x] = curElems[0];
smem[threadIdx.y + 8][threadIdx.x] = curElems[1];
smem[threadIdx.y + 16][threadIdx.x] = curElems[2];
smem[threadIdx.y + 24][threadIdx.x] = curElems[3];
__syncthreads();
if (x < cols)
{
for (int i = 0; i < 4; ++i)
{
if (curRowOffs + i * 8 < rows)
integral(curRowOffs + i * 8, x) = smem[threadIdx.x][threadIdx.y + i * 8];
}
}
__syncthreads();
offsetY += 8 * 4;
}
#endif
}
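// Host-side launcher for the column scan: 32x8 thread blocks, one block per
// 32-column stripe; the kernel loops over the rows internally.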
template <typename T>
__host__ void vertical_pass(const GlobPtr<T>& integral, int rows, int cols, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(cols, block.x));
vertical_pass<<<grid, block, 0, stream>>>(integral, rows, cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
}
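// Generic integral image: row scan into dst, followed by an in-place column
// scan of dst. The blocking synchronization is performed only for the default
// stream.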
template <class SrcPtr, typename D>
__host__ void integral(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
{
horizontal_pass(src, dst, rows, cols, stream);
vertical_pass(dst, rows, cols, stream);
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
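// 8-bit specialization: uses the shuffle-optimized horizontal pass when the
// device supports it and the layout allows vectorized uint4 access (cols
// divisible by 16, 32-byte aligned src and dst); otherwise falls back to the
// generic row scan.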
__host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
{
if (deviceSupports(FEATURE_SET_COMPUTE_30)
&& (cols % 16 == 0)
&& reinterpret_cast<intptr_t>(src.data) % 32 == 0
&& reinterpret_cast<intptr_t>(dst.data) % 32 == 0)
{
horisontal_pass_8u_shfl(src, dst, rows, cols, stream);
}
else
{
horizontal_pass(src, dst, rows, cols, stream);
}
vertical_pass(dst, rows, cols, stream);
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
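// Signed 32-bit destination: the sums are accumulated in unsigned arithmetic
// over the same storage, so dst is simply reinterpreted as a GlobPtr<uint>.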
__host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
{
GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
integral(src, dstui, rows, cols, stream);
}
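// Usage sketch (illustrative only, not part of this header): a caller holding
// raw device buffers would be expected to wrap them in GlobPtr and forward a
// CUDA stream roughly as follows; the buffer names, step values and sizes here
// are hypothetical.
//
//   GlobPtr<uchar> srcPtr = globPtr(d_src, src_step); // src_step in bytes
//   GlobPtr<int>   dstPtr = globPtr(d_dst, dst_step);
//   integral_detail::integral(srcPtr, dstPtr, rows, cols, stream);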
}
}}
#endif