Spaces:
Paused
Paused
/*
 * General settings and functions
 */
// Warp width assumed by the warp-level reductions in this file.
const int WARP_SIZE = 32;
// Largest thread-block size that getNumThreads() will return.
const int MAX_BLOCK_SIZE = 1024;

/*
 * Picks a thread-block size for a workload of nElem elements.
 *
 * Returns the smallest entry of {32, 64, 128, 256, 512, MAX_BLOCK_SIZE} that
 * is >= nElem. Values of nElem above MAX_BLOCK_SIZE are clamped to
 * MAX_BLOCK_SIZE; zero or negative values yield the smallest entry (32).
 */
static int getNumThreads(int nElem) {
  const int threadSizes[] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
  // Derive the count from the table so the two cannot drift apart.
  const int numSizes = sizeof(threadSizes) / sizeof(threadSizes[0]);
  for (int i = 0; i < numSizes; ++i) {
    if (nElem <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  return MAX_BLOCK_SIZE;
}
/*
 * Reduction utilities
 */
/*
 * Warp shuffle (XOR / butterfly exchange): returns `value` as held by the
 * lane whose id is the caller's lane id XOR laneMask.
 *
 *   value    - per-lane value to exchange
 *   laneMask - XOR-ed with the caller's lane id to select the source lane
 *   width    - passed through to the intrinsic as its width argument
 *   mask     - participant mask handed to __shfl_xor_sync; the default names
 *              all 32 lanes, so every lane of the warp must execute the call
 *
 * Exactly one intrinsic is compiled: the *_sync form on CUDA 9.0+ toolkits,
 * the legacy form otherwise (where `mask` is ignored). Previously both calls
 * sat back to back, leaving an unreachable reference to __shfl_xor that can
 * fail to compile for targets on which the legacy intrinsic is not declared.
 */
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
                                           unsigned int mask = 0xffffffff) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}
/*
 * Returns the bit index (0 = least significant) of the highest set bit of
 * val, computed as 31 minus the leading-zero count from __clz.
 * For val == 0, __clz reports 32 leading zeros, so the result is -1.
 */
__device__ __forceinline__ int getMSB(int val) {
  return 31 - __clz(val);
}
/*
 * Two values of the same type that are stored and accumulated together.
 */
template <typename T>
struct Pair {
  T v1, v2;

  // Empty constructor: v1 and v2 are left uninitialized.
  __device__ Pair() {}

  // Sets the two components independently.
  __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}

  // Sets both components to the same value.
  __device__ Pair(T v) : v1(v), v2(v) {}

  // Sets both components from an int (converted to T).
  // NOTE(review): for T = int this has the same signature as Pair(T), so
  // Pair<int> cannot be instantiated -- confirm no caller needs it.
  __device__ Pair(int v) : v1(v), v2(v) {}

  // Component-wise accumulation; returns *this so calls can be chained.
  __device__ Pair &operator+=(const Pair<T> &a) {
    v1 += a.v1;
    v2 += a.v2;
    return *this;
  }
};
/*
 * Sums `val` over the lanes of the calling thread's warp and returns the
 * total in every lane.
 *
 * Implemented as an XOR butterfly over WARP_SHFL_XOR with offsets
 * 1, 2, 4, ... < WARP_SIZE: after the step with a given offset each lane
 * holds the sum of a group of 2 * offset lanes, so after the last step every
 * lane holds the sum of all WARP_SIZE lanes.
 *
 * Preconditions:
 *   - WARP_SHFL_XOR is called with its default participant mask, so all
 *     lanes of the warp must reach this call together.
 *   - T must support += and be a type accepted by WARP_SHFL_XOR.
 */
template <typename T>
static __device__ __forceinline__ T warpSum(T val) {
  // No shared-memory pass follows the butterfly: once it finishes every lane
  // already holds the complete sum, so adding the other lanes' values again
  // would return WARP_SIZE times the correct result.
  for (int offset = 1; offset < WARP_SIZE; offset <<= 1) {
    val += WARP_SHFL_XOR(val, offset, WARP_SIZE);
  }
  return val;
}
/*
 * Warp-level sum for Pair<T>: v1 and v2 are each reduced independently with
 * the scalar warpSum overload, and the pair of results is returned.
 * The argument is taken by value, so the caller's Pair is not modified.
 */
template <typename T>
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
  value.v1 = warpSum(value.v1);
  value.v2 = warpSum(value.v2);
  return value;
}