CUDA Reduction minimum value and index
I implemented a minimum reduction using CUDA 8 by following this very good explanation and modifying it:
__inline__ __device__ int warpReduceMin(int val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            val = tmpVal;
        }
    }
    return val;
}

__inline__ __device__ int blockReduceMin(int val)
{
    static __shared__ int shared[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    val = warpReduceMin(val);         // Each warp performs partial reduction

    if (lane == 0)
    {
        shared[wid] = val;            // Write reduced value to shared memory
    }

    __syncthreads();                  // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;

    if (wid == 0)
    {
        val = warpReduceMin(val);     // Final reduce within first warp
    }
    return val;
}

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x)
    {
        minVal = min(minVal, in[i]);
    }

    minVal = blockReduceMin(minVal);

    if (threadIdx.x == 0)
    {
        atomicMin(out, minVal);
    }
}
This works well and I get the minimum value. However, I don't care about the minimum value itself, only about its index in the original input array.
I tried modifying my code a little:
__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            *idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here
            val = tmpVal;
        }
    }
    return val;
}
...
blockReduceMin stayed the same, only adding idx to the function calls
...
__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    int minIdx = 0; // Added this
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x)
    {
        if (in[i] < minVal)
        {
            minVal = in[i];
            minIdx = i; // Added this
        }
    }

    minVal = blockReduceMin(minVal, &minIdx);

    if (threadIdx.x == 0)
    {
        int old = atomicMin(out, minVal);
        if (old != minVal) // value was updated
        {
            atomicExch(out + 1, minIdx);
        }
    }
}
But it doesn't work. I feel like I'm missing something important here and that this isn't the right way to go about it, but my search turned up nothing.
There are several problems here. You need to modify both the warp and block minimum functions so that they propagate the minimum value and its index every time a new local minimum is found. Perhaps something like this:
__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        int tmpVal = __shfl_down(val, offset);
        int tmpIdx = __shfl_down(idx, offset);
        if (tmpVal < val) {
            val = tmpVal;
            idx = tmpIdx;
        }
    }
}

__inline__ __device__ void blockReduceMin(int& val, int& idx)
{
    static __shared__ int values[32], indices[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    warpReduceMin(val, idx);     // Each warp performs partial reduction

    if (lane == 0) {
        values[wid] = val;       // Write reduced value to shared memory
        indices[wid] = idx;      // Write reduced index to shared memory
    }

    __syncthreads();             // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = values[lane];
        idx = indices[lane];
    } else {
        val = INT_MAX;
        idx = 0;
    }

    if (wid == 0) {
        warpReduceMin(val, idx); // Final reduce within first warp
    }
}
[Note: written in a browser, never compiled or tested, use at your own risk]
That should get each block to the correct local minimum value and index. Then you have a second problem. This:
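Note that because blockReduceMin now returns void and takes references, the call site in the kernel would presumably become blockReduceMin(minVal, minIdx); rather than minVal = blockReduceMin(minVal, &minIdx);.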
int old = atomicMin(out, minVal);
if (old != minVal) // value was updated
{
    atomicExch(out + 1, minIdx);
}
is broken. There is no guarantee that the minimum value and its index will be set correctly by this code, because there is no guarantee of any synchronisation between the two atomic operations, and there is a potential race in which one block may correctly overwrite another block's minimum value, only to then have its index overwritten by the block it replaced. The only solution here is some sort of mutex, or to run a second reduction kernel over the per-block results.
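As a minimal, untested sketch of the second-kernel option, assuming the blockReduceMin(int& val, int& idx) above: the kernel names and the blockVals / blockIdxs scratch arrays are purely illustrative, not part of the original code.

// Pass 1: each block finds its local minimum and index and writes the pair
// to a per-block scratch array instead of using atomics on a shared result.
__global__ void perBlockMinKernel(const int* in, int* blockVals, int* blockIdxs, int N)
{
    int minVal = INT_MAX;
    int minIdx = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
    {
        if (in[i] < minVal)
        {
            minVal = in[i];
            minIdx = i;
        }
    }
    blockReduceMin(minVal, minIdx);      // block-wide (value, index) reduction from above
    if (threadIdx.x == 0)
    {
        blockVals[blockIdx.x] = minVal;  // one pair per block, no atomics needed
        blockIdxs[blockIdx.x] = minIdx;
    }
}

// Pass 2: launched with a single block; reduces the per-block pairs to the final result.
__global__ void finalMinKernel(const int* blockVals, const int* blockIdxs, int* out, int numBlocks)
{
    int minVal = INT_MAX;
    int minIdx = 0;
    for (int i = threadIdx.x; i < numBlocks; i += blockDim.x)
    {
        if (blockVals[i] < minVal)
        {
            minVal = blockVals[i];
            minIdx = blockIdxs[i];
        }
    }
    blockReduceMin(minVal, minIdx);
    if (threadIdx.x == 0)
    {
        out[0] = minVal;                 // out[0] = minimum value, out[1] = its index
        out[1] = minIdx;
    }
}

The first kernel is launched over the whole input (with a block size that is a multiple of warpSize, as the block reduction above assumes) and the second with one block over the numBlocks pairs, e.g. perBlockMinKernel<<<blocks, threads>>>(in, blockVals, blockIdxs, N); followed by finalMinKernel<<<1, threads>>>(blockVals, blockIdxs, out, blocks);. Because the value and its index travel together through both passes, there is no window in which competing atomics can tear them apart.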