How to perform reduction on a huge 2D matrix along the row direction using cuda? (max value and max value's index for each row)
I am trying to implement a reduction along the row direction of a 2D matrix. I started from code I found on Stack Overflow (thanks a lot, Robert!).
The link above shows a custom kernel that performs a reduction on a single row. It divides the input row among many blocks of 1024 threads each, and it works very well.
For the 2D case, everything is the same except that there is now a y grid dimension, so the y dimension of each block is still 1. The problem is that when I try to write the data into shared memory inside each block (inside the "max_idx_kernel_reduction_within_block" kernel in the code), it takes a very long time (more than (# of rows) * (the time it takes to perform the reduction on a single row); I would rather just run a for loop over the rows). I know I have a lot of elements, but I was expecting something faster than that.
I don't think the memory access pattern is the problem, but I have heard that the total amount of shared memory might be the limitation?? : CUDA: Is coalesced global memory access faster than shared memory? Also, does allocating a large shared memory array slow down the program?
Any suggestions to make my code faster (the first kernel is the bottleneck)? Thank you very much, any help is greatly appreciated!!
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cuda_runtime.h>
#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024 // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one
#define IDX2F(i,j,ld) ((j-1) * (ld) + ((i) - 1)) // 1 based indexing
#define IDX2C(i,j,ld) ((j) * (ld) + i) // 0 based indexing
__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel
__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){ // first kernel. Reduction within blocks
__shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
__shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values
int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
float my_val = FLOAT_MIN; // lowest possible number
int my_idx = -1; // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
// sweep from global memory
while (idx < xSize){ // this ensures you don't go out the size of the array's x direction
if (data[IDX2C(blockIdx.y,idx,ySize)] > my_val) {my_val = data[IDX2C(blockIdx.y,idx,ySize)]; my_idx = idx;}
// compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
idx += blockDim.x*gridDim.x;}
// until here takes about 6 ms !! very fast!!
// populate shared memory: takes ~ 270 ms
vals[threadIdx.x] = my_val; // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
idxs[threadIdx.x] = my_idx; // do this for index as well -> this is also slow!!
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i) // the first half threads of the block
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
// the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
// then puts the larger value into shared memory of threadIdx.x
__syncthreads();} // so now in each block, shared memory's first element (index 0) is the max value and max value index
// perform block-level reduction
if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
// remember, this is a global variable.
blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
__syncthreads();
}
}
// originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
__shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
__shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)
int idx = threadIdx.x;
float my_val = FLOAT_MIN;
int my_idx = -1; // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
while (idx < MAX_BLOCKS_X ){ // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
if (blk_vals[blockIdx.y][idx] > my_val)
{my_val = blk_vals[blockIdx.y][idx]; my_idx = blk_idxs[blockIdx.y][idx]; }
idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
// Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
// After this, each thread in the block has a local variable (max value and max value index).
// So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
// populate shared memory
vals[threadIdx.x] = my_val; // This is now shared memory. This is because reduction requires comparison between different elements
idxs[threadIdx.x] = my_idx; // my_idx value is 0 based. This is done for all blocks (in the y direction)
__syncthreads();
// Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1) {
if (threadIdx.x < i) // the first half threads of the block
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
// 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
if(!threadIdx.x){
result_maxInd[blockIdx.y] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
result_maxVal[blockIdx.y] = vals[0];
}
}
int main(){
dim3 grids(MAX_BLOCKS_X, NROWS);
dim3 threads(nTPB,1);
dim3 grids2(1,NROWS);
dim3 threads2(nTPB);
float *d_vector, *h_vector;
h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
for (int j = 1; j <= NCOLS; j++) {
for (int i = 1; i <= NROWS; i++) {
h_vector[IDX2F(i,j,NROWS)] = (float) (rand()/(float)RAND_MAX);
}
}
h_vector[IDX2F(2,5,NROWS)] = 10; // create definite max element
cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
int *max_index;
float *max_val;
int *d_max_index;
float *d_max_val;
max_index = (int*)malloc(NROWS * sizeof(int));
max_val = (float*)malloc(NROWS * sizeof(float));
cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));
max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);
cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);
for(int z=0;z<20;z++)
printf("%d ",max_index[z]);
printf("\n\n\n");
for(int z=0;z<20;z++)
printf("%f ",max_val[z]);
return 0;
}
Your code had various issues:
- You should use proper cuda error checking. That is just a standard boilerplate statement I make. I don't think your code was producing any runtime errors, however.
- You should validate your results. I seriously doubt the code was producing sensible results. The reasons why will become evident. If you want to prove this to yourself, modify your data initialization to something that is obviously and easily verifiable, as I have shown, without making any other changes, and you will see that your program produces errors.
- In your kernels, you are not indexing through the arrays correctly. Perhaps you do not understand the IDX2C and IDX2F macros. They are hurting you in two ways: you do not understand the pattern in which they index through your array, and they are killing your coalescing of global memory (because of the way you are using them). Whenever we have an index that is a function of threadIdx.x and threadIdx.y (or blockIdx.y in this case), then in order to maintain proper coalescing among adjacent threads, we want the component based on threadIdx.x not to be multiplied by any scaling factor. But the way you are passing parameters to IDX2C in your kernel violates this rule (and it also breaks the row-wise access pattern you want). So for now, let's get rid of those macros, since they are confusing the issue. (A minimal indexing sketch follows below.)
This is an illegal use of __syncthreads():
if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
// remember, this is a global variable.
blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
__syncthreads();
}
It is illegal to use __syncthreads() in conditional code unless the condition evaluates identically for every thread in the block. It is not needed there at all, so we will simply delete it.
Your printouts were indexing up to 20 instead of NROWS.
With the above changes, your code goes from being broken (producing incorrect results) to being fixed, and the kernel execution time on my system goes from 0.929 ms to 0.656 ms. I attribute all of that to the coalescing fix in the third item above.
When I profile the before and after cases with nvprof --metrics gld_efficiency ..., it shows 12.5% efficiency for the original code and 53% efficiency with the changes. Here is the modified code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024 // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one
__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel
__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){ // first kernel. Reduction within blocks
__shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
__shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values
int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
int idy = blockIdx.y;
float my_val = FLOAT_MIN; // lowest possible number
int my_idx = -1; // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
// sweep from global memory
while (idx < xSize){ // this ensures you don't go out the size of the array's x direction
float temp = data[idy*xSize+idx];
if (temp > my_val) {my_val = temp; my_idx = idx;}
// compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
idx += blockDim.x*gridDim.x;}
// until here takes about 6 ms !! very fast!!
// populate shared memory: takes ~ 270 ms
vals[threadIdx.x] = my_val; // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
idxs[threadIdx.x] = my_idx; // do this for index as well -> this is also slow!!
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i) // the first half threads of the block
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
// the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
// then puts the larger value into shared memory of threadIdx.x
__syncthreads();} // so now in each block, shared memory's first element (index 0) is the max value and max value index
// perform block-level reduction
if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
// remember, this is a global variable.
blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
}
}
// originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
__shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
__shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)
int idx = threadIdx.x;
int idy = blockIdx.y;
float my_val = FLOAT_MIN;
int my_idx = -1; // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
while (idx < MAX_BLOCKS_X ){ // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
float temp = blk_vals[idy][idx];
if (temp > my_val)
{my_val = temp; my_idx = blk_idxs[idy][idx]; }
idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
// Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
// After this, each thread in the block has a local variable (max value and max value index).
// So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
// populate shared memory
idx = threadIdx.x;
vals[idx] = my_val; // This is now shared memory. This is because reduction requires comparison between different elements
idxs[idx] = my_idx; // my_idx value is 0 based. This is done for all blocks (in the y direction)
__syncthreads();
// Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1) {
if (idx < i) // the first half threads of the block
if (vals[idx] < vals[idx + i]) {vals[idx] = vals[idx+i]; idxs[idx] = idxs[idx+i]; }
__syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
// 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
if(!threadIdx.x){
result_maxInd[idy] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
result_maxVal[idy] = vals[0];
}
}
int main(){
dim3 grids(MAX_BLOCKS_X, NROWS);
dim3 threads(nTPB,1);
dim3 grids2(1,NROWS);
dim3 threads2(nTPB);
float *d_vector, *h_vector;
h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
memset(h_vector, 0, NROWS*NCOLS*sizeof(float));
for (int i = 0; i < NROWS; i++)
h_vector[i*NCOLS + i] = 10.0f; // create definite max element per row
cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
int *max_index;
float *max_val;
int *d_max_index;
float *d_max_val;
max_index = (int*)malloc(NROWS * sizeof(int));
max_val = (float*)malloc(NROWS * sizeof(float));
cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start);
max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
printf("elapsed time: %fms\n", et);
cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);
for(int z=0;z<NROWS;z++)
printf("%d ",max_index[z]);
printf("\n\n\n");
for(int z=0;z<NROWS;z++)
printf("%f ",max_val[z]);
printf("\n");
return 0;
}
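As a footnote to the first two items above, here is a minimal sketch of the error checking and result validation I have in mind; the cudaCheck macro name and the reference loop are just illustrative:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Wrap every CUDA runtime call; print the error string and abort on failure.
#define cudaCheck(call) do { \
    cudaError_t err__ = (call); \
    if (err__ != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err__), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); } \
    } while (0)
// After a kernel launch:
//   max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);
//   cudaCheck(cudaGetLastError());       // catches launch-configuration errors
//   cudaCheck(cudaDeviceSynchronize());  // catches errors raised while the kernel runs
For validation, a host-side reference loop that recomputes the per-row maximum and compares it against the device results (assumes h_vector, max_index and max_val as in main() above):
for (int r = 0; r < NROWS; r++){
    float ref_val = FLOAT_MIN; int ref_idx = -1;
    for (int c = 0; c < NCOLS; c++)
        if (h_vector[r*NCOLS + c] > ref_val) { ref_val = h_vector[r*NCOLS + c]; ref_idx = c; }
    if (ref_idx != max_index[r] || ref_val != max_val[r])
        printf("mismatch at row %d: got (%d, %f), expected (%d, %f)\n", r, max_index[r], max_val[r], ref_idx, ref_val);
}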