具有 OpenMP 多线程的 Cuda 设备内存变量产生错误的结果
Cuda device memory variables with OpenMP multithreading produce wrong results
我有一个函数,在这个函数中我在一个循环中连续调用一个 cuda 内核。该函数使用 OpenMP 在线程中并行执行。通过每次迭代,我更新变量 currentTime
为:
cudaMemcpyFromSymbolAsync(¤tTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
其中 minChangeTime
在内核中计算。不知何故,当使用 OpenMP 并行调用多个内核时,此变量 currentTime
的更新未正确完成。我在最后提供了一个可重现的代码。我期待的结果是:
0 65 186
1 130 251
2 195 316
3 260 381
4 325 446
...
但是在启用 OpenMP 时,我没有得到 121 的差异:
7 325 641
3 325 381
3 325 381
6 325 576
4 390 446
8 390 706
7 390 641
4 3063 446
我做错了什么或误解了什么?如果设备内存变量在这里不合适,那么什么是更好的变量类型?
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#define CUDA_DEVICE __device__
#define CUDA_GLOBAL __global__
#define CUDA_CONST __constant__
#else
#define CUDA_HOSTDEV
#define CUDA_DEVICE
#define CUDA_GLOBAL
#define CUDA_CONST
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <omp.h>
#include "helper_cuda.h"
#include "helper_functions.h"
CUDA_DEVICE int minChangeTime;
CUDA_DEVICE bool foundMinimum;
CUDA_GLOBAL void reduction(
int* cu_adjustment_time
){
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
__syncthreads();
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0){
atomicMin(&minChangeTime, cu_adjustment_time[tid+s]);
}
__syncthreads();
}
}
CUDA_GLOBAL void wh(int* cu_adjustment_time, int currentTime){
int tid = threadIdx.x + blockDim.x*blockIdx.x;
cu_adjustment_time[tid] = currentTime+tid;
}
void iteration_function(int *iRows, int time_data_index, int num_nets, cudaStream_t stream_id){
int currentTime = 0;
int limit = *iRows-1;
int starting_point = time_data_index;
time_data_index+=currentTime;
int* cu_adjustment_time;
cudaMalloc((void **)&cu_adjustment_time, sizeof(int) * (num_nets));
limit = (*iRows) - 1;
cudaStreamSynchronize(stream_id);
int loop = 0;
while(currentTime<limit){
cudaMemcpyToSymbolAsync(minChangeTime, &limit, sizeof(*iRows), 0, cudaMemcpyHostToDevice, stream_id);
wh<<<num_nets, 1, 0, stream_id>>>(
cu_adjustment_time,
currentTime
);
cudaStreamSynchronize(stream_id);
reduction<<<1, num_nets, 0, stream_id>>>(
cu_adjustment_time
);
cudaStreamSynchronize(stream_id);
cudaMemcpyFromSymbolAsync(¤tTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
cudaStreamSynchronize(stream_id);
currentTime+=num_nets;
time_data_index+=num_nets+1;
std::cout << loop << " " << currentTime << " " << time_data_index << std::endl;
loop++;
}
std::cout << "finished" << std::endl;
}
int main(){
//compiled with: nvcc no_fun.cu -Xcompiler=-fopenmp -o no_fun
int iRows = 3000;
int iter = 300;
int time_data_index = 121;
int num_nets = 64;
cudaStream_t streams[iter];
//#pragma omp parallel for simd schedule(dynamic) -> including this part causes undefined results
for(unsigned int j = 0; j < iter; j++){
cudaStreamCreate(&streams[j]);
iteration_function(&iRows, time_data_index, num_nets, streams[j]);
cudaStreamSynchronize(streams[j]);
cudaStreamDestroy(streams[j]);
}
}
当多个 reduction
内核同时 运行 时,全局变量 minChangeTime
存在竞争条件。
您需要为每个应该 运行 并行的内核提供单独的设备内存。最简单的方法是在每个线程中只使用 cudaMalloc minChangeTime
而不是将其声明为全局变量,并将其传递给内核。
我有一个函数,在这个函数中我在一个循环中连续调用一个 cuda 内核。该函数使用 OpenMP 在线程中并行执行。通过每次迭代,我更新变量 currentTime
为:
cudaMemcpyFromSymbolAsync(¤tTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
其中 minChangeTime
在内核中计算。不知何故,当使用 OpenMP 并行调用多个内核时,此变量 currentTime
的更新未正确完成。我在最后提供了一个可重现的代码。我期待的结果是:
0 65 186
1 130 251
2 195 316
3 260 381
4 325 446
...
但是在启用 OpenMP 时,我没有得到 121 的差异:
7 325 641
3 325 381
3 325 381
6 325 576
4 390 446
8 390 706
7 390 641
4 3063 446
我做错了什么或误解了什么?如果设备内存变量在这里不合适,那么什么是更好的变量类型?
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#define CUDA_DEVICE __device__
#define CUDA_GLOBAL __global__
#define CUDA_CONST __constant__
#else
#define CUDA_HOSTDEV
#define CUDA_DEVICE
#define CUDA_GLOBAL
#define CUDA_CONST
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <omp.h>
#include "helper_cuda.h"
#include "helper_functions.h"
CUDA_DEVICE int minChangeTime;
CUDA_DEVICE bool foundMinimum;
CUDA_GLOBAL void reduction(
int* cu_adjustment_time
){
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
__syncthreads();
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0){
atomicMin(&minChangeTime, cu_adjustment_time[tid+s]);
}
__syncthreads();
}
}
CUDA_GLOBAL void wh(int* cu_adjustment_time, int currentTime){
int tid = threadIdx.x + blockDim.x*blockIdx.x;
cu_adjustment_time[tid] = currentTime+tid;
}
void iteration_function(int *iRows, int time_data_index, int num_nets, cudaStream_t stream_id){
int currentTime = 0;
int limit = *iRows-1;
int starting_point = time_data_index;
time_data_index+=currentTime;
int* cu_adjustment_time;
cudaMalloc((void **)&cu_adjustment_time, sizeof(int) * (num_nets));
limit = (*iRows) - 1;
cudaStreamSynchronize(stream_id);
int loop = 0;
while(currentTime<limit){
cudaMemcpyToSymbolAsync(minChangeTime, &limit, sizeof(*iRows), 0, cudaMemcpyHostToDevice, stream_id);
wh<<<num_nets, 1, 0, stream_id>>>(
cu_adjustment_time,
currentTime
);
cudaStreamSynchronize(stream_id);
reduction<<<1, num_nets, 0, stream_id>>>(
cu_adjustment_time
);
cudaStreamSynchronize(stream_id);
cudaMemcpyFromSymbolAsync(¤tTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
cudaStreamSynchronize(stream_id);
currentTime+=num_nets;
time_data_index+=num_nets+1;
std::cout << loop << " " << currentTime << " " << time_data_index << std::endl;
loop++;
}
std::cout << "finished" << std::endl;
}
int main(){
//compiled with: nvcc no_fun.cu -Xcompiler=-fopenmp -o no_fun
int iRows = 3000;
int iter = 300;
int time_data_index = 121;
int num_nets = 64;
cudaStream_t streams[iter];
//#pragma omp parallel for simd schedule(dynamic) -> including this part causes undefined results
for(unsigned int j = 0; j < iter; j++){
cudaStreamCreate(&streams[j]);
iteration_function(&iRows, time_data_index, num_nets, streams[j]);
cudaStreamSynchronize(streams[j]);
cudaStreamDestroy(streams[j]);
}
}
当多个 reduction
内核同时 运行 时,全局变量 minChangeTime
存在竞争条件。
您需要为每个应该 运行 并行的内核提供单独的设备内存。最简单的方法是在每个线程中只使用 cudaMalloc minChangeTime
而不是将其声明为全局变量,并将其传递给内核。