cudaThreadSynchronize() returned error code 6
I am trying to run code in CUDA that finds the maximum element of an array using parallel reduction:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

/* a is the array that holds the values and c is the array used to store the maximum in a block */
cudaError_t reduce_max(int *a,int *c,int size);

/* The kernel that performs the reduction */
__global__ void global_max(int *d_c, int *d_a)
{
    int myId=threadIdx.x+blockDim.x*blockIdx.x;
    int tid=threadIdx.x;

    for(int s=(blockDim.x)/2; s>0; s>>1)
    {
        if(tid<s)
        {
            d_a[myId]=max(d_a[myId],d_a[myId+s]);
        }
        __syncthreads();
    }

    if(tid==0)
    {
        d_c[blockIdx.x]=d_a[myId];
    }
}

int main()
{
    const int arraySize = 1024;
    int i;
    int a[arraySize];
    for(i=0;i<arraySize;i++)
    {
        a[i]=i;
    }
    int c[arraySize];

    cudaError_t cudaStatus = reduce_max(a,c,arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "The required operation failed");
        return 1;
    }

    cudaStatus = cudaThreadExit();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaThreadExit failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t reduce_max(int *a,int *c,int size)
{
    int *dev_a = 0;
    int *dev_c = 0;
    /* dev_a and dev_c are the arrays on the device */
    cudaError_t cudaStatus;
    const dim3 blockSize(64,1,1);
    const dim3 gridSize(size/blockSize.x,1,1);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    /* Allocating the memory on the device */
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    /* Copying array from host to device */
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    /* Calling the kernel */
    global_max<<<gridSize,blockSize>>>(dev_c, dev_a);

    cudaStatus = cudaThreadSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaThreadSynchronize returned error code %d\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    return cudaStatus;
}
But when I execute the code above I get the error:

cudaThreadSynchronize returned error code 6.

I cannot figure out what the problem is.
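(Note: the raw numeric code can be turned into a readable message with cudaGetErrorString. A minimal sketch of such a check; report_cuda_error is a hypothetical helper, not part of the original program:)

#include <stdio.h>
#include "cuda_runtime.h"

/* Sketch only: print a readable CUDA error message instead of the raw numeric code. */
static void report_cuda_error(const char *what, cudaError_t status)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    }
}

/* Example use, with the same synchronization call as in the code above: */
/* report_cuda_error("cudaThreadSynchronize", cudaThreadSynchronize()); */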
Your code runs forever, and as a result you hit a timeout.

This line is broken, and your compiler should be issuing a warning about it:

for(int s=(blockDim.x)/2; s>0; s>>1)

s>>1 does not modify the s variable. I'm pretty sure you meant s>>=1, which does modify s. If s is never modified, your loop runs forever, and as a result you hit a kernel timeout.

Do this instead:

for(int s=(blockDim.x)/2; s>0; s>>=1)
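For reference, a minimal sketch of the same kernel with only that loop corrected (everything else, including the 64-thread block launch, unchanged):

/* Sketch only: the original global_max kernel with s >>= 1, so the stride is
   actually halved each iteration and the loop terminates. */
__global__ void global_max(int *d_c, int *d_a)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    for (int s = blockDim.x / 2; s > 0; s >>= 1)  /* with 64-thread blocks: 32, 16, 8, 4, 2, 1 */
    {
        if (tid < s)
        {
            d_a[myId] = max(d_a[myId], d_a[myId + s]);
        }
        __syncthreads();  /* every thread in the block finishes this step before the next halving */
    }

    if (tid == 0)
    {
        d_c[blockIdx.x] = d_a[myId];  /* thread 0 writes the block's maximum */
    }
}

Each block still writes only its own maximum into d_c, exactly as in the original code.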
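If a single maximum over the whole array is wanted, the per-block results copied back into c still need one last pass. A minimal host-side sketch, assuming the corrected kernel above and the original arraySize and 64-thread block size:

/* Sketch only: finish the reduction on the host. Only the first
   arraySize / 64 entries of c are meaningful (one maximum per block). */
int overall_max = c[0];
for (int b = 1; b < arraySize / 64; b++)
{
    if (c[b] > overall_max)
    {
        overall_max = c[b];
    }
}
printf("maximum = %d\n", overall_max);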