抛出 'thrust::system::system_error' 实例后程序被终止,what():parallel_for 失败:cudaErrorInvalidValue:参数无效
terminate called after throwing an instance of 'thrust::system::system_error' what(): parallel_for failed: cudaErrorInvalidValue: invalid argument
我正在尝试统计 curand_uniform() 返回 1.0 的次数,但是下面的代码始终无法正常运行:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
using namespace std;
/**
 * Counts how many curand_uniform() draws compare equal to 1.0 and adds the
 * per-thread tally to *sum.
 *
 * Each thread starts at its flat global index and advances by the total
 * number of launched threads (blockDim.x * gridDim.x) until it reaches
 * `length`, drawing one sample per step.
 *
 * @param length total number of samples to draw across all threads
 * @param sum    accumulator updated with atomicAdd; the kernel does not
 *               zero it, so the caller must initialize it before the launch
 * @param state  generator states, indexed by flat global thread index; the
 *               read is not bounds-checked, so it needs one entry per
 *               launched thread
 */
__global__
void counts(int length, int *sum, curandStatePhilox4_32_10_t* state) {
    // Flat global thread index; it also selects this thread's generator state.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Work on a private copy of the state. The advanced copy is never stored
    // back into state[].
    curandStatePhilox4_32_10_t rng = state[i];

    int hits = 0;
    while (i < length) {
        const double sample = curand_uniform(&rng);
        hits += (sample == 1.0) ? 1 : 0;
        i += blockDim.x * gridDim.x;
    }

    // Every thread contributes its tally, including a tally of zero.
    atomicAdd(sum, hits);
}
/**
 * Initializes one Philox generator state per launched thread.
 *
 * Each thread calls curand_init() with the shared seed, its own flat global
 * thread index as the second argument, and 0 as the third argument, writing
 * the result into state[index].
 *
 * @param state output array; the write is not bounds-checked, so it needs
 *              one entry per launched thread
 * @param seed  seed value passed unchanged to every curand_init() call
 */
__global__
void curand_setup(curandStatePhilox4_32_10_t *state, long seed) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, tid, 0, state + tid);
}
// Host driver: allocates a device counter, sets up one generator state per
// thread, launches curand_setup and counts, then copies the counter back and
// prints it.
// NOTE(review): none of the CUDA runtime calls or kernel launches below have
// their error status checked.
// NOTE(review): cout/endl and time() are used, but <iostream> and <ctime> are
// not among the headers included by this listing; presumably they arrive
// transitively -- confirm.
int main(int argc, char *argv[]) {
// Total number of samples, passed to counts() as `length`.
const int N = 1e5;
// NOTE(review): this is a null pointer, not an int initialized to zero; no
// host storage is allocated for the counter anywhere in this function.
int* count_h = 0;
int* count_d;
cudaMalloc(&count_d, sizeof(int) );
// NOTE(review): the source of this copy is the null count_h, so the device
// counter is not reliably zeroed before counts() accumulates into it.
cudaMemcpy(count_d, count_h, sizeof(int), cudaMemcpyHostToDevice);
// Launch shape shared by both kernels: 192 blocks of 64 threads.
int threads_per_block = 64;
int Nblocks = 32*6;
// One generator state per launched thread.
// NOTE(review): the sized device_vector constructor initializes its elements
// on the device; this is reportedly where the thrust::system_error
// ("parallel_for failed") is thrown for this element type -- confirm.
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
// Seed every state from the current wall-clock time.
curand_setup<<<Nblocks, threads_per_block>>>(d_state.data().get(), time(0));
counts<<<Nblocks, threads_per_block>>>(N, count_d, d_state.data().get());
// NOTE(review): the destination of this copy is the null count_h.
cudaMemcpy(count_h, count_d, sizeof(int), cudaMemcpyDeviceToHost);
// NOTE(review): this streams the pointer value, not the counted integer.
cout << count_h << endl;
cudaFree(count_d);
// count_h is still null here, so this free() has nothing to release.
free(count_h);
}
我在终端中(Linux 下)
收到如下错误:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): parallel_for failed: cudaErrorInvalidValue: invalid argument
Aborted (core dumped)
我是这样编译的:
nvcc -Xcompiler "-fopenmp" -o test uniform_one_hit_count.cu
我不明白这个错误信息。
这一行:
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
会在设备上初始化一个新的向量。thrust 在执行此操作时,会调用元素类型的构造函数,在本例中该类型是 curandStatePhilox4_32_10
,它是定义在 /usr/local/cuda/include/curand_philox4x32_x.h
中的一个结构体(至少在 Linux 上是这个路径)。遗憾的是,该结构体的定义没有提供任何用 __device__
修饰的构造函数,这就给 thrust 带来了麻烦。
一个简单的解决方法是先在主机上构造这个向量,再将其复制到设备:
thrust::host_vector<curandStatePhilox4_32_10_t> h_state(Nblocks*threads_per_block);
thrust::device_vector<curandStatePhilox4_32_10_t> d_state = h_state;
或者,直接使用 cudaMalloc 分配空间:
curandStatePhilox4_32_10_t *d_state;
cudaMalloc(&d_state, (Nblocks*threads_per_block)*sizeof(d_state[0]));
您的代码至少还有一个问题。下面这一行实际上并没有为指针所指向的内容分配任何存储空间:
int* count_h = 0;
之后,您应该执行以下操作:
count_h = (int *)malloc(sizeof(int));
memset(count_h, 0, sizeof(int));
在输出结果的那一行,您很可能是想这样写:
cout << count_h[0] << endl;
解决 count_h
问题的另一种方法是把它声明为普通的 int 变量:
int count_h = 0;
这种做法需要对您的代码做另一组不同的修改(涉及各处 cudaMemcpy
调用)。
我正在尝试统计 curand_uniform() 返回 1.0 的次数,但是下面的代码始终无法正常运行:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
using namespace std;
/**
 * Counts how many curand_uniform() draws compare equal to 1.0 and adds the
 * per-thread tally to *sum.
 *
 * Each thread starts at its flat global index and advances by the total
 * number of launched threads (blockDim.x * gridDim.x) until it reaches
 * `length`, drawing one sample per step.
 *
 * @param length total number of samples to draw across all threads
 * @param sum    accumulator updated with atomicAdd; the kernel does not
 *               zero it, so the caller must initialize it before the launch
 * @param state  generator states, indexed by flat global thread index; the
 *               read is not bounds-checked, so it needs one entry per
 *               launched thread
 */
__global__
void counts(int length, int *sum, curandStatePhilox4_32_10_t* state) {
    // Flat global thread index; it also selects this thread's generator state.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Work on a private copy of the state. The advanced copy is never stored
    // back into state[].
    curandStatePhilox4_32_10_t rng = state[i];

    int hits = 0;
    while (i < length) {
        const double sample = curand_uniform(&rng);
        hits += (sample == 1.0) ? 1 : 0;
        i += blockDim.x * gridDim.x;
    }

    // Every thread contributes its tally, including a tally of zero.
    atomicAdd(sum, hits);
}
/**
 * Initializes one Philox generator state per launched thread.
 *
 * Each thread calls curand_init() with the shared seed, its own flat global
 * thread index as the second argument, and 0 as the third argument, writing
 * the result into state[index].
 *
 * @param state output array; the write is not bounds-checked, so it needs
 *              one entry per launched thread
 * @param seed  seed value passed unchanged to every curand_init() call
 */
__global__
void curand_setup(curandStatePhilox4_32_10_t *state, long seed) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, tid, 0, state + tid);
}
// Host driver: allocates a device counter, sets up one generator state per
// thread, launches curand_setup and counts, then copies the counter back and
// prints it.
// NOTE(review): none of the CUDA runtime calls or kernel launches below have
// their error status checked.
// NOTE(review): cout/endl and time() are used, but <iostream> and <ctime> are
// not among the headers included by this listing; presumably they arrive
// transitively -- confirm.
int main(int argc, char *argv[]) {
// Total number of samples, passed to counts() as `length`.
const int N = 1e5;
// NOTE(review): this is a null pointer, not an int initialized to zero; no
// host storage is allocated for the counter anywhere in this function.
int* count_h = 0;
int* count_d;
cudaMalloc(&count_d, sizeof(int) );
// NOTE(review): the source of this copy is the null count_h, so the device
// counter is not reliably zeroed before counts() accumulates into it.
cudaMemcpy(count_d, count_h, sizeof(int), cudaMemcpyHostToDevice);
// Launch shape shared by both kernels: 192 blocks of 64 threads.
int threads_per_block = 64;
int Nblocks = 32*6;
// One generator state per launched thread.
// NOTE(review): the sized device_vector constructor initializes its elements
// on the device; this is reportedly where the thrust::system_error
// ("parallel_for failed") is thrown for this element type -- confirm.
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
// Seed every state from the current wall-clock time.
curand_setup<<<Nblocks, threads_per_block>>>(d_state.data().get(), time(0));
counts<<<Nblocks, threads_per_block>>>(N, count_d, d_state.data().get());
// NOTE(review): the destination of this copy is the null count_h.
cudaMemcpy(count_h, count_d, sizeof(int), cudaMemcpyDeviceToHost);
// NOTE(review): this streams the pointer value, not the counted integer.
cout << count_h << endl;
cudaFree(count_d);
// count_h is still null here, so this free() has nothing to release.
free(count_h);
}
我在终端中(Linux 下)收到如下错误:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): parallel_for failed: cudaErrorInvalidValue: invalid argument
Aborted (core dumped)
我是这样编译的:
nvcc -Xcompiler "-fopenmp" -o test uniform_one_hit_count.cu
我不明白这个错误信息。
这一行:
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
会在设备上初始化一个新的向量。thrust 在执行此操作时,会调用元素类型的构造函数,在本例中该类型是 curandStatePhilox4_32_10
,它是定义在 /usr/local/cuda/include/curand_philox4x32_x.h
中的一个结构体(至少在 Linux 上是这个路径)。遗憾的是,该结构体的定义没有提供任何用 __device__
修饰的构造函数,这就给 thrust 带来了麻烦。
一个简单的解决方法是先在主机上构造这个向量,再将其复制到设备:
thrust::host_vector<curandStatePhilox4_32_10_t> h_state(Nblocks*threads_per_block);
thrust::device_vector<curandStatePhilox4_32_10_t> d_state = h_state;
或者,直接使用 cudaMalloc 分配空间:
curandStatePhilox4_32_10_t *d_state;
cudaMalloc(&d_state, (Nblocks*threads_per_block)*sizeof(d_state[0]));
您的代码至少还有一个问题。下面这一行实际上并没有为指针所指向的内容分配任何存储空间:
int* count_h = 0;
之后,您应该执行以下操作:
count_h = (int *)malloc(sizeof(int));
memset(count_h, 0, sizeof(int));
在输出结果的那一行,您很可能是想这样写:
cout << count_h[0] << endl;
解决 count_h
问题的另一种方法是把它声明为普通的 int 变量:
int count_h = 0;
这种做法需要对您的代码做另一组不同的修改(涉及各处 cudaMemcpy
调用)。