CUDA 设备向量
CUDA Device Vector
我想把一个向量传给我的内核。我有一个 float 类型的主机端向量 h_vec,代码如下:
thrust::device_vector<float> d_vec = h_vec;
float* pd_vec = thrust::raw_pointer_cast(d_vec.data());
...
kernel<<<grid, block>>>(pd_vec)
但是在我的内核中,矢量似乎充满了零,我不知道为什么。
__global__ void kernel (float* pd_vec)
我是否需要在主机端使用 cudaMalloc 和 cudaMemcpy 专门为这个向量分配内存并进行拷贝?
Is it required that I specifically allocate memory for this vector in host with cudaMalloc and cudaMemcpy?
不需要。Thrust 容器(主机端或设备端)与 std::vector 之间的拷贝赋值可以正常工作。
例如:
$ module load cuda/10.1
$ cat notreallyno.cu
#include <thrust/device_vector.h>
#include <vector>
#include <cstdio>
// Debug kernel: prints every element of an array together with its index,
// one "index value" line per element.
//
// pd_vec  pointer to at least n floats. It is dereferenced inside the kernel,
//         so it must be device-accessible (e.g. the raw pointer obtained from
//         a thrust::device_vector via thrust::raw_pointer_cast).
// n       number of elements to print; n <= 0 prints nothing.
//
// Launch layout: any 1D grid/block configuration. The grid-stride loop over a
// flat global index prints each element exactly once regardless of how many
// blocks/threads are launched. No shared memory is used.
// Device-side printf is serialized and slow: debugging use only.
__global__ void kernel (float* pd_vec, int n)
{
    // Total number of threads in the (1D) grid.
    const int stride = gridDim.x * blockDim.x;

    // Signed index: a negative n fails the bound check instead of being
    // converted to a huge unsigned value and causing out-of-bounds reads.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        printf("%d %f \n", i, pd_vec[i]);
}
// Demonstrates that copy-constructing a thrust::device_vector from a
// std::vector transfers the data to the device: the kernel prints the same
// ten values that were placed in the host vector.
//
// Returns 0 on success, 1 if the kernel launch or its execution reported a
// CUDA error (the error string is written to stderr).
int main()
{
    cudaError_t status = cudaSuccess;

    {
        // Inner scope: d_vec goes out of scope (and releases its storage)
        // before cudaDeviceReset() is called below.
        std::vector<float> h_vec = { 1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f, 9.9f, 10.01f };

        // Copy construction performs the host -> device transfer.
        thrust::device_vector<float> d_vec = h_vec;
        float* pd_vec = thrust::raw_pointer_cast(d_vec.data());
        const int n = static_cast<int>(h_vec.size());

        kernel<<<1, 32>>>(pd_vec, n);

        // A launch does not return an error itself: bad launch configurations
        // show up in cudaGetLastError(), faults during execution show up at
        // the next synchronizing call.
        status = cudaGetLastError();
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));

    cudaDeviceReset();
    return (status == cudaSuccess) ? 0 : 1;
}
$ nvcc -std=c++11 -arch=sm_52 -o notreallyno notreallyno.cu
$ ./notreallyno
0 1.100000
1 2.200000
2 3.300000
3 4.400000
4 5.500000
5 6.600000
6 7.700000
7 8.800000
8 9.900000
9 10.010000
我想把一个向量传给我的内核。我有一个 float 类型的主机端向量 h_vec,代码如下:
thrust::device_vector<float> d_vec = h_vec;
float* pd_vec = thrust::raw_pointer_cast(d_vec.data());
...
kernel<<<grid, block>>>(pd_vec)
但是在我的内核中,矢量似乎充满了零,我不知道为什么。
__global__ void kernel (float* pd_vec)
我是否需要在主机端使用 cudaMalloc 和 cudaMemcpy 专门为这个向量分配内存并进行拷贝?
Is it required that I specifically allocate memory for this vector in host with cudaMalloc and cudaMemcpy?
不需要。Thrust 容器(主机端或设备端)与 std::vector 之间的拷贝赋值可以正常工作。
例如:
$ module load cuda/10.1
$ cat notreallyno.cu
#include <thrust/device_vector.h>
#include <vector>
#include <cstdio>
// Debug kernel: prints every element of an array together with its index,
// one "index value" line per element.
//
// pd_vec  pointer to at least n floats. It is dereferenced inside the kernel,
//         so it must be device-accessible (e.g. the raw pointer obtained from
//         a thrust::device_vector via thrust::raw_pointer_cast).
// n       number of elements to print; n <= 0 prints nothing.
//
// Launch layout: any 1D grid/block configuration. The grid-stride loop over a
// flat global index prints each element exactly once regardless of how many
// blocks/threads are launched. No shared memory is used.
// Device-side printf is serialized and slow: debugging use only.
__global__ void kernel (float* pd_vec, int n)
{
    // Total number of threads in the (1D) grid.
    const int stride = gridDim.x * blockDim.x;

    // Signed index: a negative n fails the bound check instead of being
    // converted to a huge unsigned value and causing out-of-bounds reads.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        printf("%d %f \n", i, pd_vec[i]);
}
// Demonstrates that copy-constructing a thrust::device_vector from a
// std::vector transfers the data to the device: the kernel prints the same
// ten values that were placed in the host vector.
//
// Returns 0 on success, 1 if the kernel launch or its execution reported a
// CUDA error (the error string is written to stderr).
int main()
{
    cudaError_t status = cudaSuccess;

    {
        // Inner scope: d_vec goes out of scope (and releases its storage)
        // before cudaDeviceReset() is called below.
        std::vector<float> h_vec = { 1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f, 9.9f, 10.01f };

        // Copy construction performs the host -> device transfer.
        thrust::device_vector<float> d_vec = h_vec;
        float* pd_vec = thrust::raw_pointer_cast(d_vec.data());
        const int n = static_cast<int>(h_vec.size());

        kernel<<<1, 32>>>(pd_vec, n);

        // A launch does not return an error itself: bad launch configurations
        // show up in cudaGetLastError(), faults during execution show up at
        // the next synchronizing call.
        status = cudaGetLastError();
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));

    cudaDeviceReset();
    return (status == cudaSuccess) ? 0 : 1;
}
$ nvcc -std=c++11 -arch=sm_52 -o notreallyno notreallyno.cu
$ ./notreallyno
0 1.100000
1 2.200000
2 3.300000
3 4.400000
4 5.500000
5 6.600000
6 7.700000
7 8.800000
8 9.900000
9 10.010000