CUDA 设备向量

CUDA Device Vector

我想向我的内核传递一个向量。我有一个浮点向量 h_vec。

thrust::device_vector<float> d_vec = h_vec;
float* pd_vec = thrust::raw_pointer_cast(d_vec.data());
...
kernel<<<grid, block>>>(pd_vec)

但是在我的内核中，这个向量似乎全是零，我不知道为什么。

__global__ void kernel (float* pd_vec)

我是否需要在主机端使用 cudaMalloc 和 cudaMemcpy 专门为这个向量分配内存？

Is it required that I specifically allocate memory for this vector in host with cudaMalloc and cudaMemcpy?

不需要。在 thrust 容器（主机或设备）与 std::vector 之间，拷贝赋值（copy assignment）可以正常工作。

例如:

$ module load cuda/10.1

$ cat notreallyno.cu 

#include <thrust/device_vector.h>
#include <vector>
#include <cstdio>

// Debug kernel: every thread whose threadIdx.x is below n prints that index
// together with pd_vec[threadIdx.x]. Only threadIdx.x is consulted, so the
// block index plays no part in which element a thread reads.
//
//   pd_vec  pointer dereferenced on the device; must hold at least n floats
//   n       number of leading elements to print
__global__ void kernel (float* pd_vec, int n)
{
    const unsigned int tid = threadIdx.x;

    // Threads beyond the requested element count have nothing to print.
    if (!(tid < n))
        return;

    printf("%d %f \n", tid, pd_vec[tid]);
}

// Demonstrates that a std::vector can be copied straight into a
// thrust::device_vector and the resulting device pointer handed to a kernel.
//
// Returns 0 when the kernel launched and completed without a CUDA error,
// 1 otherwise (the error string is written to stderr).
int main()
{
    cudaError_t status = cudaSuccess;

    {
        // Inner scope: d_vec goes out of scope (and is destroyed) at the
        // closing brace, i.e. before cudaDeviceReset() is called below.
        std::vector<float> h_vec = { 1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f, 9.9f, 10.01f };

        // Host -> device copy is performed by the device_vector constructor.
        thrust::device_vector<float> d_vec = h_vec;
        float* pd_vec = thrust::raw_pointer_cast(d_vec.data());

        int n = static_cast<int>(h_vec.size());

        // Single block of 32 threads; the kernel guards against indices >= n.
        kernel<<<1, 32>>>(pd_vec, n);

        // A launch does not return an error code itself: query launch-time
        // failures first, then wait for the kernel so that execution-time
        // failures are reported by the synchronize call.
        status = cudaGetLastError();
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));

    cudaDeviceReset();

    return (status == cudaSuccess) ? 0 : 1;
}

$ nvcc -std=c++11 -arch=sm_52 -o notreallyno notreallyno.cu 

$ ./notreallyno 
0 1.100000 
1 2.200000 
2 3.300000 
3 4.400000 
4 5.500000 
5 6.600000 
6 7.700000 
7 8.800000 
8 9.900000 
9 10.010000