Allocate a 1-dimensional array with cudaMallocPitch and then copy it to the device with cudaMemcpy2D
I have read this post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D along with many others, including the NVIDIA documentation, but I cannot get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very large matrix stored in array format (Matrix[width*height]) to the device, along with a plain array, in order to perform Matrix * vector operations. Using cudaMallocPitch is not optional for me if I want to avoid conflicts and get better performance.
So I started by simply trying to copy the matrix (a vector, in my case) to the device and check whether it was copied correctly, but my code prints nothing. If I use cudaMalloc and cudaMemcpy everything works fine. I just don't know how to handle cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    while (idx < N)
    {
        double e = *(double *)(((char *) A + idx * mpitch) + N);
        printf("(%f)", e);
    }
}

int main()
{
    int N = 1500;
    double * A = new double[N], * d_A;
    size_t pitch;
    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }
    cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
    cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
    cudaFree(d_A);
    delete [] A;
    return 0;
}
Error checking makes a huge difference when debugging. You should always use it before coming here.
It's not clear whether you need a row vector or a column vector, i.e. a [1xN] or [Nx1] matrix.
Following Talomnies' suggestion I have added an explanation, but first here are the working slabs of code.
Here is the [Nx1] case:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N) return;
    // Each element sits at the start of its own row: step idx rows in bytes.
    double e = *(double *)((char *) A + idx * mpitch);
    printf("(%f)", e);
}

int main()
{
    int N = 15;
    double * A = new double[N], * d_A;
    size_t pitch;
    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }
    // Column vector: each row is one double wide, and there are N rows.
    cudaError_t err = cudaMallocPitch((void **)&d_A, &pitch, sizeof(double), N);
    if (err != cudaSuccess) cout << "err0:" << cudaGetErrorString(err) << endl;
    // The host source is tightly packed, so its pitch is sizeof(double).
    err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) cout << "err1:" << cudaGetErrorString(err) << endl;
    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if (err != cudaSuccess) cout << "err2:" << cudaGetErrorString(err) << endl;
    cudaFree(d_A);
    delete [] A;
    return 0;
}
[1xN]:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;

__global__ void kernel(size_t mpitch, double * A, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N) return;
    int row = 0; // only one row
    // Step to the start of the row in bytes, then index the column in elements.
    double *row_ptr = (double *)((char *) A + mpitch * row);
    double e = row_ptr[idx];
    printf("(%f)", e);
}

int main()
{
    int N = 15;
    double * A = new double[N], * d_A;
    size_t pitch;
    for (int i = 0; i < N; ++i)
    {
        A[i] = i;
    }
    // Row vector: a single row that is N doubles wide.
    cudaError_t err = cudaMallocPitch((void **)&d_A, &pitch, sizeof(double) * N, 1);
    if (err != cudaSuccess) cout << "err0:" << cudaGetErrorString(err) << endl;
    err = cudaMemcpy2D(d_A, pitch, A, sizeof(double) * N, sizeof(double) * N, 1, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) cout << "err1:" << cudaGetErrorString(err) << endl;
    unsigned int blocksize = 1024;
    unsigned int nblocks = (N + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if (err != cudaSuccess) cout << "err2:" << cudaGetErrorString(err) << endl;
    cudaFree(d_A);
    delete [] A;
    return 0;
}
Explanation
First, error handling:
Given how easy error handling is in CUDA, there is no good reason not to include it.
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
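If you don't want to repeat that check after every call, a small wrapper macro is a common pattern. Here is a minimal sketch (the name CUDA_CHECK is just illustrative):

#include <cstdio>
#include <cstdlib>

// Evaluates a CUDA runtime call, prints the failing expression and the error
// string, and aborts on failure.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "%s failed: %s\n", #call,                 \
                    cudaGetErrorString(err_));                        \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMallocPitch((void **)&d_A, &pitch, sizeof(double), N));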
Second, you did not specify whether you want a column vector or a row vector. Since a row vector is simply a 1D array in linear memory and you don't need pitched memory for that, I will assume you mean a column vector.
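Just to illustrate that remark, a row vector works fine with plain linear memory; a minimal sketch reusing A and N from above:

double *d_row;
// No pitch involved: one contiguous run of N doubles.
cudaMalloc((void **)&d_row, N * sizeof(double));
cudaMemcpy(d_row, A, N * sizeof(double), cudaMemcpyHostToDevice);
// Inside a kernel the elements are then simply d_row[idx].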
The recurring problem you were running into was a "misaligned address" error in the kernel. That points to a bookkeeping problem, so let's look at the three main steps of handling a pitched 2D array (even though our array is only a column or row vector).
Allocating:
Your allocation was written as
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
which would be correct for a row vector, since the API is cudaMallocPitch(void** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows)
However, if we want a column vector, the correct call is
cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
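Note that for a column vector the returned pitch will usually be much larger than sizeof(double), because every one-element row gets padded up to the device's alignment. A quick way to see that (just a sketch):

size_t pitch;
double *d_col;
cudaMallocPitch((void **)&d_col, &pitch, sizeof(double), N);
// pitch is the padded row width in bytes; the exact value depends on the device.
cout << "requested row width: " << sizeof(double) << " bytes, pitch: " << pitch << " bytes" << endl;
cudaFree(d_col);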
Accessing:
For the access, you mixed up stepping to a row and accessing an element within that row.
double e = *(double *)(((char *) A + idx * mpitch) + N);
Again, stick with the documentation. The documentation for the cudaMallocPitch API includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
For us, this translates into
int column = 0;
double element = *((double *)((char *)A + idx * mpitch) + column);
I used column = 0 for completeness, since we only have one column.
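To spell out where the "misaligned address" comes from, compare the two forms byte by byte (a sketch using the same names as above):

// From the question: after stepping to the start of row idx, this adds N more
// *bytes*, not N elements. With N = 1500 that offset is not a multiple of
// sizeof(double), so the load ends up at a misaligned address.
double e_bad  = *(double *)(((char *)A + idx * mpitch) + N);

// From the documentation formula: rows are stepped in bytes, columns in elements.
double e_good = *((double *)((char *)A + idx * mpitch) + column);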
Copying:
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
This was correct for this case. The API for cudaMemcpy2D is
cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);
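For completeness, going the other direction works the same way; here is a sketch of copying the [Nx1] pitched array back into a tightly packed host buffer to verify the transfer (continuing the column-vector program above):

// Destination pitch is sizeof(double) because the host copy has no padding;
// each "row" of the column vector is one double wide and there are N rows.
double *check = new double[N];
err = cudaMemcpy2D(check, sizeof(double), d_A, pitch,
                   sizeof(double), N, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) cout << "err3:" << cudaGetErrorString(err) << endl;
for (int i = 0; i < N; ++i)
    if (check[i] != A[i]) cout << "mismatch at " << i << endl;
delete [] check;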