如何在单独的 CUDA 函数中分配 GPU 内存?
How do you allocate GPU memory in a separate CUDA function?
我是 CUDA 新手,我相信我遇到的问题很简单、很容易修复,但我不确定该搜索什么关键词才能找到答案。我已经四处查找过,但没有结果。
我的代码中有几个函数要执行矩阵运算,因此我不想编写多次分配内存的代码,而是想使用一个函数来为我执行此操作。我的问题是内存位置没有传回调用我的 MatrixInitCUDA 函数的函数。
如果我直接在矩阵函数中分配内存,一切都按预期工作;但我遇到的问题是,指向设备内存的指针只在 MatrixInitCUDA 函数内部被赋值。
起初我以为参数可能发生了某种类型转换,所以我包含了 typeinfo 头文件,并在 cudaMalloc 之前和之后打印出设备参数的类型(没有变化——这并不奇怪)。我也尝试过为设备矩阵参数传递双指针,但似乎同样不起作用,不过也可能是我的做法不对。
// Compile using nvcc <file> -lcublas -o <output>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <typeinfo>
// Define block size for thread allocation
#define BLOCK_DIM 32
#define N 10
// Holds the height and width of the three matrices A, B and C.
typedef struct _matrixSize
{
    unsigned int A_height;
    unsigned int A_width;
    unsigned int B_height;
    unsigned int B_width;
    unsigned int C_height;
    unsigned int C_width;
} MatrixSize;
// Stores the given dimensions of matrices A, B and C into *matrixSize.
// Note the parameter order: for each matrix the width comes before the height.
void SetMatrixSize(MatrixSize *matrixSize,
                   unsigned int widthA, unsigned int heightA,
                   unsigned int widthB, unsigned int heightB,
                   unsigned int widthC, unsigned int heightC)
{
    MatrixSize &dims = *matrixSize;

    // Matrix A
    dims.A_width = widthA;
    dims.A_height = heightA;

    // Matrix B
    dims.B_width = widthB;
    dims.B_height = heightB;

    // Matrix C
    dims.C_width = widthC;
    dims.C_height = heightC;
}
// Allocates device buffers for matrices A, B and C, copies the host data into
// them, and prints diagnostics along the way.
//
// Parameters:
//   argc, argv        - not used inside this function.
//   devID             - reset to 0 and then handed to cudaGetDevice(); the
//                       return code of that call is ignored.
//   matrixSize        - heights/widths used to compute each buffer's byte size.
//   host_matrixA/B/C  - host source buffers for the host-to-device copies.
//   dev_matrixA/B/C   - device pointers, received BY VALUE. cudaMalloc() below
//                       writes into these local copies only, so the caller's
//                       pointers are unchanged when the function returns.
//
// Failures of cudaMalloc/cudaMemcpy are printed but not returned to the caller,
// and execution continues after a failure.
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float *dev_matrixA, float *dev_matrixB, float *dev_matrixC)
{
// Query the current device into devID (result of the call is not checked)
devID = 0;
cudaGetDevice(&devID);
cudaError_t err;
// Byte size of each matrix: height * width * sizeof(float).
// NOTE(review): height * width is evaluated in unsigned int before the
// multiplication by sizeof(float) widens the result to size_t.
size_t matrixA_size = matrixSize->A_height * matrixSize->A_width * sizeof(float);
printf("Allocation size: %d\tMatrix Size: %d\n", (int) matrixA_size, matrixSize->A_height * matrixSize->A_width);
size_t matrixB_size = matrixSize->B_height * matrixSize->B_width * sizeof(float);
size_t matrixC_size = matrixSize->C_height * matrixSize->C_width * sizeof(float);
// Debug: print the type name of the dev_matrixA parameter before allocating
printf("PRE ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
// Allocate memory on GPU.
// &dev_matrixA is the address of this function's own parameter, not of the
// caller's variable, so only the local copy receives the device address.
err = cudaMalloc((void **) &dev_matrixA, matrixA_size);
// Debug: print the type name again and the address held by the local copy
printf("POST ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
printf("DEV A POST ALLOC: %p\n", dev_matrixA);
if (err != cudaSuccess) printf("Allocate matrix A: %s\n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dev_matrixB, matrixB_size);
if (err != cudaSuccess) printf("Allocate matrix B: %s\n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dev_matrixC, matrixC_size);
if (err != cudaSuccess) printf("Allocate matrix C: %s\n", cudaGetErrorString(err));
// Copy data from host PC to GPU (uses the local device pointers set above)
err = cudaMemcpy(dev_matrixA, host_matrixA, matrixA_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix A to GPU: %s\n", cudaGetErrorString(err));
err =cudaMemcpy(dev_matrixB, host_matrixB, matrixB_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix B to GPU: %s\n", cudaGetErrorString(err));
err =cudaMemcpy(dev_matrixC, host_matrixC, matrixC_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix C to GPU: %s\n", cudaGetErrorString(err));
}
// Test driver: allocates zero-initialised N x N host matrices, calls
// MatrixInitCUDA() to set up the device side, and prints the device pointer
// for matrix A before and after that call.
//
// Returns 0 on success, or EXIT_FAILURE if a host allocation fails.
int main(int argc, char **argv)
{
    // Create memory for Layer 1, Layer 2, Layer 3 vectors
    // float *layer1 = malloc(784*sizeof(floats)))
    // Create memory for Weight 1->2, Weight 2->3 matrices
    // Layer 1 will read from file for input (X) values
    // Layer 2 and 3 will be calculated
    int devID = 0;
    cudaError_t err = cudaGetDevice(&devID);
    if (err != cudaSuccess) printf("Get device: %s\n", cudaGetErrorString(err));

    // Testing hadamard product, init function, and set matrix size function
    float *host_A = NULL, *host_B = NULL, *host_C = NULL;
    float *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;

    // calloc(count, size) returns zero-filled memory, or NULL on failure.
    const size_t elemCount = (size_t) N * N;
    MatrixSize *mallocTest = (MatrixSize *) calloc(1, sizeof(MatrixSize));
    host_A = (float *) calloc(elemCount, sizeof(float));
    host_B = (float *) calloc(elemCount, sizeof(float));
    host_C = (float *) calloc(elemCount, sizeof(float));
    if (mallocTest == NULL || host_A == NULL || host_B == NULL || host_C == NULL)
    {
        printf("Host allocation failed\n");
        // free(NULL) is a no-op, so every pointer can be released unconditionally.
        free(host_C);
        free(host_B);
        free(host_A);
        free(mallocTest);
        return EXIT_FAILURE;
    }

    SetMatrixSize(mallocTest, N, N, N, N, N, N);
    // %p expects a void *, hence the explicit cast.
    printf("DEV A PRE ALLOC: %p\n", (void *) dev_A);

    // Initialize memory on GPU
    MatrixInitCUDA(argc, argv, devID, mallocTest,
                   host_A, host_B, host_C,
                   dev_A, dev_B, dev_C);
    printf("DEV A POST INIT: %p\n", (void *) dev_A);

    // Release device buffers, then host buffers. cudaFree() performs no
    // operation on a null pointer, so these calls are safe whether or not
    // MatrixInitCUDA populated the pointers.
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    free(host_A);
    free(host_B);
    free(host_C);
    free(mallocTest);
    return 0;
}
这是我编译和运行这段代码得到的输出:
DEV A PRE ALLOC: (nil)
Allocation size: 400 Matrix Size: 100
PRE ALLOC TYPE: Pf
POST ALLOC TYPE: Pf
DEV A POST ALLOC: 0x10208400000
DEV A POST INIT: (nil)
有多种方法可以实现所需的行为。
方法一
其中一种方法是修改 MatrixInitCUDA 的参数,使其接受指向设备指针的指针(即双指针 **),并按如下方式修改代码:
修改函数签名:
// Method 1 signature: the device pointers are taken as float ** (pointer to
// the caller's pointer), so writes through them reach the caller's variables.
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float **dev_matrixA, float **dev_matrixB, float **dev_matrixC)
{
// (function body omitted in this snippet)
}
在 MatrixInitCUDA 内部按如下方式分配设备内存:
// dev_matrixA is already a float ** here, so it is passed without '&'.
// Later uses that need the device address itself (e.g. the cudaMemcpy
// destination) must dereference it: *dev_matrixA.
err = cudaMalloc((void **) dev_matrixA, matrixA_size);
然后像这样从 main 调用 MatrixInitCUDA:
// Pass the addresses of main's device pointers so MatrixInitCUDA can fill them in.
MatrixInitCUDA(argc, argv, devID, mallocTest,
host_A, host_B, host_C,
&dev_A, &dev_B, &dev_C);
方法二
我个人最喜欢的方法是不做上述任何改动,只把函数签名改为按引用接收设备指针(函数体和 main 中的调用都保持不变),如下所示:
// Method 2 signature: the device pointers are taken by reference (float *&),
// so each parameter is an alias for the caller's pointer variable.
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float *&dev_matrixA, float *&dev_matrixB, float *&dev_matrixC)
{
// (function body omitted in this snippet)
}
我是 CUDA 新手,我相信我遇到的问题很简单、很容易修复,但我不确定该搜索什么关键词才能找到答案。我已经四处查找过,但没有结果。
我的代码中有几个函数要执行矩阵运算,因此我不想编写多次分配内存的代码,而是想使用一个函数来为我执行此操作。我的问题是内存位置没有传回调用我的 MatrixInitCUDA 函数的函数。
如果我直接在矩阵函数中分配内存,一切都按预期工作;但我遇到的问题是,指向设备内存的指针只在 MatrixInitCUDA 函数内部被赋值。
起初我以为参数可能发生了某种类型转换,所以我包含了 typeinfo 头文件,并在 cudaMalloc 之前和之后打印出设备参数的类型(没有变化——这并不奇怪)。我也尝试过为设备矩阵参数传递双指针,但似乎同样不起作用,不过也可能是我的做法不对。
// Compile using nvcc <file> -lcublas -o <output>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <typeinfo>
// Define block size for thread allocation
#define BLOCK_DIM 32
#define N 10
// Holds the height and width of the three matrices A, B and C.
typedef struct _matrixSize
{
    unsigned int A_height;
    unsigned int A_width;
    unsigned int B_height;
    unsigned int B_width;
    unsigned int C_height;
    unsigned int C_width;
} MatrixSize;
// Stores the given dimensions of matrices A, B and C into *matrixSize.
// Note the parameter order: for each matrix the width comes before the height.
void SetMatrixSize(MatrixSize *matrixSize,
                   unsigned int widthA, unsigned int heightA,
                   unsigned int widthB, unsigned int heightB,
                   unsigned int widthC, unsigned int heightC)
{
    MatrixSize &dims = *matrixSize;

    // Matrix A
    dims.A_width = widthA;
    dims.A_height = heightA;

    // Matrix B
    dims.B_width = widthB;
    dims.B_height = heightB;

    // Matrix C
    dims.C_width = widthC;
    dims.C_height = heightC;
}
// Allocates device buffers for matrices A, B and C, copies the host data into
// them, and prints diagnostics along the way.
//
// Parameters:
//   argc, argv        - not used inside this function.
//   devID             - reset to 0 and then handed to cudaGetDevice(); the
//                       return code of that call is ignored.
//   matrixSize        - heights/widths used to compute each buffer's byte size.
//   host_matrixA/B/C  - host source buffers for the host-to-device copies.
//   dev_matrixA/B/C   - device pointers, received BY VALUE. cudaMalloc() below
//                       writes into these local copies only, so the caller's
//                       pointers are unchanged when the function returns.
//
// Failures of cudaMalloc/cudaMemcpy are printed but not returned to the caller,
// and execution continues after a failure.
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float *dev_matrixA, float *dev_matrixB, float *dev_matrixC)
{
// Query the current device into devID (result of the call is not checked)
devID = 0;
cudaGetDevice(&devID);
cudaError_t err;
// Byte size of each matrix: height * width * sizeof(float).
// NOTE(review): height * width is evaluated in unsigned int before the
// multiplication by sizeof(float) widens the result to size_t.
size_t matrixA_size = matrixSize->A_height * matrixSize->A_width * sizeof(float);
printf("Allocation size: %d\tMatrix Size: %d\n", (int) matrixA_size, matrixSize->A_height * matrixSize->A_width);
size_t matrixB_size = matrixSize->B_height * matrixSize->B_width * sizeof(float);
size_t matrixC_size = matrixSize->C_height * matrixSize->C_width * sizeof(float);
// Debug: print the type name of the dev_matrixA parameter before allocating
printf("PRE ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
// Allocate memory on GPU.
// &dev_matrixA is the address of this function's own parameter, not of the
// caller's variable, so only the local copy receives the device address.
err = cudaMalloc((void **) &dev_matrixA, matrixA_size);
// Debug: print the type name again and the address held by the local copy
printf("POST ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
printf("DEV A POST ALLOC: %p\n", dev_matrixA);
if (err != cudaSuccess) printf("Allocate matrix A: %s\n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dev_matrixB, matrixB_size);
if (err != cudaSuccess) printf("Allocate matrix B: %s\n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dev_matrixC, matrixC_size);
if (err != cudaSuccess) printf("Allocate matrix C: %s\n", cudaGetErrorString(err));
// Copy data from host PC to GPU (uses the local device pointers set above)
err = cudaMemcpy(dev_matrixA, host_matrixA, matrixA_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix A to GPU: %s\n", cudaGetErrorString(err));
err =cudaMemcpy(dev_matrixB, host_matrixB, matrixB_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix B to GPU: %s\n", cudaGetErrorString(err));
err =cudaMemcpy(dev_matrixC, host_matrixC, matrixC_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix C to GPU: %s\n", cudaGetErrorString(err));
}
// Test driver: allocates zero-initialised N x N host matrices, calls
// MatrixInitCUDA() to set up the device side, and prints the device pointer
// for matrix A before and after that call.
//
// Returns 0 on success, or EXIT_FAILURE if a host allocation fails.
int main(int argc, char **argv)
{
    // Create memory for Layer 1, Layer 2, Layer 3 vectors
    // float *layer1 = malloc(784*sizeof(floats)))
    // Create memory for Weight 1->2, Weight 2->3 matrices
    // Layer 1 will read from file for input (X) values
    // Layer 2 and 3 will be calculated
    int devID = 0;
    cudaError_t err = cudaGetDevice(&devID);
    if (err != cudaSuccess) printf("Get device: %s\n", cudaGetErrorString(err));

    // Testing hadamard product, init function, and set matrix size function
    float *host_A = NULL, *host_B = NULL, *host_C = NULL;
    float *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;

    // calloc(count, size) returns zero-filled memory, or NULL on failure.
    const size_t elemCount = (size_t) N * N;
    MatrixSize *mallocTest = (MatrixSize *) calloc(1, sizeof(MatrixSize));
    host_A = (float *) calloc(elemCount, sizeof(float));
    host_B = (float *) calloc(elemCount, sizeof(float));
    host_C = (float *) calloc(elemCount, sizeof(float));
    if (mallocTest == NULL || host_A == NULL || host_B == NULL || host_C == NULL)
    {
        printf("Host allocation failed\n");
        // free(NULL) is a no-op, so every pointer can be released unconditionally.
        free(host_C);
        free(host_B);
        free(host_A);
        free(mallocTest);
        return EXIT_FAILURE;
    }

    SetMatrixSize(mallocTest, N, N, N, N, N, N);
    // %p expects a void *, hence the explicit cast.
    printf("DEV A PRE ALLOC: %p\n", (void *) dev_A);

    // Initialize memory on GPU
    MatrixInitCUDA(argc, argv, devID, mallocTest,
                   host_A, host_B, host_C,
                   dev_A, dev_B, dev_C);
    printf("DEV A POST INIT: %p\n", (void *) dev_A);

    // Release device buffers, then host buffers. cudaFree() performs no
    // operation on a null pointer, so these calls are safe whether or not
    // MatrixInitCUDA populated the pointers.
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    free(host_A);
    free(host_B);
    free(host_C);
    free(mallocTest);
    return 0;
}
这是我编译和运行这段代码得到的输出:
DEV A PRE ALLOC: (nil)
Allocation size: 400 Matrix Size: 100
PRE ALLOC TYPE: Pf
POST ALLOC TYPE: Pf
DEV A POST ALLOC: 0x10208400000
DEV A POST INIT: (nil)
有多种方法可以实现所需的行为。
方法一
其中一种方法是修改 MatrixInitCUDA 的参数,使其接受指向设备指针的指针(即双指针 **),并按如下方式修改代码:
修改函数签名:
// Method 1 signature: the device pointers are taken as float ** (pointer to
// the caller's pointer), so writes through them reach the caller's variables.
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float **dev_matrixA, float **dev_matrixB, float **dev_matrixC)
{
// (function body omitted in this snippet)
}
在 MatrixInitCUDA 内部按如下方式分配设备内存:
// dev_matrixA is already a float ** here, so it is passed without '&'.
// Later uses that need the device address itself (e.g. the cudaMemcpy
// destination) must dereference it: *dev_matrixA.
err = cudaMalloc((void **) dev_matrixA, matrixA_size);
然后像这样从 main 调用 MatrixInitCUDA:
// Pass the addresses of main's device pointers so MatrixInitCUDA can fill them in.
MatrixInitCUDA(argc, argv, devID, mallocTest,
host_A, host_B, host_C,
&dev_A, &dev_B, &dev_C);
方法二
我个人最喜欢的方法是不做上述任何改动,只把函数签名改为按引用接收设备指针(函数体和 main 中的调用都保持不变),如下所示:
// Method 2 signature: the device pointers are taken by reference (float *&),
// so each parameter is an alias for the caller's pointer variable.
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float *&dev_matrixA, float *&dev_matrixB, float *&dev_matrixC)
{
// (function body omitted in this snippet)
}