cuda岩浆矩阵-矩阵加法核
cuda magma matrix-matrix addition kernel
我尝试使用与 magmablas_sgeadd_q 内核类似的格式,但是我没有得到正确的输出,而且每次我 运行 它,我都会得到不同的输出。
我使用的代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
if ( full )
{
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for( int j=0; j < BLK_Y && iby+j < n; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main ( void )
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < n ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads( BLK_X, 1 );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
//Error checkng
printf("Matrix C ");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
我得到的输出:
4行2列的矩阵加法
将输入数据从主机内存复制到 CUDA 设备
CUDA 内核启动时有 4 个块,每块 2 个线程
将输出数据从 CUDA 设备复制到主机内存
A为0.000000,B为0.364784,C为0.364784
A为0.000000,B为0.952230,C为0.952230
A为0.000000,B为0.000000,C为0.000000
A为0.000000,B为0.000000,C为0.000000
A为0.840188,B为0.394383,C为1.234571
A为0.783099,B为0.798440,C为1.581539
A为0.911647,B为0.197551,C为1.109199
A为0.335223,B为0.768230,C为1.103452
矩阵A
0.840188 0.783099
0.911647 0.335223
0.277775 0.477397
0.364784 0.952230
矩阵 B
0.394383 0.798440
0.197551 0.768230
0.553970 0.628871
0.000000 0.000000
矩阵C
0.0000000.000000
0.0000000.000000
0.0000000.000000
0.0000000.000000
如果您发现代码有问题,请告诉我。
谢谢
我发现了两个编码错误:
当您使用此方法 "bump" 内核中矩阵 dA
和 dB
的基指针时,您还必须 对矩阵 dC
:
的基指针做同样的事情
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
dC += ind + iby*lddb; // add this line
您的主机代码嵌套 for 循环没有正确索引。外循环旨在对行进行索引,并且您有 n
行,但是您允许外循环对 m
行进行索引:
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < n ; j ++)
那么当你在这里进行实际的索引计算时:
h_A[i*m+j] = rand()/(float)RAND_MAX;
您的索引超出范围。 (i*m
超出了矩阵大小,对于 i
的某些值)这个问题在主机代码中的所有嵌套 for 循环中重复出现。解决方法是反转 i
、j
循环中的 m
、n
边界。
以下代码修复了这些错误(加上您遗漏的变量定义的一些补充 - err
和 flag
在您当前发布的代码中未定义 - 这会产生编译错误。)它似乎 运行 正确并产生正确的结果:
$ cat t1213.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
dC += ind + iby*lddb;
if ( full )
{
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for( int j=0; j < BLK_Y && iby+j < n; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main ( void )
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < m ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
cudaError_t err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads( BLK_X, BLK_Y );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
int flag = 0;
//Error checkng
printf("Matrix C ");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
$ nvcc -o t1213 t1213.cu
$ cuda-memcheck ./t1213
========= CUDA-MEMCHECK
Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
Copy output data from the CUDA device to the host memory
A is 0.277775, B is 0.553970, C is 0.831745
A is 0.477397, B is 0.628871, C is 1.106268
A is 0.364784, B is 0.513401, C is 0.878185
A is 0.952230, B is 0.916195, C is 1.868425
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
Matrix A 0.840188 0.783099 0.911647 0.335223
0.277775 0.477397 0.364784 0.952230
Matrix B 0.394383 0.798440 0.197551 0.768230
0.553970 0.628871 0.513401 0.916195
Matrix C 1.2345711.5815391.1091991.103452
0.8317451.1062680.8781851.868425
Test PASSED
Done
========= ERROR SUMMARY: 0 errors
$
我尝试使用与 magmablas_sgeadd_q 内核类似的格式,但是我没有得到正确的输出,而且每次我 运行 它,我都会得到不同的输出。 我使用的代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
if ( full )
{
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for( int j=0; j < BLK_Y && iby+j < n; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main ( void )
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < n ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads( BLK_X, 1 );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
//Error checkng
printf("Matrix C ");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
我得到的输出:
4行2列的矩阵加法 将输入数据从主机内存复制到 CUDA 设备 CUDA 内核启动时有 4 个块,每块 2 个线程 将输出数据从 CUDA 设备复制到主机内存 A为0.000000,B为0.364784,C为0.364784 A为0.000000,B为0.952230,C为0.952230 A为0.000000,B为0.000000,C为0.000000 A为0.000000,B为0.000000,C为0.000000 A为0.840188,B为0.394383,C为1.234571 A为0.783099,B为0.798440,C为1.581539 A为0.911647,B为0.197551,C为1.109199 A为0.335223,B为0.768230,C为1.103452
矩阵A
0.840188 0.783099 0.911647 0.335223 0.277775 0.477397 0.364784 0.952230
矩阵 B
0.394383 0.798440 0.197551 0.768230 0.553970 0.628871 0.000000 0.000000
矩阵C
0.0000000.000000 0.0000000.000000 0.0000000.000000 0.0000000.000000
如果您发现代码有问题,请告诉我。
谢谢
我发现了两个编码错误:
当您使用此方法 "bump" 内核中矩阵
的基指针做同样的事情dA
和dB
的基指针时,您还必须 对矩阵dC
:if ( ind < m ) { dA += ind + iby*ldda; dB += ind + iby*lddb; dC += ind + iby*lddb; // add this line
您的主机代码嵌套 for 循环没有正确索引。外循环旨在对行进行索引,并且您有
n
行,但是您允许外循环对m
行进行索引:for (int i = 0; i < m; ++i) { for (int j = 0; j < n ; j ++)
那么当你在这里进行实际的索引计算时:
h_A[i*m+j] = rand()/(float)RAND_MAX;
您的索引超出范围。 (
i*m
超出了矩阵大小,对于i
的某些值)这个问题在主机代码中的所有嵌套 for 循环中重复出现。解决方法是反转i
、j
循环中的m
、n
边界。
以下代码修复了这些错误(加上您遗漏的变量定义的一些补充 - err
和 flag
在您当前发布的代码中未定义 - 这会产生编译错误。)它似乎 运行 正确并产生正确的结果:
$ cat t1213.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
dC += ind + iby*lddb;
if ( full )
{
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for( int j=0; j < BLK_Y && iby+j < n; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main ( void )
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < m ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
cudaError_t err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads( BLK_X, BLK_Y );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
int flag = 0;
//Error checkng
printf("Matrix C ");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
$ nvcc -o t1213 t1213.cu
$ cuda-memcheck ./t1213
========= CUDA-MEMCHECK
Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
Copy output data from the CUDA device to the host memory
A is 0.277775, B is 0.553970, C is 0.831745
A is 0.477397, B is 0.628871, C is 1.106268
A is 0.364784, B is 0.513401, C is 0.878185
A is 0.952230, B is 0.916195, C is 1.868425
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
Matrix A 0.840188 0.783099 0.911647 0.335223
0.277775 0.477397 0.364784 0.952230
Matrix B 0.394383 0.798440 0.197551 0.768230
0.553970 0.628871 0.513401 0.916195
Matrix C 1.2345711.5815391.1091991.103452
0.8317451.1062680.8781851.868425
Test PASSED
Done
========= ERROR SUMMARY: 0 errors
$