cuBlas 的不同结果
Varying results from cuBlas
我已经实现了以下 CUDA 代码,但我对行为有点困惑。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
void PrintMatrix(float* a, int n)
{
int j, i;
for (j = 1; j <= n; j++)
{
for (i = 1; i <= n; i++)
{
printf("%7.0f", a[IDX2F(i, j, n)]);
}
printf("\n");
}
}
float* CreateMatrix(int n)
{
float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
if (!matrix)
{
printf("host memory allocation failed");
return nullptr;
}
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
matrix[IDX2F(i, j, n)] = 2;
}
}
return matrix;
}
long CudaMatrixMultiply(float* matrix, int n)
{
cudaError_t cudaStat;
cublasStatus_t status;
cublasHandle_t handle;
float* deviceMatrix;
cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
if (cudaStat != cudaSuccess)
{
printf("device memory allocation failed");
return EXIT_FAILURE;
}
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data download failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
float alpha = 1;
float beta = 0;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);
status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data upload failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_SUCCESS;
}
float* CpuMatrixMultiply(float* matrix, int size)
{
float* result = new float[size * size]();
// Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
for (int row = 1; row <= size; row++)
{
for (int col = 1; col <= size; col++)
{
// Multiply the row of A by the column of B to get the row, column of product.
for (int inner = 1; inner <= size; inner++)
{
// result[row][col] += matrix[row][inner] * matrix[inner][col];
result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
}
}
}
free(matrix);
return result;
}
int main(void)
{
// printf("Matrix * Matrix Test\n");
int size = 1000;
int runs = 10;
for (int run = 0; run != runs; run++)
{
printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);
float* cpuMatrix = CreateMatrix(size);
cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);
PrintMatrix(cpuMatrix, 5);
float* gpuMatrix = CreateMatrix(size);
CudaMatrixMultiply(gpuMatrix, size);
PrintMatrix(gpuMatrix, 5);
free(cpuMatrix);
free(gpuMatrix);
}
getchar();
return EXIT_SUCCESS;
}
CPU 版本的 MatrixMultiplication 的输出如下所示:
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
但 GPU 计算的结果有时是正确的(见上文)或错误的随机(?)结果。当循环第一次执行时,结果总是正确的。
我找不到我的代码中的错误,如果你能帮助我就太好了。
此外,如果我将 size
(int main 方法)设置为例如16000
然后我的 driver 崩溃了,我收到一条错误消息。为此,我已经向 NVidea 写了一份错误报告,因为我的电脑崩溃了两次。但也许这是我的编程错误?
Driver: 364.72 (最新)
SDK:CUDA 工具包 7.5
显卡:NVidia GeForce GTX 960 (4GB)
Windows10个64位
Driver错误
Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.
编辑:在社区的帮助下,我发现这是看门狗定时器的问题。请参阅下面的答案。
关于问题的第二部分,根据njuffa的评论,您可以更改驱动程序行为的设置以避免增加大小时的错误。打开 NSIGHT Monitor 并在选项、常规、Microsoft 显示驱动程序中,将 False WDDM TDR enabled 字段更改为 False。
从 spec 开始,单精度的 32 位 FPU 触发器应该约为 2.4 TFLOPS,因此您对 16000 大小的矩阵的操作至少需要 3.5 秒。因此驱动程序在 2 秒后恢复。
我已经实现了以下 CUDA 代码,但我对行为有点困惑。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
void PrintMatrix(float* a, int n)
{
int j, i;
for (j = 1; j <= n; j++)
{
for (i = 1; i <= n; i++)
{
printf("%7.0f", a[IDX2F(i, j, n)]);
}
printf("\n");
}
}
float* CreateMatrix(int n)
{
float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
if (!matrix)
{
printf("host memory allocation failed");
return nullptr;
}
for (int j = 1; j <= n; j++)
{
for (int i = 1; i <= n; i++)
{
matrix[IDX2F(i, j, n)] = 2;
}
}
return matrix;
}
long CudaMatrixMultiply(float* matrix, int n)
{
cudaError_t cudaStat;
cublasStatus_t status;
cublasHandle_t handle;
float* deviceMatrix;
cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
if (cudaStat != cudaSuccess)
{
printf("device memory allocation failed");
return EXIT_FAILURE;
}
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data download failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
float alpha = 1;
float beta = 0;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);
status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("data upload failed");
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_FAILURE;
}
cudaFree(deviceMatrix);
cublasDestroy(handle);
return EXIT_SUCCESS;
}
float* CpuMatrixMultiply(float* matrix, int size)
{
float* result = new float[size * size]();
// Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
for (int row = 1; row <= size; row++)
{
for (int col = 1; col <= size; col++)
{
// Multiply the row of A by the column of B to get the row, column of product.
for (int inner = 1; inner <= size; inner++)
{
// result[row][col] += matrix[row][inner] * matrix[inner][col];
result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
}
}
}
free(matrix);
return result;
}
int main(void)
{
// printf("Matrix * Matrix Test\n");
int size = 1000;
int runs = 10;
for (int run = 0; run != runs; run++)
{
printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);
float* cpuMatrix = CreateMatrix(size);
cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);
PrintMatrix(cpuMatrix, 5);
float* gpuMatrix = CreateMatrix(size);
CudaMatrixMultiply(gpuMatrix, size);
PrintMatrix(gpuMatrix, 5);
free(cpuMatrix);
free(gpuMatrix);
}
getchar();
return EXIT_SUCCESS;
}
CPU 版本的 MatrixMultiplication 的输出如下所示:
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
但 GPU 计算的结果有时是正确的(见上文)或错误的随机(?)结果。当循环第一次执行时,结果总是正确的。
我找不到我的代码中的错误,如果你能帮助我就太好了。
此外,如果我将 size
(int main 方法)设置为例如16000
然后我的 driver 崩溃了,我收到一条错误消息。为此,我已经向 NVidea 写了一份错误报告,因为我的电脑崩溃了两次。但也许这是我的编程错误?
Driver: 364.72 (最新)
SDK:CUDA 工具包 7.5
显卡:NVidia GeForce GTX 960 (4GB)
Windows10个64位
Driver错误
Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.
编辑:在社区的帮助下,我发现这是看门狗定时器的问题。请参阅下面的答案。
关于问题的第二部分,根据njuffa的评论,您可以更改驱动程序行为的设置以避免增加大小时的错误。打开 NSIGHT Monitor 并在选项、常规、Microsoft 显示驱动程序中,将 False WDDM TDR enabled 字段更改为 False。
从 spec 开始,单精度的 32 位 FPU 触发器应该约为 2.4 TFLOPS,因此您对 16000 大小的矩阵的操作至少需要 3.5 秒。因此驱动程序在 2 秒后恢复。