Some CUDA computations fail with larger block dimension (< 1024)
I am learning CUDA on a GTX 960 4GB. I wrote a program that performs an element-wise matrix multiplication. When I increase the block dimensions in x and y to, say, (32 x 32) in combination with a large matrix (say 15000 x 15000 elements), some but not all of the multiplication results are wrong (the value is 0 instead of 6).
When I then reduce the block dimensions to e.g. (8 x 8), all results are correct again. When I reduce the matrix size, the results are also correct again.
So in this example there appear to be combinations of total thread count and threads per block that simply do not work.
I am surprised that I could not find any discussion of this. Everything I could find is about improving performance and occupancy, not about some but not all computations being dropped.
The grid dimensions are computed as follows:
dim3 blocks(ceil<int>(COLS / threads.x), ceil<int>(ROWS / threads.y));
Why do some multiplications fail while others succeed?
Some examples
Block dim : (8, 8)
Matrix shape : (15000, 15000)
Verification : 0 elements have failed, total length 225000000, shape: (15000, 15000)
Block dim : (16, 16)
Matrix shape : (15000, 15000)
Verification : 239936 elements have failed, total length 225000000, shape: (15000, 15000)
Block dim : (32, 32)
Matrix shape : (15000, 15000)
Verification : 719424 elements have failed, total length 225000000, shape: (15000, 15000).
Block dim : (32, 32)
Matrix shape : (10000, 10000)
Verification : 0 elements have failed, total length 100000000, shape: (10000, 10000).
Driver version
$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.82.00 Thu Oct 14 10:24:40 UTC 2021
Full code
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define ROWS 10000
#define COLS 10000
#define MAX_ERR 1e-6

typedef struct {
    int width;
    int height;
    float* elements;
} Matrix;

size_t ij(int i, int j){
    return j * ROWS + i;
}

__global__ void matrix_multi_elemwise(const Matrix OUT, const Matrix A, const Matrix B) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < A.width && row < A.height) {
        int index = row * A.height + col; // linearisation of index
        OUT.elements[index] = A.elements[index] * B.elements[index];
    }
}

int main(){
    Matrix A, B, OUT;
    Matrix dev_A, dev_B, dev_OUT;
    size_t SIZE = ROWS * COLS * sizeof(float);

    // Allocate host memory
    A.elements = (float*) malloc(SIZE);
    B.elements = (float*) malloc(SIZE);
    OUT.elements = (float*) malloc(SIZE);

    // Initialize host matrices
    A.height = ROWS; A.width = COLS;
    B.height = ROWS; B.width = COLS;
    OUT.height = ROWS; OUT.width = COLS;
    for (int j = 0; j < ROWS; j++) {
        for(int i = 0; i < COLS; i++){
            A.elements[ij(i, j)] = 2.0f;
            B.elements[ij(i, j)] = 3.0f;
        }
    }

    // Allocate device memory
    cudaMalloc((void**) &dev_A.elements, SIZE);
    cudaMalloc((void**) &dev_B.elements, SIZE);
    cudaMalloc((void**) &dev_OUT.elements, SIZE);
    dev_A.height = A.height; dev_A.width = A.width;
    dev_B.height = A.height; dev_B.width = B.width;
    dev_OUT.height = A.height; dev_OUT.width = OUT.width;

    // Transfer data from host to device memory
    cudaMemcpy(dev_A.elements, A.elements, SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B.elements, B.elements, SIZE, cudaMemcpyHostToDevice);

    // Executing kernel
    dim3 threads(16, 16);
    dim3 blocks(ceil<int>(COLS / threads.x), ceil<int>(ROWS / threads.y));
    matrix_multi_elemwise<<<blocks, threads>>>(dev_OUT, dev_A, dev_B);
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess) {
        printf("CUDA Runtime API Error reported : %s in file %s on line.\n", cudaGetErrorString(err), __FILE__);
    }

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Transfer data back to host memory
    cudaMemcpy(OUT.elements, dev_OUT.elements, SIZE, cudaMemcpyDeviceToHost);

    // Verification
    int count = 0, length = 0, i = 0, j = 0;
    for (j = 0; j < ROWS; j++) {
        for(i = 0; i < COLS; i++){
            //assert(fabs(OUT.elements[ij(i, j)] / A.elements[ij(i, j)] - B.elements[ij(i, j)]) < MAX_ERR);
            if (fabs(OUT.elements[ij(i, j)] / A.elements[ij(i, j)] - B.elements[ij(i, j)]) > MAX_ERR) {
                count++;
            }
            length++;
        }
    }
    printf("Verification: %i elements have failed, total length %i, shape: (%i, %i).\n", count, length, i, j);

    // Deallocate device memory
    cudaFree(dev_A.elements);
    cudaFree(dev_B.elements);
    cudaFree(dev_OUT.elements);

    // Deallocate host memory
    free(A.elements);
    free(B.elements);
    free(OUT.elements);
}
The number of blocks is wrong. Indeed, COLS and threads.x are both integers, so COLS / threads.x is a truncating integer division, and ceil<int> cannot undo the truncation because it has already happened before ceil is called. As a result, some blocks are never launched and the corresponding output elements are never computed: 15000 is divisible by 8 but not by 16 (nor by 32). You need to either convert COLS to a floating-point value before dividing, or compute the ceiling manually with integer arithmetic (which is safer). Here is an example:
dim3 blocks((COLS + threads.x - 1) / threads.x, (ROWS + threads.y - 1) / threads.y);
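To check that the truncation alone accounts for the numbers reported in the question, here is a small host-only C sketch (not part of the original program) that counts how many elements of a 15000 x 15000 matrix fall outside a grid whose size was obtained by truncating integer division. It prints 0, 239936 and 719424 uncovered elements for block widths 8, 16 and 32, matching the failure counts in the verification output above.

#include <stdio.h>

/* Illustrative sketch: count the elements that are never touched when the
   number of blocks is computed with truncating integer division. */
int main(void) {
    const int N = 15000;                      /* matrix is N x N */
    const int block_dims[] = {8, 16, 32};
    for (int k = 0; k < 3; k++) {
        int bd = block_dims[k];
        int blocks = N / bd;                  /* truncated; ceil cannot repair this afterwards */
        int covered = blocks * bd;            /* rows/columns actually covered by the grid */
        long long missed = (long long)N * N - (long long)covered * covered;
        printf("block dim %2d: %d blocks cover %d of %d rows/cols, %lld elements skipped\n",
               bd, blocks, covered, N, missed);
    }
    return 0;
}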
As pointed out in the comments, also note that row * A.height + col is wrong: it should be row * A.width + col. This causes problems for non-square matrices.
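For completeness, here is a minimal sketch of the kernel and launch with both fixes applied (the rounded-up grid size and row * A.width + col indexing). It reuses the Matrix struct and host variables from the listing above and is meant as an illustration, not a drop-in replacement:

__global__ void matrix_multi_elemwise(const Matrix OUT, const Matrix A, const Matrix B) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < A.width && row < A.height) {
        int index = row * A.width + col;   // stride over a row is the width, not the height
        OUT.elements[index] = A.elements[index] * B.elements[index];
    }
}

// Launch with the grid rounded up so that partial blocks at the edges are not dropped.
dim3 threads(16, 16);
dim3 blocks((COLS + threads.x - 1) / threads.x, (ROWS + threads.y - 1) / threads.y);
matrix_multi_elemwise<<<blocks, threads>>>(dev_OUT, dev_A, dev_B);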