Why is the @cuda.jit Python program faster than its CUDA C equivalent?
I ran C and Python code that adds two arrays on the GPU, and I found that the Python code is about 100 times faster than the C code.
Here is my code:
@cuda.jit Python
import sys
import time
import numpy as np
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add(a, b, c):
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    tx = cuda.threadIdx.x
    i = tx + bx * bw
    if i > c.size:
        return
    c[i] = a[i] + b[i]

def main(num):
    device = cuda.get_current_device()
    #num = 100
    #Host memory
    a = np.full(num, 1.0, dtype=np.float32)
    b = np.full(num, 1.0, dtype=np.float32)
    #create device memory
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)
    #tpb = device.WARP_SIZE
    tpb = 1024
    bpg = int(np.ceil(float(num)/tpb))
    print 'Blocks per grid:', bpg
    print 'Threads per block', tpb
    #launch kernel
    st = time.time()
    cu_add[bpg, tpb](d_a, d_b, d_c)
    et = time.time()
    print "Time taken ", (et - st), " seconds"
    c = d_c.copy_to_host()
    for i in xrange(1000):
        if c[i] != 2.0:
            raise Exception
    #print c

if __name__ == "__main__":
    main(int(sys.argv[1]))
Run: python numba_vec_add_float.py 697932185
Output:
Blocks per grid: 681575
Threads per block 1024
Time taken  0.000330924987793  seconds
CUDA C
#define MEMSIZE (2.6L * 1024L * 1024L * 1024L)
#include <stdio.h>

__global__ void add(float *a, float *b, float *c, unsigned long long num) {
    unsigned long long idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (idx < num) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    cudaEvent_t start, stop;
    cudaError_t err;
    float *a, *b, *d_a, *c, *d_b, *d_c;
    unsigned long long num = MEMSIZE / 4;
    float elapsedTime;

    err = cudaMalloc((void **)&d_a, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_a\n");
        exit(0);
    }
    err = cudaMalloc((void **)&d_b, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_b\n");
        exit(0);
    }
    err = cudaMalloc((void **)&d_c, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_c\n");
        exit(0);
    }

    a = (float *)malloc(MEMSIZE);
    if (a == NULL) {
        printf("Failed to allocate memory to a");
        exit(0);
    }
    b = (float *)malloc(MEMSIZE);
    if (b == NULL) {
        printf("Failed to allocate memory to b");
        exit(0);
    }
    c = (float *)malloc(MEMSIZE);
    if (c == NULL) {
        printf("Failed to allocate memory to c");
        exit(0);
    }

    for (unsigned long long i = 0; i < num; i++) {
        float v = i / 1000.0;
        a[i] = v;
        b[i] = v;
    }

    err = cudaMemcpy(d_a, a, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(0);
    }
    err = cudaMemcpy(d_b, b, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(0);
    }

    int thr = 1024;
    long int bloc = (num / thr) + 1;
    printf("Blocks per grid: %ld", bloc);
    printf("\nThreads per bloc: %d", thr);

    cudaEventCreate(&start);
    cudaEventRecord(start, 0);
    add<<<bloc, thr>>>(d_a, d_b, d_c, num);
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess) {
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
        exit(0);
    }
    if (errAsync != cudaSuccess) {
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
        exit(0);
    }
    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("\nGPu time --> %f milliseconds\n", elapsedTime);
    printf("Gpus time --> %f seconds\n", elapsedTime / 1000);

    err = cudaMemcpy(c, d_c, MEMSIZE, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("failed to copy memory from Device to host\n");
        exit(0);
    }

    free(a); free(b); free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
Compile: nvcc --gpu-architecture=compute_61 nvidia_vector_addition.cu
Run: ./a.out
Output:
Blocks per grid: 681575
Threads per bloc: 1024
GPu time --> 34.359295 milliseconds
Gpus time --> 0.034359 seconds
So the @cuda.jit Python version appears to be about 103 times faster than the CUDA C version. Can anyone explain whether what I am doing is right or wrong?
In the numba case, you are only measuring kernel launch overhead, not the full time it takes to run the kernel. Kernel launches are asynchronous: the host call returns as soon as the launch has been queued, so your time.time() calls stop well before the kernel has finished. In the CUDA C case, you synchronize with the device before recording the stop event, so you are measuring the full time it takes to run the kernel.
To make the numba case perform a measurement comparable to the CUDA C case, try this modification:
#launch kernel
mystream = cuda.stream()
st = time.time()
cu_add[bpg, tpb, mystream](d_a, d_b, d_c)
mystream.synchronize()
et = time.time()
(from here).
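As a minimal sketch of two alternatives (not from the linked answer, and assuming the cu_add kernel, the device arrays and the bpg/tpb values from the question are already defined): an explicit cuda.synchronize() after the launch, or numba's CUDA events, should give a measurement comparable to the CUDA C version.

# Sketch: time the kernel only after the device has actually finished.
from numba import cuda
import time

# Option 1: host timer around an explicit device synchronization.
st = time.time()
cu_add[bpg, tpb](d_a, d_b, d_c)
cuda.synchronize()            # wait until the kernel has completed
et = time.time()
print("Wall-clock kernel time: %f seconds" % (et - st))

# Option 2: CUDA events, analogous to cudaEventRecord/cudaEventElapsedTime
# in the CUDA C version.
start_evt = cuda.event(timing=True)
stop_evt = cuda.event(timing=True)
start_evt.record()
cu_add[bpg, tpb](d_a, d_b, d_c)
stop_evt.record()
stop_evt.synchronize()
print("Event kernel time: %f milliseconds" % cuda.event_elapsed_time(start_evt, stop_evt))

With either measurement the Python and CUDA C versions should report times of the same order of magnitude, since both then include the full kernel execution rather than just the launch.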