Why is the @cuda.jit Python program faster than its CUDA C equivalent?
I ran C and Python code that adds two arrays on the GPU, and I found that the Python code is about 100 times faster than the C code.
Here is my code:
@cuda.jit Python
import sys
import time
import numpy as np
from numba import cuda

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add(a, b, c):
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    tx = cuda.threadIdx.x
    i = tx + bx * bw
    if i > c.size:
        return
    c[i] = a[i] + b[i]

def main(num):
    device = cuda.get_current_device()
    #num = 100
    #Host memory
    a = np.full(num, 1.0, dtype=np.float32)
    b = np.full(num, 1.0, dtype=np.float32)
    #create device memory
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.device_array_like(a)
    #tpb = device.WARP_SIZE
    tpb = 1024
    bpg = int(np.ceil(float(num)/tpb))
    print 'Blocks per grid:', bpg
    print 'Threads per block', tpb
    #launch kernel
    st = time.time()
    cu_add[bpg, tpb](d_a, d_b, d_c)
    et = time.time()
    print "Time taken ", (et - st), " seconds"
    c = d_c.copy_to_host()
    for i in xrange(1000):
        if c[i] != 2.0:
            raise Exception
    #print c

if __name__ == "__main__":
    main(int(sys.argv[1]))
Run: python numba_vec_add_float.py 697932185
Output:
Blocks per grid: 681575
Threads per block 1024
Time taken  0.000330924987793  seconds
CUDA C
#define MEMSIZE (2.6L * 1024L * 1024L * 1024L)
#include <stdio.h>

__global__ void add(float *a, float *b, float *c, unsigned long long num) {
    unsigned long long idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (idx < num) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    cudaEvent_t start, stop;
    cudaError_t err;
    float *a, *b, *d_a, *c, *d_b, *d_c;
    unsigned long long num = MEMSIZE / 4;
    float elapsedTime;

    err = cudaMalloc((void **)&d_a, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_a\n");
        exit(0);
    }
    err = cudaMalloc((void **)&d_b, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_b\n");
        exit(0);
    }
    err = cudaMalloc((void **)&d_c, MEMSIZE);
    if (err != cudaSuccess) {
        printf("failed to allocate memory to d_c\n");
        exit(0);
    }

    a = (float *)malloc(MEMSIZE);
    if (a == NULL) {
        printf("Failed to allocate memory to a");
        exit(0);
    }
    b = (float *)malloc(MEMSIZE);
    if (b == NULL) {
        printf("Failed to allocate memory to b");
        exit(0);
    }
    c = (float *)malloc(MEMSIZE);
    if (c == NULL) {
        printf("Failed to allocate memory to c");
        exit(0);
    }

    for (unsigned long long i = 0; i < num; i++) {
        float v = i / 1000.0;
        a[i] = v;
        b[i] = v;
    }

    err = cudaMemcpy(d_a, a, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(0);
    }
    err = cudaMemcpy(d_b, b, MEMSIZE, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("failed to copy memory from host to device\n");
        exit(0);
    }

    int thr = 1024;
    long int bloc = (num / thr) + 1;
    printf("Blocks per grid: %ld", bloc);
    printf("\nThreads per bloc: %d", thr);

    cudaEventCreate(&start);
    cudaEventRecord(start, 0);
    add<<<bloc, thr>>>(d_a, d_b, d_c, num);
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess) {
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
        exit(0);
    }
    if (errAsync != cudaSuccess) {
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
        exit(0);
    }
    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("\nGPu time --> %f milliseconds\n", elapsedTime);
    printf("Gpus time --> %f seconds\n", elapsedTime / 1000);

    err = cudaMemcpy(c, d_c, MEMSIZE, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("failed to copy memory from Device to host\n");
        exit(0);
    }

    free(a); free(b); free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
Compile: nvcc --gpu-architecture=compute_61 nvidia_vector_addition.cu
Run: ./a.out
Output:
Blocks per grid: 681575
Threads per bloc: 1024
GPu time --> 34.359295 milliseconds
Gpus time --> 0.034359 seconds
So the @cuda.jit Python version appears to be about 103 times faster than the CUDA C version. Can anyone explain whether what I am doing is right or wrong?
In the numba case, you are only measuring kernel launch overhead, not the full time it takes to run the kernel. Kernel launches are asynchronous: the host call returns as soon as the launch has been queued, so your time.time() calls stop well before the kernel has finished. In the CUDA C case, you synchronize with the device before recording the stop event, so you are measuring the full time it takes to run the kernel.
To make the numba case perform a measurement comparable to the CUDA C case, try this modification:
#launch kernel
mystream = cuda.stream()
st = time.time()
cu_add[bpg, tpb, mystream](d_a, d_b, d_c)
mystream.synchronize()
et = time.time()
(from here).
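As a minimal sketch of two alternatives (not from the linked answer, and assuming the cu_add kernel, the device arrays and the bpg/tpb values from the question are already defined): an explicit cuda.synchronize() after the launch, or numba's CUDA events, should give a measurement comparable to the CUDA C version.

# Sketch: time the kernel only after the device has actually finished.
from numba import cuda
import time

# Option 1: host timer around an explicit device synchronization.
st = time.time()
cu_add[bpg, tpb](d_a, d_b, d_c)
cuda.synchronize()            # wait until the kernel has completed
et = time.time()
print("Wall-clock kernel time: %f seconds" % (et - st))

# Option 2: CUDA events, analogous to cudaEventRecord/cudaEventElapsedTime
# in the CUDA C version.
start_evt = cuda.event(timing=True)
stop_evt = cuda.event(timing=True)
start_evt.record()
cu_add[bpg, tpb](d_a, d_b, d_c)
stop_evt.record()
stop_evt.synchronize()
print("Event kernel time: %f milliseconds" % cuda.event_elapsed_time(start_evt, stop_evt))

With either measurement the Python and CUDA C versions should report times of the same order of magnitude, since both then include the full kernel execution rather than just the launch.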