使用 numba cuda 减少复数

Complex number reduction with numba cuda

我正在尝试使用 cuda\numba 加速 python 代码。该代码适用于复数、浮点数和整数的大型数组。我在这里包括了 python 版本和 numba-cuda 版本。 numba-cuda 版本无法编译。

我已经尝试将复数计算作为单独的实数和虚数来执行,尽管复数格式可能是问题所在。

def random_choice_noreplace(m,n, axis=-1):
 # m, n are the number of rows, cols of output
 return np.random.rand(m,n).argsort(axis=axis)

@cuda.jit
def cuda_kernel (d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp):

 row, col = cuda.grid(2)
 if row < d_npart and col < d_npts :
 d_tmp[row, col] = d_data[d_data_index[row, col]]
 d_tmp[row, col] =d_tmp[row, col] * d_coef[row, col]
 # All threads get to this point ===============================
 cuda.syncthreads()
 if row == 0 and col ==0 :
 d_datasum = np.sum(d_tmp, axis=0)

def calculate_cuda (data, data_index, coef):

 npart, npts = data_index.shape
 # arrays to copy to GPU memory =====================================
 d_npart = cuda.to_device(npart)
 d_npts = cuda.to_device(npts)
 d_data = cuda.to_device(data)
 d_data_index = cuda.to_device(data_index)
 d_coef = cuda.to_device(coef)

 d_datasum = cuda.device_array(npts, np.complex64)
 d_tmp = cuda.device_array((npart,npts), np.complex64)

 threadsperblock = (16, 16)
 blockspergrid_x = int(math.ceil(npts / threadsperblock[0]))+1
 blockspergrid_y = int(math.ceil(npart / threadsperblock[1]))+1
 blockspergrid = (blockspergrid_x, blockspergrid_y)
 cuda_kernel[blockspergrid, threadsperblock](d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp)
 # Copy data from GPU to CPU ========================================
 final_data_sum = d_datasum.copy_to_host()
 return final_data_sum



def calculate_python (data, data_index, coef):
 npart, npts = data_index.shape
 data_sum = np.zeros(npts, dtype=np.complex64)
 tmp = np.zeros(npts, dtype=np.complex64)
 print(" Calling python function...")
 start_time = time.time()
 for i in range(npart):
 tmp[:] = data[data_index[i]]
 data_sum += tmp * coef[i]
 return data_sum

if __name__ == '__main__':

 data_size = 1200
 rows = 31
 cols = 1000

 rand_float1 = np.random.randn(data_size)
 rand_float2 = np.random.randn(data_size)

 data = rand_float1 + 1j * rand_float2
 coef = np.random.randn(rows, cols)
 data_index = random_choice_noreplace(rows, cols)

 start_time = time.time()
 gpu_data_sum_python = calculate_python (data, data_index, coef)
 python_time = time.time() - start_time #print("gpu c : ", c_gpu)
 print("---- %s seconds for python ----:" % (python_time))


 start_time = time.time()
 gpu_data_sum = calculate_cuda (data, data_index, coef)
 gpu_time = time.time() - start_time
 print("---- %s seconds for gpu ----:" % (gpu_time))

当我 运行 代码时,我得到这个错误:

    Calling python function...
---- 0.000344038009644 seconds for python ----:
Traceback (most recent call last):
  File "GPU_Fake_PA_partial.py", line 82, in <module>
    gpu_data_sum = calculate_cuda (data, data_index, coef)
  File "GPU_Fake_PA_partial.py", line 44, in calculate_cuda
    cuda_kernel[blockspergrid, threadsperblock](d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 765, in __call__
    kernel = self.specialize(*args)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 776, in specialize
    kernel = self.compile(argtypes)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 792, in compile
    **self.targetoptions)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
    return func(*args, **kwargs)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 62, in compile_kernel
    cres = compile_cuda(pyfunc, types.void, args, debug=debug, inline=inline)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
    return func(*args, **kwargs)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 51, in compile_cuda
    locals={})
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 926, in compile_extra
    return pipeline.compile_extra(func)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 374, in compile_extra
    return self._compile_bytecode()
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 857, in _compile_bytecode
    return self._compile_core()
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 844, in _compile_core
    res = pm.run(self.status)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
    return func(*args, **kwargs)
  File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 255, in run
    raise patched_exception
numba.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
Known signatures:
 * (bool, bool) -> bool
 * (int8, int8) -> bool
 * (int16, int16) -> bool
 * (int32, int32) -> bool
 * (int64, int64) -> bool
 * (uint8, uint8) -> bool
 * (uint16, uint16) -> bool
 * (uint32, uint32) -> bool
 * (uint64, uint64) -> bool
 * (float32, float32) -> bool
 * (float64, float64) -> bool
 * parameterized
In definition 0:
    All templates rejected with literals.
In definition 1:
    All templates rejected without literals.
In definition 2:
    All templates rejected with literals.
In definition 3:
    All templates rejected without literals.
In definition 4:
    All templates rejected with literals.
In definition 5:
    All templates rejected without literals.
In definition 6:
    All templates rejected with literals.
In definition 7:
    All templates rejected without literals.
This error is usually caused by passing an argument of a type that is unsupported by the named function.
[1] During: typing of intrinsic-call at GPU_Fake_PA_partial.py (15)


File "GPU_Fake_PA_partial.py", line 15:
def cuda_kernel (d_npart, d_npts,  d_data, d_data_index, d_coef, d_datasum, d_tmp):
    <source elided>
    row, col = cuda.grid(2)
    if row < d_npart and col < d_npts :

非常感谢任何帮助。

您的代码存在各种问题。我可能不会涵盖所有这些,所以请将我的代码与您的代码进行比较。

  1. numpy 数组方法(如 np.sum()cannot be used in numba CUDA kernels.

  2. 传递给 numba cuda 内核的标量(如 npartnpts)不需要也不应该像 .to_device() 那样进行数组处理。只需按原样使用它们。这就是您显示 python 错误的原因:

    Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
    
  3. 你的内核太复杂了。您基本上是在执行根据索引模式排列的矩阵的列总和,乘以系数数组。我们可以在每个线程中使用一个循环来执行此操作。

  4. 对于上述实现,我们不需要二维的线程网格。

  5. 您发布的代码中存在缩进问题。

出于演示目的,我已将您的数据集大小从 1000 列减少到 15 列。这是解决上述项目的示例:

$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math

def random_choice_noreplace(m,n, axis=-1):
 # m, n are the number of rows, cols of output
 return np.random.rand(m,n).argsort(axis=axis)

@cuda.jit
def cuda_kernel (npart, npts, d_data, d_data_index, d_coef, d_datasum):
 col = cuda.grid(1)
 if col < npts:
     temp = 0
     for i in range(npart):
         temp += d_data[d_data_index[i, col]] * d_coef[i, col]
     d_datasum[col] = temp


def calculate_cuda (data, data_index, coef):

 npart, npts = data_index.shape
 # arrays to copy to GPU memory =====================================
 d_data = cuda.to_device(data)
 d_data_imag = cuda.to_device(data_imag)
 d_data_index = cuda.to_device(data_index)
 d_coef = cuda.to_device(coef)

 d_datasum = cuda.device_array(npts, np.complex64)

 threadsperblock = 64
 blockspergrid = int(math.ceil(npts / threadsperblock))+1
 cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
 # Copy data from GPU to CPU ========================================
 final_data_sum = d_datasum.copy_to_host()
 return final_data_sum



def calculate_python (data, data_index, coef):
 npart, npts = data_index.shape
 data_sum = np.zeros(npts, dtype=np.complex64)
 tmp = np.zeros(npts, dtype=np.complex64)
 print(" Calling python function...")
 for i in range(npart):
  tmp[:] = data[data_index[i]]
  data_sum += tmp * coef[i]
 return data_sum

if __name__ == '__main__':

 rows = 31
 cols = 15
 data_size = rows * cols

 data_real = np.random.randn(data_size).astype(np.float32)
 data_imag = np.random.randn(data_size).astype(np.float32)

 data = data_real + 1j * data_imag
 coef = np.random.randn(rows, cols)
 data_index = random_choice_noreplace(rows, cols)

 start_time = time.time()
 gpu_data_sum_python = calculate_python (data, data_index, coef)
 python_time = time.time() - start_time #print("gpu c : ", c_gpu)
 print("---- %s seconds for python ----:" % (python_time))
 print(gpu_data_sum_python)

 start_time = time.time()
 gpu_data_sum = calculate_cuda (data, data_index, coef)
 gpu_time = time.time() - start_time
 print("---- %s seconds for gpu ----:" % (gpu_time))
 print(gpu_data_sum)
$ python t31.py
 Calling python function...
---- 0.000281095504761 seconds for python ----:
[-1.10292518+0.90700358j  2.67771578+2.47935939j -5.22553015-2.22675705j
 -3.55810285+2.39755774j  4.11441088-3.89396238j -2.70894790-0.75690132j
  3.24859619+0.65993834j  1.05531025+2.3959775j  -4.27368307+1.6297332j
  0.17896785-7.0437355j  -6.31506491+6.22674656j -1.85534143-6.08459902j
  0.40037563+6.33309507j -1.71916604-0.55055946j  0.49263301+1.08690035j]
---- 0.593510866165 seconds for gpu ----:
[-1.10292506+0.9070037j   2.67771506+2.47935939j -5.22553062-2.22675681j
 -3.55810285+2.39755821j  4.11441135-3.89396238j -2.70894790-0.75690138j
  3.24859619+0.65993822j  1.05531013+2.39597774j -4.27368307+1.62973344j
  0.17896791-7.0437355j  -6.31506491+6.22674656j -1.85534155-6.08459902j
  0.40037528+6.33309603j -1.71916604-0.55055946j  0.49263287+1.08690035j]
$

请注意,主机和设备计算结果在某些情况下从小数点后 6 位开始在数值上略有不同。我将此归因于主机和设备代码之间可能的计算顺序差异,以及 float32(或 complex64)numpy 数据类型的限制。

由于您的代码中内置了计时功能,因此您可能对性能感兴趣。对于 numba python,我推荐一个典型的基准测试实践,即不测量第一个 运行,而是测量第二个 运行。这避免了一次性开销进入测量。此外,我们希望选择比 15 列大得多的数据集大小,以便为 GPU 提供足够大的工作量来分摊各种成本。通过这些修改,这里有一个基准显示此代码中的 GPU 版本可以比此代码中的 CPU 版本更快:

$ cat t31.py
import numba as nb
import numpy as np
from numba import cuda
import time
import math

def random_choice_noreplace(m,n, axis=-1):
 # m, n are the number of rows, cols of output
 return np.random.rand(m,n).argsort(axis=axis)

@cuda.jit
def cuda_kernel (npart, npts, d_data, d_data_index, d_coef, d_datasum):
 col = cuda.grid(1)
 if col < npts:
     temp = 0
     for i in range(npart):
         temp += d_data[d_data_index[i, col]] * d_coef[i, col]
     d_datasum[col] = temp


def calculate_cuda (data, data_index, coef):

 npart, npts = data_index.shape
 # arrays to copy to GPU memory =====================================
 d_data = cuda.to_device(data)
 d_data_imag = cuda.to_device(data_imag)
 d_data_index = cuda.to_device(data_index)
 d_coef = cuda.to_device(coef)

 d_datasum = cuda.device_array(npts, np.complex64)

 threadsperblock = 64
 blockspergrid = int(math.ceil(npts / threadsperblock))+1
 cuda_kernel[blockspergrid, threadsperblock](npart, npts, d_data, d_data_index, d_coef, d_datasum)
 # Copy data from GPU to CPU ========================================
 final_data_sum = d_datasum.copy_to_host()
 return final_data_sum



def calculate_python (data, data_index, coef):
 npart, npts = data_index.shape
 data_sum = np.zeros(npts, dtype=np.complex64)
 tmp = np.zeros(npts, dtype=np.complex64)
 print(" Calling python function...")
 for i in range(npart):
  tmp[:] = data[data_index[i]]
  data_sum += tmp * coef[i]
 return data_sum

if __name__ == '__main__':

 rows = 31
 cols = 1000000
 data_size = rows * cols

 data_real = np.random.randn(data_size).astype(np.float32)
 data_imag = np.random.randn(data_size).astype(np.float32)

 data = data_real + 1j * data_imag
 coef = np.random.randn(rows, cols)
 data_index = random_choice_noreplace(rows, cols)

 gpu_data_sum_python = calculate_python (data, data_index, coef)
 start_time = time.time()
 gpu_data_sum_python = calculate_python (data, data_index, coef)
 python_time = time.time() - start_time #print("gpu c : ", c_gpu)
 print("---- %s seconds for python ----:" % (python_time))
 print(gpu_data_sum_python)

 gpu_data_sum = calculate_cuda (data, data_index, coef)
 start_time = time.time()
 gpu_data_sum = calculate_cuda (data, data_index, coef)
 gpu_time = time.time() - start_time
 print("---- %s seconds for gpu ----:" % (gpu_time))
 print(gpu_data_sum)
$ python t31.py
 Calling python function...
 Calling python function...
---- 0.806931018829 seconds for python ----:
[  6.56164026-7.95271683j  -7.42586899+3.68758106j   3.48999476+3.10376763j
 ...,  13.12746525+4.61855698j   0.08796659+0.9710322j
  -6.54224586+4.89485168j]
---- 0.417661905289 seconds for gpu ----:
[  6.56164074-7.95271683j  -7.42586851+3.68758035j   3.48999500+3.10376763j
 ...,  13.12746525+4.61855745j   0.08796643+0.97103256j
  -6.54224634+4.89485121j]
$

通过这些修改,GPU 代码似乎比 CPU 代码快大约 2 倍。

这是在 CUDA 9.2、Fedora 27、Quadro K2000(相对较小、速度较慢的 GPU)上测得的。我不会过多地了解这种比较,因为 CPU 代码几乎肯定也不是最佳的,而且每个输出数据点的工作量仍然相对较小,GPU 加速很有趣。