How to use shared memory and global memory, and is it possible to use shared memory as an intermediate stage in a calculation?
I am trying to write code with numba CUDA. I have seen many examples that deal with device memory and shared memory separately, and I am stuck and confused. Can a single piece of code or a single kernel work with both, for example multiplying numbers in shared memory at one scale while using device memory at another?
Another thing to ask: as I gradually build the code up, step by step, to compute a fitness function, I use shared memory as an intermediate stage, sD, halving the active threads each step and adding elements a stride s apart, following Mark Harris's reduction presentation:
Sdata[tid] += Sdata[tid+s]
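For reference, here is a minimal runnable sketch of that reduction pattern as I understand it (my own rendering, not code from the presentation; BLOCK is an assumed block size and must be a power of two here):

import numpy as np
from numba import cuda, float32

BLOCK = 32  # assumed threads per block

@cuda.jit
def block_sum(data, result):
    # Stage this block's slice of global memory in shared memory.
    sdata = cuda.shared.array(shape=BLOCK, dtype=float32)
    tid = cuda.threadIdx.x
    sdata[tid] = data[cuda.grid(1)]
    cuda.syncthreads()
    # Halve the active threads each step: Sdata[tid] += Sdata[tid+s].
    s = BLOCK // 2
    while s > 0:
        if tid < s:
            sdata[tid] += sdata[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        result[cuda.blockIdx.x] = sdata[0]

d = cuda.to_device(np.ones(BLOCK, dtype=np.float32))
r = cuda.to_device(np.zeros(1, dtype=np.float32))
block_sum[1, BLOCK](d, r)
print(r.copy_to_host())  # expect [32.]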
When I write the code below, an error appears and I don't know why.
import numpy as np
import math
from numba import cuda, float32

@cuda.jit
def fast_matmul(A, C):
    sA = cuda.shared.array(shape=(1, TPB), dtype=float32)
    sD = cuda.shared.array(shape=(1, TPB), dtype=float32)
    thread_idx_x = cuda.threadIdx.x
    thread_idx_y = cuda.threadIdx.y
    totla_No_of_threads_x = cuda.blockDim.x
    totla_No_of_threads_y = cuda.blockDim.y
    block_idx_x = cuda.blockIdx.x
    block_idx_y = cuda.blockIdx.y
    x, y = cuda.grid(2)
    if x >= A.shape[1]:  # and y >= C.shape[1]:
        return
    s = 0
    index_1 = 1
    for i in range(int(A.shape[1] / TPB)):
        sA[thread_idx_x, thread_idx_y] = A[x, thread_idx_y + i * TPB]
        cuda.syncthreads()
        if thread_idx_y <= (totla_No_of_threads_y - index_1):
            sD[thread_idx_x, thread_idx_y] = sA[thread_idx_x, (thread_idx_y + index_1)] - sA[thread_idx_x, thread_idx_y]
        cuda.syncthreads()
        for s in range(totla_No_of_threads_y // 2):
            if thread_idx_y < s:
                sD[thread_idx_x, thread_idx_y] += sD[thread_idx_x, thread_idx_y + s]
            cuda.syncthreads()
    C[x, y] = sD[x, y]

A = np.full((1, 16), 3, dtype=np.float32)
C = np.zeros((1, 16))
print('A:', A, 'C:', C)

TPB = 32
dA = cuda.to_device(A)
dC = cuda.to_device(C)
fast_matmul[(1, 1), (32, 32)](dA, dC)
res = dC.copy_to_host()
print(res)
The error shown is:
CudaAPIError Traceback (most recent call last)
<ipython-input-214-780fde9bbab5> in <module>
5 TPB = 32
6
----> 7 dA = cuda.to_device(A)
8 dC= cuda.to_device(C)
9 fast_matmul[(8, 8), (32, 32)](dA, dC)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devices.py in _require_cuda_context(*args, **kws)
222 def _require_cuda_context(*args, **kws):
223 with _runtime.ensure_context():
--> 224 return fn(*args, **kws)
225
226 return _require_cuda_context
~\Anaconda3\lib\site-packages\numba\cuda\api.py in to_device(obj, stream, copy, to)
108 """
109 if to is None:
--> 110 to, new = devicearray.auto_device(obj, stream=stream, copy=copy)
111 return to
112 if copy:
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in auto_device(obj, stream, copy)
764 subok=True)
765 sentry_contiguous(obj)
--> 766 devobj = from_array_like(obj, stream=stream)
767 if copy:
768 devobj.copy_to_device(obj, stream=stream)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in from_array_like(ary, stream, gpu_data)
686 "Create a DeviceNDArray object that is like ary."
687 return DeviceNDArray(ary.shape, ary.strides, ary.dtype,
--> 688 writeback=ary, stream=stream, gpu_data=gpu_data)
689
690
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py in __init__(self, shape, strides, dtype, stream, writeback, gpu_data)
102 self.strides,
103 self.dtype.itemsize)
--> 104 gpu_data = devices.get_context().memalloc(self.alloc_size)
105 else:
106 self.alloc_size = _driver.device_memory_size(gpu_data)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in memalloc(self, bytesize)
1099
1100 def memalloc(self, bytesize):
-> 1101 return self.memory_manager.memalloc(bytesize)
1102
1103 def memhostalloc(self, bytesize, mapped=False, portable=False, wc=False):
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in memalloc(self, size)
849 driver.cuMemAlloc(byref(ptr), size)
850
--> 851 self._attempt_allocation(allocator)
852
853 finalizer = _alloc_finalizer(self, ptr, size)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in _attempt_allocation(self, allocator)
709 """
710 try:
--> 711 allocator()
712 except CudaAPIError as e:
713 # is out-of-memory?
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in allocator()
847
848 def allocator():
--> 849 driver.cuMemAlloc(byref(ptr), size)
850
851 self._attempt_allocation(allocator)
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in safe_cuda_api_call(*args)
300 _logger.debug('call driver api: %s', libfn.__name__)
301 retcode = libfn(*args)
--> 302 self._check_error(fname, retcode)
303 return safe_cuda_api_call
304
~\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py in _check_error(self, fname, retcode)
335 _logger.critical(msg, _getpid(), self.pid)
336 raise CudaDriverError("CUDA initialized before forking")
--> 337 raise CudaAPIError(retcode, msg)
338
339 def get_device(self, devnum=0):
CudaAPIError: [700] Call to cuMemAlloc results in UNKNOWN_CUDA_ERROR
Yes, you can use both. When you copy data from host to device, it starts out in "device memory". From then on, if you want to use shared memory, you have to explicitly copy data into it from your kernel code. Likewise, when you want to return results to host code (copy data from device to host), that data must be in "device memory".
Shared memory is a smaller, scratchpad-style resource local to a thread block.
This gives a good example/comparison.
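To make that flow concrete, here is a minimal sketch of the global → shared → global pattern in numba CUDA (the kernel shift_sum and the block size BLOCK are illustrative assumptions, not your code):

import numpy as np
from numba import cuda, float32

BLOCK = 128  # illustrative threads per block

@cuda.jit
def shift_sum(inp, out):
    # Stage 1: explicitly copy from global (device) memory into shared memory.
    tmp = cuda.shared.array(shape=BLOCK, dtype=float32)
    i = cuda.grid(1)
    t = cuda.threadIdx.x
    if i < inp.size:
        tmp[t] = inp[i]
    cuda.syncthreads()  # make the tile visible to the whole block

    # Stage 2: compute using shared memory as the intermediate stage,
    # then write the result back out to global (device) memory.
    if i < out.size and t + 1 < BLOCK:
        out[i] = tmp[t] + tmp[t + 1]

a = np.arange(256, dtype=np.float32)
d_a = cuda.to_device(a)                 # host -> device memory
d_b = cuda.to_device(np.zeros_like(a))
shift_sum[2, BLOCK](d_a, d_b)           # kernel stages data through shared memory
print(d_b.copy_to_host()[:4])           # device -> host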
I don't know if this will solve your error, since it doesn't look like you are using multiprocessing. But I hit exactly the same error, 'raise CudaDriverError("CUDA initialized before forking")', and the problem was that Python multiprocessing was using "fork" rather than "spawn".
multiprocessing.set_start_method('spawn')
solved it for me. It may not help you, but it may help others who land here after searching for this numba error.
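For anyone applying that fix, a minimal sketch of where the call goes (the worker function is a placeholder, not from the question):

import multiprocessing
import numpy as np
from numba import cuda

def worker(n):
    # Each spawned process initializes its own CUDA context.
    d = cuda.to_device(np.arange(n, dtype=np.float32))
    return float(d.copy_to_host().sum())

if __name__ == '__main__':
    # Must be called before any process starts; 'spawn' avoids forking
    # a process whose CUDA context is already initialized.
    multiprocessing.set_start_method('spawn')
    with multiprocessing.Pool(2) as pool:
        print(pool.map(worker, [4, 8]))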