为什么代码加速不适用于 Cython?
Why doesn't code acceleration work with Cython?
我需要将这段代码加速到 4 毫秒。
import numpy as np
def return_call(data):
num = int(data.shape[0] / 4096)
buff_spectrum = np.empty(2048,dtype= np.uint64)
buff_detect = np.empty(2048,dtype= np.uint64)
end_spetrum = np.empty(num*1024,dtype=np.uint64)
end_detect = np.empty(num*1024,dtype= np.uint64)
_data = np.reshape(data,(num,4096))
for _raw_data_spec in _data:
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
for i in range(2048):
buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
buff_detect[i] = (np.int16(raw_data_spec[i][0])>>15)
for i in range (511,-1,-1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
end_detect[i]=buff_detect[i+1024]
else:
end_spetrum[i] =0
end_detect[i] = 0
for i in range(1023, 511, -1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
end_detect[i] = buff_detect[i + 1024]
else:
end_spetrum[i] = 0
end_detect[i] = 0
return end_spetrum, end_detect
我决定使用 Cython 来完成这项任务。但是我没有得到任何加速。
import numpy as np
cimport numpy
ctypedef signed short DTYPE_t
cpdef return_call(numpy.ndarray[DTYPE_t, ndim=1] data):
cdef int i
cdef int num = data.shape[0]/4096
cdef numpy.ndarray _data
cdef numpy.ndarray[unsigned long long, ndim=1] buff_spectrum = np.empty(2048,dtype= np.uint64)
cdef numpy.ndarray[ unsigned long long, ndim=1] buff_detect = np.empty(2048,dtype= np.uint64)
cdef numpy.ndarray[double , ndim=1] end_spetrum = np.empty(num*1024,dtype= np.double)
cdef numpy.ndarray[double , ndim=1] end_detect = np.empty(num*1024,dtype= np.double)
_data = np.reshape(data,(num,4096))
for _raw_data_spec in _data:
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
for i in range(2048):
buff_spectrum[i] = (np.uint16(raw_data_spec[i][0])<<17)|(np.uint16(raw_data_spec[i][1] <<1))>>1
buff_detect[i] = (np.uint16(raw_data_spec[i][0])>>15)
for i in range (511,-1,-1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
end_detect[i]=buff_detect[i+1024]
else:
end_spetrum[i] =0
end_detect[i] = 0
for i in range(1023, 511, -1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
end_detect[i] = buff_detect[i + 1024]
else:
end_spetrum[i] = 0
end_detect[i] = 0
return end_spetrum, end_detect
我达到的最大速度是 80 毫秒,但我需要更快。由于您需要几乎实时地处理来自 iron 的数据
告诉我原因。并且达到预期的结果是否现实。我还附上了测试文件的代码。
import numpy as np
import example_original
import example_cython
data = np.empty(8192*2, dtype=np.int16)
import time
startpy = time.time()
example_original.return_call(data)
finpy = time.time() -startpy
startcy = time.time()
k,r = example_cython.return_call(data)
fincy = time.time() -startcy
print( fincy, finpy)
print('Cython is {}x faster'.format(finpy/fincy))
我认为一个主要原因可能是因为您的 python 代码几乎没有 python 操作并且所有操作都是 numpy 操作。大部分 numpy 代码是用 C 语言编写的。其中一些是用 Fortran 语言编写的。 Python里面写了很多。编写良好的 numpy 代码在速度上可与 C 代码相媲美。
我对 Cython 的经验不多,所以这只是一个例子,Cython 也应该可以实现什么计时。
例子
import numpy as np
import numba as nb
@nb.njit(cache=True)
def return_call(data_in):
#If the input is not contigous the reshape will fail
#-> make a c-contigous copy if the array isn't c-contigous
data=np.ascontiguousarray(data_in)
num = int(data.shape[0] / 4096)
buff_spectrum = np.zeros(2048,dtype= np.uint64)
buff_detect = np.zeros(2048,dtype= np.uint64)
end_spetrum = np.zeros(num*1024,dtype=np.float64)
end_detect = np.zeros(num*1024,dtype= np.float64)
_data = np.reshape(data,(num,4096))
#for _raw_data_spec in _data: is not supported
#but the followin works
for x in range(_data.shape[0]):
raw_data_spec = np.reshape(_data[x],(2048,2))
for i in range(2048):
buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
buff_detect[i] = (np.int16(raw_data_spec[i][0])>>15)
for i in range (511,-1,-1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
end_detect[i]=buff_detect[i+1024]
else:
end_spetrum[i] =0
end_detect[i] = 0
for i in range(1023, 511, -1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
end_detect[i] = buff_detect[i + 1024]
else:
end_spetrum[i] = 0
end_detect[i] = 0
return end_spetrum, end_detect
计时
data = np.random.rand(8192*2)*20
data=data.astype(np.int16)
#with compilation
%timeit end_spetrum, end_detect=return_call(data)
#32.7 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#without compilation
%timeit end_spetrum, end_detect=return_call_orig(data)
#106 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
raw_data_spec
未输入。在函数的开头为其添加一个定义。我推荐较新的 memoryview 语法(但如果需要,请使用旧的 numpy 语法):
cdef DTYPE_t[:,:] raw_data_spec
这一行(您已确定为瓶颈)一团糟:
buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
一步完成索引,而不是两步:raw_data_spec[i, 0]
(注意很多括号和逗号)。
重新考虑转换为 16 位整数。将 16 位整数移动 17 位 真的有意义吗?
您可能根本不需要强制转换,因为已知数据是 DTYPE_t
,但如果您确实需要强制转换,请使用尖括号:<numpy.uint16_t>(raw_data_spec[i, 0])
考虑关闭 boundscheck
和 wraparound
。 向自己确认这样做是安全的,并且当您索引超出数组末尾或使用负索引时,您不会依赖异常来告诉您。只有经过深思熟虑后才这样做 - 不会以 "cargo cult" 的方式自动执行。
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef return_call(numpy.ndarray[DTYPE_t, ndim=1] data):
放弃对 np.log10
的呼叫。这是对单个元素的整个 Python 调用,最终效率低下。您可以改用 C 标准库数学函数:
from libc.math cimport log10
然后将 np.log10
替换为 log10
。
我需要将这段代码加速到 4 毫秒。
import numpy as np
def return_call(data):
num = int(data.shape[0] / 4096)
buff_spectrum = np.empty(2048,dtype= np.uint64)
buff_detect = np.empty(2048,dtype= np.uint64)
end_spetrum = np.empty(num*1024,dtype=np.uint64)
end_detect = np.empty(num*1024,dtype= np.uint64)
_data = np.reshape(data,(num,4096))
for _raw_data_spec in _data:
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
for i in range(2048):
buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
buff_detect[i] = (np.int16(raw_data_spec[i][0])>>15)
for i in range (511,-1,-1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
end_detect[i]=buff_detect[i+1024]
else:
end_spetrum[i] =0
end_detect[i] = 0
for i in range(1023, 511, -1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
end_detect[i] = buff_detect[i + 1024]
else:
end_spetrum[i] = 0
end_detect[i] = 0
return end_spetrum, end_detect
我决定使用 Cython 来完成这项任务。但是我没有得到任何加速。
import numpy as np
cimport numpy
ctypedef signed short DTYPE_t
cpdef return_call(numpy.ndarray[DTYPE_t, ndim=1] data):
cdef int i
cdef int num = data.shape[0]/4096
cdef numpy.ndarray _data
cdef numpy.ndarray[unsigned long long, ndim=1] buff_spectrum = np.empty(2048,dtype= np.uint64)
cdef numpy.ndarray[ unsigned long long, ndim=1] buff_detect = np.empty(2048,dtype= np.uint64)
cdef numpy.ndarray[double , ndim=1] end_spetrum = np.empty(num*1024,dtype= np.double)
cdef numpy.ndarray[double , ndim=1] end_detect = np.empty(num*1024,dtype= np.double)
_data = np.reshape(data,(num,4096))
for _raw_data_spec in _data:
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
for i in range(2048):
buff_spectrum[i] = (np.uint16(raw_data_spec[i][0])<<17)|(np.uint16(raw_data_spec[i][1] <<1))>>1
buff_detect[i] = (np.uint16(raw_data_spec[i][0])>>15)
for i in range (511,-1,-1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
end_detect[i]=buff_detect[i+1024]
else:
end_spetrum[i] =0
end_detect[i] = 0
for i in range(1023, 511, -1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
end_detect[i] = buff_detect[i + 1024]
else:
end_spetrum[i] = 0
end_detect[i] = 0
return end_spetrum, end_detect
我达到的最大速度是 80 毫秒,但我需要更快。由于您需要几乎实时地处理来自 iron 的数据 告诉我原因。并且达到预期的结果是否现实。我还附上了测试文件的代码。
import numpy as np
import example_original
import example_cython
data = np.empty(8192*2, dtype=np.int16)
import time
startpy = time.time()
example_original.return_call(data)
finpy = time.time() -startpy
startcy = time.time()
k,r = example_cython.return_call(data)
fincy = time.time() -startcy
print( fincy, finpy)
print('Cython is {}x faster'.format(finpy/fincy))
我认为一个主要原因可能是因为您的 python 代码几乎没有 python 操作并且所有操作都是 numpy 操作。大部分 numpy 代码是用 C 语言编写的。其中一些是用 Fortran 语言编写的。 Python里面写了很多。编写良好的 numpy 代码在速度上可与 C 代码相媲美。
我对 Cython 的经验不多,所以这只是一个例子,Cython 也应该可以实现什么计时。
例子
import numpy as np
import numba as nb
@nb.njit(cache=True)
def return_call(data_in):
#If the input is not contigous the reshape will fail
#-> make a c-contigous copy if the array isn't c-contigous
data=np.ascontiguousarray(data_in)
num = int(data.shape[0] / 4096)
buff_spectrum = np.zeros(2048,dtype= np.uint64)
buff_detect = np.zeros(2048,dtype= np.uint64)
end_spetrum = np.zeros(num*1024,dtype=np.float64)
end_detect = np.zeros(num*1024,dtype= np.float64)
_data = np.reshape(data,(num,4096))
#for _raw_data_spec in _data: is not supported
#but the followin works
for x in range(_data.shape[0]):
raw_data_spec = np.reshape(_data[x],(2048,2))
for i in range(2048):
buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
buff_detect[i] = (np.int16(raw_data_spec[i][0])>>15)
for i in range (511,-1,-1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i]=(np.log10(buff_spectrum[i+1024]))
end_detect[i]=buff_detect[i+1024]
else:
end_spetrum[i] =0
end_detect[i] = 0
for i in range(1023, 511, -1):
if buff_spectrum[i+1024] != 0:
end_spetrum[i] = (np.log10(buff_spectrum[i + 1024]))
end_detect[i] = buff_detect[i + 1024]
else:
end_spetrum[i] = 0
end_detect[i] = 0
return end_spetrum, end_detect
计时
data = np.random.rand(8192*2)*20
data=data.astype(np.int16)
#with compilation
%timeit end_spetrum, end_detect=return_call(data)
#32.7 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#without compilation
%timeit end_spetrum, end_detect=return_call_orig(data)
#106 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
raw_data_spec = np.reshape(_raw_data_spec,(2048,2))
raw_data_spec
未输入。在函数的开头为其添加一个定义。我推荐较新的 memoryview 语法(但如果需要,请使用旧的 numpy 语法):
cdef DTYPE_t[:,:] raw_data_spec
这一行(您已确定为瓶颈)一团糟:
buff_spectrum[i] = (np.int16(raw_data_spec[i][0])<<17)|(np.int16(raw_data_spec[i][1] <<1))>>1
一步完成索引,而不是两步:
raw_data_spec[i, 0]
(注意很多括号和逗号)。重新考虑转换为 16 位整数。将 16 位整数移动 17 位 真的有意义吗?
您可能根本不需要强制转换,因为已知数据是
DTYPE_t
,但如果您确实需要强制转换,请使用尖括号:<numpy.uint16_t>(raw_data_spec[i, 0])
考虑关闭 boundscheck
和 wraparound
。 向自己确认这样做是安全的,并且当您索引超出数组末尾或使用负索引时,您不会依赖异常来告诉您。只有经过深思熟虑后才这样做 - 不会以 "cargo cult" 的方式自动执行。
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef return_call(numpy.ndarray[DTYPE_t, ndim=1] data):
放弃对 np.log10
的呼叫。这是对单个元素的整个 Python 调用,最终效率低下。您可以改用 C 标准库数学函数:
from libc.math cimport log10
然后将 np.log10
替换为 log10
。