用 cython 声明一个 numpy 数组会奇怪地产生大量开销
Declaring a numpy array with cython strangely generates a lot of overhead
我正在使用 Cython 重写一些 Python 代码。
按照文档中的建议,我开始用优化的 Cython 定义替换我的 Python 数组。
特别是,下面应该是声明 numpy 数组的“最佳”方式:
# cython: profile=True
# cython: boundscheck=False
# cython: wraparound=False
import numpy as np
cimport numpy as np
cpdef test():
cdef np.ndarray[np.int_t, ndim=1] seeds_idx = np.empty(10, dtype=np.int)
pass
但是,通过 cython -a my_file.pyx
分析上述代码生成的 html 文件显示如下:
+10: cdef np.ndarray[np.int_t, ndim=1] seeds_idx = np.empty(10, dtype=np.int)
__pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_empty); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_3);
__pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_int); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_4) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
__pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple_, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
if (!(likely(((__pyx_t_4) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_4, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 10, __pyx_L1_error)
__pyx_t_5 = ((PyArrayObject *)__pyx_t_4);
{
__Pyx_BufFmt_StackElem __pyx_stack[1];
if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer, (PyObject*)__pyx_t_5, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
__pyx_v_seeds_idx = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.buf = NULL;
__PYX_ERR(0, 10, __pyx_L1_error)
} else {__pyx_pybuffernd_seeds_idx.diminfo[0].strides = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_seeds_idx.diminfo[0].shape = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.shape[0];
}
}
__pyx_t_5 = 0;
__pyx_v_seeds_idx = ((PyArrayObject *)__pyx_t_4);
__pyx_t_4 = 0;
/* … */
__pyx_tuple_ = PyTuple_Pack(1, __pyx_int_10); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_tuple_);
__Pyx_GIVEREF(__pyx_tuple_);
这是在 Python 2.7 上使用 cython 0.24 和 numpy 1.10.4 获得的。
另一方面,非常简单的声明 seeds_idx = np.empty(10)
结果是:
+10: seeds_idx = np.empty(10)
__pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_empty); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__pyx_v_seeds_idx = __pyx_t_1;
__pyx_t_1 = 0;
/* … */
__pyx_tuple_ = PyTuple_Pack(1, __pyx_int_10); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_tuple_);
__Pyx_GIVEREF(__pyx_tuple_);
这里出了什么问题(如果有的话)?谢谢!
如评论所述,这里没有任何问题,因此无需担心。另外,请记住,您正在检查为简单赋值生成的代码,任何差异都不会影响性能。
不过有一处小更正:在第二种情况下,seeds_idx = np.empty(10)
应该更改为 seeds_idx = np.empty(10, dtype=np.int)
以匹配第一种情况。
如果加上这个参数,生成的代码中还会多出为存放函数调用 (np.empty
) 的关键字参数而创建的字典:
__pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
以及对 np.int 的查找:
__pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_3);
__pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_int); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
最后,在新创建的字典中设置该参数:
if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_4) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
除了这些,它们之间唯一的区别是:
if (!(likely(((__pyx_t_4) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_4, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 10, __pyx_L1_error)
__pyx_t_5 = ((PyArrayObject *)__pyx_t_4);
{
__Pyx_BufFmt_StackElem __pyx_stack[1];
if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer, (PyObject*)__pyx_t_5, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
__pyx_v_seeds_idx = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.buf = NULL;
__PYX_ERR(0, 10, __pyx_L1_error)
} else {__pyx_pybuffernd_seeds_idx.diminfo[0].strides = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_seeds_idx.diminfo[0].shape = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.shape[0];
}
}
正如您链接的文档中所述,这段代码最有可能是为了快速访问数据缓冲区。
到目前为止,最好的选择是使用类型化内存视图(typed memoryviews)。这是在 Cython 中处理数组的原生方式,也很可能是最简单的方式。它们的性能通常与 numpy 数组相当;即使不是,您也可以随时轻松地在两者之间切换。
我正在使用 Cython 重写一些 Python 代码。
按照文档中的建议,我开始用优化的 Cython 定义替换我的 Python 数组。
特别是,下面应该是声明 numpy 数组的“最佳”方式:
# cython: profile=True
# cython: boundscheck=False
# cython: wraparound=False
import numpy as np
cimport numpy as np
cpdef test():
cdef np.ndarray[np.int_t, ndim=1] seeds_idx = np.empty(10, dtype=np.int)
pass
但是,通过 cython -a my_file.pyx
分析上述代码生成的 html 文件显示如下:
+10: cdef np.ndarray[np.int_t, ndim=1] seeds_idx = np.empty(10, dtype=np.int)
__pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_empty); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_3);
__pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_int); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_4) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
__pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple_, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
if (!(likely(((__pyx_t_4) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_4, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 10, __pyx_L1_error)
__pyx_t_5 = ((PyArrayObject *)__pyx_t_4);
{
__Pyx_BufFmt_StackElem __pyx_stack[1];
if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer, (PyObject*)__pyx_t_5, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
__pyx_v_seeds_idx = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.buf = NULL;
__PYX_ERR(0, 10, __pyx_L1_error)
} else {__pyx_pybuffernd_seeds_idx.diminfo[0].strides = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_seeds_idx.diminfo[0].shape = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.shape[0];
}
}
__pyx_t_5 = 0;
__pyx_v_seeds_idx = ((PyArrayObject *)__pyx_t_4);
__pyx_t_4 = 0;
/* … */
__pyx_tuple_ = PyTuple_Pack(1, __pyx_int_10); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_tuple_);
__Pyx_GIVEREF(__pyx_tuple_);
这是在 Python 2.7 上使用 cython 0.24 和 numpy 1.10.4 获得的。
另一方面,非常简单的声明 seeds_idx = np.empty(10)
结果是:
+10: seeds_idx = np.empty(10)
__pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_empty); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_2);
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
__pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__pyx_v_seeds_idx = __pyx_t_1;
__pyx_t_1 = 0;
/* … */
__pyx_tuple_ = PyTuple_Pack(1, __pyx_int_10); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_tuple_);
__Pyx_GIVEREF(__pyx_tuple_);
这里出了什么问题(如果有的话)?谢谢!
如评论所述,这里没有任何问题,因此无需担心。另外,请记住,您正在检查为简单赋值生成的代码,任何差异都不会影响性能。
不过有一处小更正:在第二种情况下,seeds_idx = np.empty(10)
应该更改为 seeds_idx = np.empty(10, dtype=np.int)
以匹配第一种情况。
如果加上这个参数,生成的代码中还会多出为存放函数调用 (np.empty
) 的关键字参数而创建的字典:
__pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_1);
以及对 np.int 的查找:
__pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_3);
__pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_int); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 10, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_4);
__Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
最后,在新创建的字典中设置该参数:
if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_4) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
除了这些,它们之间唯一的区别是:
if (!(likely(((__pyx_t_4) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_4, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 10, __pyx_L1_error)
__pyx_t_5 = ((PyArrayObject *)__pyx_t_4);
{
__Pyx_BufFmt_StackElem __pyx_stack[1];
if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer, (PyObject*)__pyx_t_5, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
__pyx_v_seeds_idx = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.buf = NULL;
__PYX_ERR(0, 10, __pyx_L1_error)
} else {__pyx_pybuffernd_seeds_idx.diminfo[0].strides = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_seeds_idx.diminfo[0].shape = __pyx_pybuffernd_seeds_idx.rcbuffer->pybuffer.shape[0];
}
}
正如您链接的文档中所述,这段代码最有可能是为了快速访问数据缓冲区。
到目前为止,最好的选择是使用类型化内存视图(typed memoryviews)。这是在 Cython 中处理数组的原生方式,也很可能是最简单的方式。它们的性能通常与 numpy 数组相当;即使不是,您也可以随时轻松地在两者之间切换。