直接在 Scipy 稀疏矩阵上使用 Intel mkl 库计算 A 点 A.T 内存更少
Directly use Intel mkl library on Scipy sparse matrix to calculate A dot A.T with less memory
我要调用mkl.mkl_scsrmultcsr from python. The goal is to calculate a sparse matrix C in compressed sparse row格式。稀疏矩阵C是A与A的转置的矩阵乘积,其中A也是csr格式的稀疏矩阵。在用scipy计算C = A点(A.T)时,scipy似乎(?)分配新内存用于保存A(A.T)的转置,并且肯定分配内存对于新的 C 矩阵(这意味着我不能使用现有的 C 矩阵)。所以,我想尝试直接使用 mkl c 函数来减少内存占用。
Here 是适用于另一个 mkl 函数的答案。在那个答案中,mkl 函数快了 4 倍。
经过4天的努力,以下版本终于可以使用了。我想知道我是否浪费了任何记忆。 ctype 会复制 numpy 数组吗?测试函数中的 csr->csc 转换是否必要? intel c函数可以计算(A.T)点A,或者A点A,但不能计算出A点(A.T).
再次感谢。
from ctypes import *
import scipy.sparse as spsp
import numpy as np
import multiprocessing as mp
# June 2nd 2016 version.
# Load the share library
mkl = cdll.LoadLibrary("libmkl_rt.so")
def get_csr_handle(A,clear=False):
if clear == True:
A.indptr[:] = 0
A.indices[:] = 0
A.data[:] = 0
a_pointer = A.data.ctypes.data_as(POINTER(c_float))
# Array containing non-zero elements of the matrix A.
# This corresponds to data array of csr_matrix
# Its length is equal to #non zero elements in A
# (Can this be longer than actual #non-zero elements?)
assert A.data.ctypes.data % 16 == 0 # Check alignment
ja_pointer = A.indices.ctypes.data_as(POINTER(c_int))
# Array of column indices of all non-zero elements of A.
# This corresponds to the indices array of csr_matrix
assert A.indices.ctypes.data % 16 == 0 # Check alignment
ia_pointer = A.indptr.ctypes.data_as(POINTER(c_int))
# Array of length m+1.
# a[ia[i]:ia[i+1]] is the value of nonzero entries of
# the ith row of A.
# ja[ia[i]:ia[i+1]] is the column indices of nonzero
# entries of the ith row of A
# This corresponds to the indptr array of csr_matrix
assert A.indptr.ctypes.data % 16 == 0 # Check alignment
A_data_size = A.data.size
A_indices_size = A.indices.size
A_indptr_size = A.indptr.size
return (a_pointer, ja_pointer, ia_pointer, A)
def csr_dot_csr_t(A_handle, C_handle, nz=None):
# Calculate (A.T).dot(A) and put result into C
#
# This uses one-based indexing
#
# Both C.data and A.data must be in np.float32 type.
#
# Number of nonzero elements in C must be greater than
# or equal to the size of C.data
#
# size of C.indptr must be greater than or equal to
# 1 + (num rows of A).
#
# C_data = np.zeros((nz), dtype=np.single)
# C_indices = np.zeros((nz), dtype=np.int32)
# C_indptr = np.zeros((m+1),dtype=np.int32)
#assert len(c_pointer._obj) >= 1 + A_shape[0]
(a_pointer, ja_pointer, ia_pointer, A) = A_handle
(c_pointer, jc_pointer, ic_pointer, C) = C_handle
#print "CCC",C
#assert type(C.data[0]) == np.float32
#assert type(A.data[0]) == np.float32
#assert C.indptr.size >= A.shape[0] + 1
#CC = A.dot(A.T)
#assert C.data.size >= nz
#assert C.indices.size >= nz
trans_pointer = byref(c_char('T'))
sort_pointer = byref(c_int(0))
(m, n) = A.shape
sort_pointer = byref(c_int(0))
m_pointer = byref(c_int(m)) # Number of rows of matrix A
n_pointer = byref(c_int(n)) # Number of columns of matrix A
k_pointer = byref(c_int(n)) # Number of columns of matrix B
# should be n when trans='T'
# Otherwise, I guess should be m
###
b_pointer = a_pointer
jb_pointer = ja_pointer
ib_pointer = ia_pointer
###
if nz == None:
nz = n*n #*n # m*m # Number of nonzero elements expected
# probably can use lower value for sparse
# matrices.
nzmax_pointer = byref(c_int(nz))
# length of arrays c and jc. (which are data and
# indices of csr_matrix). So this is the number of
# nonzero elements of matrix C
#
# This parameter is used only if request=0.
# The routine stops calculation if the number of
# elements in the result matrix C exceeds the
# specified value of nzmax.
info = c_int(-3)
info_pointer = byref(info)
request_pointer_list = [byref(c_int(0)), byref(c_int(1)), byref(c_int(2))]
return_list = []
for ii in [0]:
request_pointer = request_pointer_list[ii]
ret = mkl.mkl_scsrmultcsr(trans_pointer, request_pointer, sort_pointer,
m_pointer, n_pointer, k_pointer,
a_pointer, ja_pointer, ia_pointer,
b_pointer, jb_pointer, ib_pointer,
c_pointer, jc_pointer, ic_pointer,
nzmax_pointer, info_pointer)
info_val = info.value
return_list += [ (ret,info_val) ]
return return_list
def show_csr_internal(A, indent=4):
# Print data, indptr, and indices
# of a scipy csr_matrix A
name = ['data', 'indptr', 'indices']
mat = [A.data, A.indptr, A.indices]
for i in range(3):
str_print = ' '*indent+name[i]+':\n%s'%mat[i]
str_print = str_print.replace('\n', '\n'+' '*indent*2)
print(str_print)
def fix_for_scipy(C,A):
n = A.shape[1]
print "fix n", n
nz = C.indptr[n] - 1 # -1 as this is still one based indexing.
print "fix nz", nz
data = C.data[:nz]
C.indptr[:n+1] -= 1
indptr = C.indptr[:n+1]
C.indices[:nz] -= 1
indices = C.indices[:nz]
return spsp.csr_matrix( (data, indices, indptr), shape=(n,n))
def test():
AA= [[1,0,0,1],
[1,0,1,0],
[0,0,1,0]]
AA = np.random.choice([0,1], size=(3,750000), replace=True, p=[0.99,0.01])
A_original = spsp.csr_matrix(AA)
#A = spsp.csr_matrix(A_original, dtype=np.float32)
A = A_original.astype(np.float32).tocsc()
#A_original = A.todense()
A = spsp.csr_matrix( (A.data, A.indices, A.indptr) )
print "A:"
show_csr_internal(A)
print A.todense()
A.indptr += 1 # convert to 1-based indexing
A.indices += 1 # convert to 1-based indexing
A_ptrs = get_csr_handle(A)
C = spsp.csr_matrix( np.ones((3,3)), dtype=np.float32)
#C.data = C.data[:16].view()
#C.indptr = C.indptr
C_ptrs = get_csr_handle(C, clear=True)
print "C:"
show_csr_internal(C)
print "=call mkl function="
return_list= csr_dot_csr_t(A_ptrs, C_ptrs)
print "(ret, info):", return_list
print "C after calling mkl:"
show_csr_internal(C)
C_fix = fix_for_scipy(C,A)
print "C_fix for scipy:"
show_csr_internal(C_fix)
print C_fix.todense()
print "Original C after fixing:"
show_csr_internal(C)
print "scipy's (A).dot(A.T)"
scipy_ans = (A_original).dot(A_original.T)
#scipy_ans = spsp.csr_matrix(scipy_ans)
show_csr_internal(scipy_ans)
print scipy_ans.todense()
if __name__ == "__main__":
test()
结果:
A:
data:
[ 1. 1. 1. ..., 1. 1. 1.]
indptr:
[ 0 0 0 ..., 22673 22673 22673]
indices:
[1 0 2 ..., 2 1 2]
[[ 0. 0. 0.]
[ 0. 0. 0.]
[ 0. 0. 0.]
...,
[ 0. 0. 0.]
[ 0. 0. 0.]
[ 0. 0. 0.]]
C:
data:
[ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
indptr:
[0 0 0 0]
indices:
[0 0 0 0 0 0 0 0 0]
=call mkl function=
(ret, info): [(2, 0)]
C after calling mkl:
data:
[ 7576. 77. 83. 77. 7607. 104. 83. 104. 7490.]
indptr:
[ 1 4 7 10]
indices:
[1 2 3 1 2 3 1 2 3]
fix n 3
fix nz 9
C_fix for scipy:
data:
[ 7576. 77. 83. 77. 7607. 104. 83. 104. 7490.]
indptr:
[0 3 6 9]
indices:
[0 1 2 0 1 2 0 1 2]
[[ 7576. 77. 83.]
[ 77. 7607. 104.]
[ 83. 104. 7490.]]
Original C after fixing:
data:
[ 7576. 77. 83. 77. 7607. 104. 83. 104. 7490.]
indptr:
[0 3 6 9]
indices:
[0 1 2 0 1 2 0 1 2]
scipy's (A.T).dot(A)
data:
[ 83 77 7576 104 77 7607 83 104 7490]
indptr:
[0 3 6 9]
indices:
[2 1 0 2 0 1 0 1 2]
[[7576 77 83]
[ 77 7607 104]
[ 83 104 7490]]
学到的东西:
- 对于矩阵 A、B 和 C,索引从 1 开始。
- 我在原始代码中混淆了 c_indptr 和 c_indices。应该是ia
= scipy csr_matrix 的 indptr。 ja = scipy csr_matrix.
的索引
来自代码here。一切都作为指针传递给 mkl_?csrmultcsr。 mkl_scsrmultcsr(&ta, &r[1], &sort, &m, &m, &m, a, ja, ia, a, ja, ia, c, jc, ic, &nzmax, &info);
我想要一个可以使用从零开始的索引的 mkl 函数。 mkl.mkl_scsrmultcsr 函数只能用于基于一个的索引。 (或者我可以对所有内容进行基于一个的索引。这意味着对于大多数线性代数步骤,使用 intel c 函数而不是 scipy/numpy。)
查看 Python 稀疏产品的 scipy 代码。请注意,它分两次调用编译后的代码。
看起来 mkl 代码做了同样的事情
https://software.intel.com/en-us/node/468640
If request=1, the routine computes only values of the array ic of length m + 1, the memory for this array must be allocated beforehand. On exit the value ic(m+1) - 1 is the actual number of the elements in the arrays c and jc.
If request=2, the routine has been called previously with the parameter request=1, the output arrays jc and c are allocated in the calling program and they are of the length ic(m+1) - 1 at least.
你首先根据C
的行数分配了ic
(你从输入中知道),然后用request=1
调用mkl代码。
对于 request=2
,您必须根据 ic(m+1) - 1
中的大小分配 c
和 jc
数组。这与输入数组中 nnz
的数量不同。
您正在使用 request1 = c_int(0)
,这要求 c
数组的大小正确 - 如果不实际执行 dot
(或 request 1
).
==================
File: /usr/lib/python3/dist-packages/scipy/sparse/compressed.py
Definition: A._mul_sparse_matrix(self, other)
pass 1 分配indptr
(注意大小),并传递指针(数据在此pass 无关紧要)
indptr = np.empty(major_axis + 1, dtype=idx_dtype)
fn = getattr(_sparsetools, self.format + '_matmat_pass1')
fn(M, N,
np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
indptr)
nnz = indptr[-1]
pass 2分配indptr
(大小不同),并基于nnz
indices
和data
.
indptr = np.asarray(indptr, dtype=idx_dtype)
indices = np.empty(nnz, dtype=idx_dtype)
data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
fn = getattr(_sparsetools, self.format + '_matmat_pass2')
fn(M, N, np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
self.data,
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
other.data,
indptr, indices, data)
最后使用这些数组创建一个新数组。
return self.__class__((data,indices,indptr),shape=(M,N))
mkl
库应该以同样的方式使用。
===================
https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h
有 csr_matmat_pass1
和 csr_matmat_pass2
的 c 代码
====================
如果有帮助,这里是这些通道的纯 Python 实现。没有试图利用任何数组操作的直译。
def pass1(A, B):
nrow,ncol=A.shape
Aptr=A.indptr
Bptr=B.indptr
Cp=np.zeros(nrow+1,int)
mask=np.zeros(ncol,int)-1
nnz=0
for i in range(nrow):
row_nnz=0
for jj in range(Aptr[i],Aptr[i+1]):
j=A.indices[jj]
for kk in range(Bptr[j],Bptr[j+1]):
k=B.indices[kk]
if mask[k]!=i:
mask[k]=i
row_nnz += 1
nnz += row_nnz
Cp[i+1]=nnz
return Cp
def pass2(A,B,Cnnz):
nrow,ncol=A.shape
Ap,Aj,Ax=A.indptr,A.indices,A.data
Bp,Bj,Bx=B.indptr,B.indices,B.data
next = np.zeros(ncol,int)-1
sums = np.zeros(ncol,A.dtype)
Cp=np.zeros(nrow+1,int)
Cj=np.zeros(Cnnz,int)
Cx=np.zeros(Cnnz,A.dtype)
nnz = 0
for i in range(nrow):
head = -2
length = 0
for jj in range(Ap[i], Ap[i+1]):
j, v = Aj[jj], Ax[jj]
for kk in range(Bp[j], Bp[j+1]):
k = Bj[kk]
sums[k] += v*Bx[kk]
if (next[k]==-1):
next[k], head=head, k
length += 1
print(i,sums, next)
for _ in range(length):
if sums[head] !=0:
Cj[nnz], Cx[nnz] = head, sums[head]
nnz += 1
temp = head
head = next[head]
next[temp], sums[temp] = -1, 0
Cp[i+1] = nnz
return Cp, Cj, Cx
def pass0(A,B):
Cp = pass1(A,B)
nnz = Cp[-1]
Cp,Cj,Cx=pass2(A,B,nnz)
N,M=A.shape[0], B.shape[1]
C=sparse.csr_matrix((Cx, Cj, Cp), shape=(N,M))
return C
A
和 B
都必须是 csr
。它使用转置需要转换,例如B = A.T.tocsr()
.
我要调用mkl.mkl_scsrmultcsr from python. The goal is to calculate a sparse matrix C in compressed sparse row格式。稀疏矩阵C是A与A的转置的矩阵乘积,其中A也是csr格式的稀疏矩阵。在用scipy计算C = A点(A.T)时,scipy似乎(?)分配新内存用于保存A(A.T)的转置,并且肯定分配内存对于新的 C 矩阵(这意味着我不能使用现有的 C 矩阵)。所以,我想尝试直接使用 mkl c 函数来减少内存占用。
Here 是适用于另一个 mkl 函数的答案。在那个答案中,mkl 函数快了 4 倍。
经过4天的努力,以下版本终于可以使用了。我想知道我是否浪费了任何记忆。 ctype 会复制 numpy 数组吗?测试函数中的 csr->csc 转换是否必要? intel c函数可以计算(A.T)点A,或者A点A,但不能计算出A点(A.T).
再次感谢。
from ctypes import *
import scipy.sparse as spsp
import numpy as np
import multiprocessing as mp
# June 2nd 2016 version.
# Load the share library
mkl = cdll.LoadLibrary("libmkl_rt.so")
def get_csr_handle(A,clear=False):
if clear == True:
A.indptr[:] = 0
A.indices[:] = 0
A.data[:] = 0
a_pointer = A.data.ctypes.data_as(POINTER(c_float))
# Array containing non-zero elements of the matrix A.
# This corresponds to data array of csr_matrix
# Its length is equal to #non zero elements in A
# (Can this be longer than actual #non-zero elements?)
assert A.data.ctypes.data % 16 == 0 # Check alignment
ja_pointer = A.indices.ctypes.data_as(POINTER(c_int))
# Array of column indices of all non-zero elements of A.
# This corresponds to the indices array of csr_matrix
assert A.indices.ctypes.data % 16 == 0 # Check alignment
ia_pointer = A.indptr.ctypes.data_as(POINTER(c_int))
# Array of length m+1.
# a[ia[i]:ia[i+1]] is the value of nonzero entries of
# the ith row of A.
# ja[ia[i]:ia[i+1]] is the column indices of nonzero
# entries of the ith row of A
# This corresponds to the indptr array of csr_matrix
assert A.indptr.ctypes.data % 16 == 0 # Check alignment
A_data_size = A.data.size
A_indices_size = A.indices.size
A_indptr_size = A.indptr.size
return (a_pointer, ja_pointer, ia_pointer, A)
def csr_dot_csr_t(A_handle, C_handle, nz=None):
# Calculate (A.T).dot(A) and put result into C
#
# This uses one-based indexing
#
# Both C.data and A.data must be in np.float32 type.
#
# Number of nonzero elements in C must be greater than
# or equal to the size of C.data
#
# size of C.indptr must be greater than or equal to
# 1 + (num rows of A).
#
# C_data = np.zeros((nz), dtype=np.single)
# C_indices = np.zeros((nz), dtype=np.int32)
# C_indptr = np.zeros((m+1),dtype=np.int32)
#assert len(c_pointer._obj) >= 1 + A_shape[0]
(a_pointer, ja_pointer, ia_pointer, A) = A_handle
(c_pointer, jc_pointer, ic_pointer, C) = C_handle
#print "CCC",C
#assert type(C.data[0]) == np.float32
#assert type(A.data[0]) == np.float32
#assert C.indptr.size >= A.shape[0] + 1
#CC = A.dot(A.T)
#assert C.data.size >= nz
#assert C.indices.size >= nz
trans_pointer = byref(c_char('T'))
sort_pointer = byref(c_int(0))
(m, n) = A.shape
sort_pointer = byref(c_int(0))
m_pointer = byref(c_int(m)) # Number of rows of matrix A
n_pointer = byref(c_int(n)) # Number of columns of matrix A
k_pointer = byref(c_int(n)) # Number of columns of matrix B
# should be n when trans='T'
# Otherwise, I guess should be m
###
b_pointer = a_pointer
jb_pointer = ja_pointer
ib_pointer = ia_pointer
###
if nz == None:
nz = n*n #*n # m*m # Number of nonzero elements expected
# probably can use lower value for sparse
# matrices.
nzmax_pointer = byref(c_int(nz))
# length of arrays c and jc. (which are data and
# indices of csr_matrix). So this is the number of
# nonzero elements of matrix C
#
# This parameter is used only if request=0.
# The routine stops calculation if the number of
# elements in the result matrix C exceeds the
# specified value of nzmax.
info = c_int(-3)
info_pointer = byref(info)
request_pointer_list = [byref(c_int(0)), byref(c_int(1)), byref(c_int(2))]
return_list = []
for ii in [0]:
request_pointer = request_pointer_list[ii]
ret = mkl.mkl_scsrmultcsr(trans_pointer, request_pointer, sort_pointer,
m_pointer, n_pointer, k_pointer,
a_pointer, ja_pointer, ia_pointer,
b_pointer, jb_pointer, ib_pointer,
c_pointer, jc_pointer, ic_pointer,
nzmax_pointer, info_pointer)
info_val = info.value
return_list += [ (ret,info_val) ]
return return_list
def show_csr_internal(A, indent=4):
# Print data, indptr, and indices
# of a scipy csr_matrix A
name = ['data', 'indptr', 'indices']
mat = [A.data, A.indptr, A.indices]
for i in range(3):
str_print = ' '*indent+name[i]+':\n%s'%mat[i]
str_print = str_print.replace('\n', '\n'+' '*indent*2)
print(str_print)
def fix_for_scipy(C,A):
n = A.shape[1]
print "fix n", n
nz = C.indptr[n] - 1 # -1 as this is still one based indexing.
print "fix nz", nz
data = C.data[:nz]
C.indptr[:n+1] -= 1
indptr = C.indptr[:n+1]
C.indices[:nz] -= 1
indices = C.indices[:nz]
return spsp.csr_matrix( (data, indices, indptr), shape=(n,n))
def test():
AA= [[1,0,0,1],
[1,0,1,0],
[0,0,1,0]]
AA = np.random.choice([0,1], size=(3,750000), replace=True, p=[0.99,0.01])
A_original = spsp.csr_matrix(AA)
#A = spsp.csr_matrix(A_original, dtype=np.float32)
A = A_original.astype(np.float32).tocsc()
#A_original = A.todense()
A = spsp.csr_matrix( (A.data, A.indices, A.indptr) )
print "A:"
show_csr_internal(A)
print A.todense()
A.indptr += 1 # convert to 1-based indexing
A.indices += 1 # convert to 1-based indexing
A_ptrs = get_csr_handle(A)
C = spsp.csr_matrix( np.ones((3,3)), dtype=np.float32)
#C.data = C.data[:16].view()
#C.indptr = C.indptr
C_ptrs = get_csr_handle(C, clear=True)
print "C:"
show_csr_internal(C)
print "=call mkl function="
return_list= csr_dot_csr_t(A_ptrs, C_ptrs)
print "(ret, info):", return_list
print "C after calling mkl:"
show_csr_internal(C)
C_fix = fix_for_scipy(C,A)
print "C_fix for scipy:"
show_csr_internal(C_fix)
print C_fix.todense()
print "Original C after fixing:"
show_csr_internal(C)
print "scipy's (A).dot(A.T)"
scipy_ans = (A_original).dot(A_original.T)
#scipy_ans = spsp.csr_matrix(scipy_ans)
show_csr_internal(scipy_ans)
print scipy_ans.todense()
if __name__ == "__main__":
test()
结果:
A:
data:
[ 1. 1. 1. ..., 1. 1. 1.]
indptr:
[ 0 0 0 ..., 22673 22673 22673]
indices:
[1 0 2 ..., 2 1 2]
[[ 0. 0. 0.]
[ 0. 0. 0.]
[ 0. 0. 0.]
...,
[ 0. 0. 0.]
[ 0. 0. 0.]
[ 0. 0. 0.]]
C:
data:
[ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
indptr:
[0 0 0 0]
indices:
[0 0 0 0 0 0 0 0 0]
=call mkl function=
(ret, info): [(2, 0)]
C after calling mkl:
data:
[ 7576. 77. 83. 77. 7607. 104. 83. 104. 7490.]
indptr:
[ 1 4 7 10]
indices:
[1 2 3 1 2 3 1 2 3]
fix n 3
fix nz 9
C_fix for scipy:
data:
[ 7576. 77. 83. 77. 7607. 104. 83. 104. 7490.]
indptr:
[0 3 6 9]
indices:
[0 1 2 0 1 2 0 1 2]
[[ 7576. 77. 83.]
[ 77. 7607. 104.]
[ 83. 104. 7490.]]
Original C after fixing:
data:
[ 7576. 77. 83. 77. 7607. 104. 83. 104. 7490.]
indptr:
[0 3 6 9]
indices:
[0 1 2 0 1 2 0 1 2]
scipy's (A.T).dot(A)
data:
[ 83 77 7576 104 77 7607 83 104 7490]
indptr:
[0 3 6 9]
indices:
[2 1 0 2 0 1 0 1 2]
[[7576 77 83]
[ 77 7607 104]
[ 83 104 7490]]
学到的东西:
- 对于矩阵 A、B 和 C,索引从 1 开始。
- 我在原始代码中混淆了 c_indptr 和 c_indices。应该是ia = scipy csr_matrix 的 indptr。 ja = scipy csr_matrix. 的索引
来自代码here。一切都作为指针传递给 mkl_?csrmultcsr。
mkl_scsrmultcsr(&ta, &r[1], &sort, &m, &m, &m, a, ja, ia, a, ja, ia, c, jc, ic, &nzmax, &info);
我想要一个可以使用从零开始的索引的 mkl 函数。 mkl.mkl_scsrmultcsr 函数只能用于基于一个的索引。 (或者我可以对所有内容进行基于一个的索引。这意味着对于大多数线性代数步骤,使用 intel c 函数而不是 scipy/numpy。)
查看 Python 稀疏产品的 scipy 代码。请注意,它分两次调用编译后的代码。
看起来 mkl 代码做了同样的事情
https://software.intel.com/en-us/node/468640
If request=1, the routine computes only values of the array ic of length m + 1, the memory for this array must be allocated beforehand. On exit the value ic(m+1) - 1 is the actual number of the elements in the arrays c and jc.
If request=2, the routine has been called previously with the parameter request=1, the output arrays jc and c are allocated in the calling program and they are of the length ic(m+1) - 1 at least.
你首先根据C
的行数分配了ic
(你从输入中知道),然后用request=1
调用mkl代码。
对于 request=2
,您必须根据 ic(m+1) - 1
中的大小分配 c
和 jc
数组。这与输入数组中 nnz
的数量不同。
您正在使用 request1 = c_int(0)
,这要求 c
数组的大小正确 - 如果不实际执行 dot
(或 request 1
).
==================
File: /usr/lib/python3/dist-packages/scipy/sparse/compressed.py
Definition: A._mul_sparse_matrix(self, other)
pass 1 分配indptr
(注意大小),并传递指针(数据在此pass 无关紧要)
indptr = np.empty(major_axis + 1, dtype=idx_dtype)
fn = getattr(_sparsetools, self.format + '_matmat_pass1')
fn(M, N,
np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
indptr)
nnz = indptr[-1]
pass 2分配indptr
(大小不同),并基于nnz
indices
和data
.
indptr = np.asarray(indptr, dtype=idx_dtype)
indices = np.empty(nnz, dtype=idx_dtype)
data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
fn = getattr(_sparsetools, self.format + '_matmat_pass2')
fn(M, N, np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
self.data,
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
other.data,
indptr, indices, data)
最后使用这些数组创建一个新数组。
return self.__class__((data,indices,indptr),shape=(M,N))
mkl
库应该以同样的方式使用。
===================
https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h
有 csr_matmat_pass1
和 csr_matmat_pass2
====================
如果有帮助,这里是这些通道的纯 Python 实现。没有试图利用任何数组操作的直译。
def pass1(A, B):
nrow,ncol=A.shape
Aptr=A.indptr
Bptr=B.indptr
Cp=np.zeros(nrow+1,int)
mask=np.zeros(ncol,int)-1
nnz=0
for i in range(nrow):
row_nnz=0
for jj in range(Aptr[i],Aptr[i+1]):
j=A.indices[jj]
for kk in range(Bptr[j],Bptr[j+1]):
k=B.indices[kk]
if mask[k]!=i:
mask[k]=i
row_nnz += 1
nnz += row_nnz
Cp[i+1]=nnz
return Cp
def pass2(A,B,Cnnz):
nrow,ncol=A.shape
Ap,Aj,Ax=A.indptr,A.indices,A.data
Bp,Bj,Bx=B.indptr,B.indices,B.data
next = np.zeros(ncol,int)-1
sums = np.zeros(ncol,A.dtype)
Cp=np.zeros(nrow+1,int)
Cj=np.zeros(Cnnz,int)
Cx=np.zeros(Cnnz,A.dtype)
nnz = 0
for i in range(nrow):
head = -2
length = 0
for jj in range(Ap[i], Ap[i+1]):
j, v = Aj[jj], Ax[jj]
for kk in range(Bp[j], Bp[j+1]):
k = Bj[kk]
sums[k] += v*Bx[kk]
if (next[k]==-1):
next[k], head=head, k
length += 1
print(i,sums, next)
for _ in range(length):
if sums[head] !=0:
Cj[nnz], Cx[nnz] = head, sums[head]
nnz += 1
temp = head
head = next[head]
next[temp], sums[temp] = -1, 0
Cp[i+1] = nnz
return Cp, Cj, Cx
def pass0(A,B):
Cp = pass1(A,B)
nnz = Cp[-1]
Cp,Cj,Cx=pass2(A,B,nnz)
N,M=A.shape[0], B.shape[1]
C=sparse.csr_matrix((Cx, Cj, Cp), shape=(N,M))
return C
A
和 B
都必须是 csr
。它使用转置需要转换,例如B = A.T.tocsr()
.