getrs function of cuSolver over pycuda doesn't work properly
I am trying to make a pycuda wrapper, inspired by the scikits-cuda library, for some of the operations provided in Nvidia's new cuSolver library. I want to solve a linear system of the form AX=B by LU factorization: first I use the cublasSgetrfBatched method from scikits-cuda, which gives me the LU factorization; then, with that factorization, I want to solve the system using cusolverDnSgetrs, which is the function I am trying to wrap. When I run the computation it returns status 3, the matrix that should hold the answer is left unchanged, and *devInfo is zero. Checking the cuSolver documentation, status 3 means:
CUSOLVER_STATUS_INVALID_VALUE=An unsupported value or parameter was passed to the function (a negative vector size, for example).
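For reference, libcusolver returns these statuses as plain integers, so a small lookup table makes a returned 3 easier to read. This is only a minimal sketch: the value 3 = CUSOLVER_STATUS_INVALID_VALUE matches the documentation quoted above, while the other entries are assumed from the cusolverStatus_t enum in the cuSolver headers and should be checked against your installed version.

# Sketch: map raw cusolverStatus_t integers to readable names.
# Only the value 3 is confirmed by the documentation quote above; the
# remaining values are assumptions taken from the cuSolver headers.
CUSOLVER_STATUS_NAMES = {
    0: 'CUSOLVER_STATUS_SUCCESS',
    1: 'CUSOLVER_STATUS_NOT_INITIALIZED',
    2: 'CUSOLVER_STATUS_ALLOC_FAILED',
    3: 'CUSOLVER_STATUS_INVALID_VALUE',
    4: 'CUSOLVER_STATUS_ARCH_MISMATCH',
    6: 'CUSOLVER_STATUS_EXECUTION_FAILED',
    7: 'CUSOLVER_STATUS_INTERNAL_ERROR',
}

def cusolver_check_status(status):
    # Raise with a readable name instead of a bare integer status.
    if status != 0:
        raise RuntimeError(CUSOLVER_STATUS_NAMES.get(
            status, 'unknown cuSolver status %d' % status))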
libcusolver.cusolverDnSgetrs.restype = int
libcusolver.cusolverDnSgetrs.argtypes = [_types.handle,
                                         ctypes.c_char,
                                         ctypes.c_int,
                                         ctypes.c_int,
                                         ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_void_p,
                                         ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_void_p]
"""
handle is the handle pointer given by calling cusolverDnCreate() from cuSolver
LU is the LU factoriced matrix given by cublasSgetrfBatched() from scikits
P is the pivots matrix given by cublasSgetrfBatched()
B is the right hand matix from AX=B
"""
def cusolverSolveLU(handle, LU, P, B):
    rows_LU, cols_LU = LU.shape
    rows_B, cols_B = B.shape
    B_gpu = gpuarray.to_gpu(B.astype('float32'))
    info_gpu = gpuarray.zeros(1, np.int32)
    status = libcusolver.cusolverDnSgetrs(
        handle, 'n', rows_LU, cols_B,
        int(LU.gpudata), cols_LU,
        int(P.gpudata), int(B_gpu.gpudata),
        cols_B, int(info_gpu.gpudata))
    print info_gpu
    print status
handle = cusolverCreate()  # get the initialization of cusolver
LU, P = cublasLUFactorization(...)
B = np.asarray(np.random.rand(3, 3), np.float32)
cusolverSolveLU(handle, LU, P, B)
Output:
[0]
3
What am I doing wrong?
Here is a complete working example of how to use the library; the results are verified against those obtained with numpy's built-in solver:
import ctypes
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
CUSOLVER_STATUS_SUCCESS = 0
libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')
libcusolver.cusolverDnCreate.restype = int
libcusolver.cusolverDnCreate.argtypes = [ctypes.c_void_p]
def cusolverDnCreate():
    handle = ctypes.c_void_p()
    status = libcusolver.cusolverDnCreate(ctypes.byref(handle))
    if status != CUSOLVER_STATUS_SUCCESS:
        raise RuntimeError('error!')
    return handle.value

libcusolver.cusolverDnDestroy.restype = int
libcusolver.cusolverDnDestroy.argtypes = [ctypes.c_void_p]
def cusolverDnDestroy(handle):
    status = libcusolver.cusolverDnDestroy(handle)
    if status != CUSOLVER_STATUS_SUCCESS:
        raise RuntimeError('error!')
libcusolver.cusolverDnSgetrf_bufferSize.restype = int
libcusolver.cusolverDnSgetrf_bufferSize.argtypes = [ctypes.c_void_p,
                                                    ctypes.c_int,
                                                    ctypes.c_int,
                                                    ctypes.c_void_p,
                                                    ctypes.c_int,
                                                    ctypes.c_void_p]
def cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork):
    status = libcusolver.cusolverDnSgetrf_bufferSize(handle, m, n,
                                                     int(A.gpudata),
                                                     lda, ctypes.pointer(Lwork))
    if status != CUSOLVER_STATUS_SUCCESS:
        raise RuntimeError('error!')
libcusolver.cusolverDnSgetrf.restype = int
libcusolver.cusolverDnSgetrf.argtypes = [ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_int,
                                         ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_void_p,
                                         ctypes.c_void_p,
                                         ctypes.c_void_p]
def cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo):
    status = libcusolver.cusolverDnSgetrf(handle, m, n, int(A.gpudata),
                                          lda,
                                          int(Workspace.gpudata),
                                          int(devIpiv.gpudata),
                                          int(devInfo.gpudata))
    if status != CUSOLVER_STATUS_SUCCESS:
        raise RuntimeError('error!')

libcusolver.cusolverDnSgetrs.restype = int
libcusolver.cusolverDnSgetrs.argtypes = [ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_int,
                                         ctypes.c_int,
                                         ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_void_p,
                                         ctypes.c_void_p,
                                         ctypes.c_int,
                                         ctypes.c_void_p]
def cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo):
    status = libcusolver.cusolverDnSgetrs(handle, trans, n, nrhs,
                                          int(A.gpudata), lda,
                                          int(devIpiv.gpudata), int(B.gpudata),
                                          ldb, int(devInfo.gpudata))
    if status != CUSOLVER_STATUS_SUCCESS:
        raise RuntimeError('error!')
if __name__ == '__main__':
    m = 3
    n = 3

    # Copy transposed (i.e. column-major) versions of a and b to the GPU,
    # since cuSolver expects Fortran-ordered matrices.
    a = np.asarray(np.random.rand(m, n), np.float32)
    a_gpu = gpuarray.to_gpu(a.T.copy())
    lda = m
    b = np.asarray(np.random.rand(m, n), np.float32)
    b_gpu = gpuarray.to_gpu(b.T.copy())
    ldb = m

    handle = cusolverDnCreate()

    # Query the workspace size needed by the LU factorization.
    Lwork = ctypes.c_int()
    cusolverDnSgetrf_bufferSize(handle, m, n, a_gpu, lda, Lwork)
    Workspace = gpuarray.empty(Lwork.value, dtype=np.float32)
    devIpiv = gpuarray.zeros(min(m, n), dtype=np.int32)
    devInfo = gpuarray.zeros(1, dtype=np.int32)

    # LU-factorize a in place; the pivots are stored in devIpiv.
    cusolverDnSgetrf(handle, m, n, a_gpu, lda, Workspace, devIpiv, devInfo)
    if devInfo.get()[0] != 0:
        raise RuntimeError('error!')

    # Solve A*X = B using the LU factors; note that trans is passed as the
    # cublasOperation_t integer CUBLAS_OP_N, not as a character.
    CUBLAS_OP_N = 0
    nrhs = n
    devInfo = gpuarray.zeros(1, dtype=np.int32)
    cusolverDnSgetrs(handle, CUBLAS_OP_N, n, nrhs, a_gpu, lda, devIpiv, b_gpu, ldb, devInfo)
    x_cusolver = b_gpu.get().T

    cusolverDnDestroy(handle)

    # Verify against numpy's built-in solver.
    x_numpy = np.linalg.solve(a, b)
    print np.allclose(x_numpy, x_cusolver)
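One detail worth noting in the example above is the a.T.copy() / b.T.copy() step and the final transpose of the result: cuSolver, like LAPACK, expects matrices in column-major (Fortran) order, while numpy arrays are row-major by default. A minimal numpy-only sketch of what the transpose-and-copy achieves (the array values here are illustrative only):

import numpy as np

# A 2x3 row-major (C-ordered) matrix, as numpy creates it by default.
a = np.arange(6, dtype=np.float32).reshape(2, 3)

# In memory, C order stores rows contiguously:       0 1 2 3 4 5
# Column-major (Fortran) order, which cuSolver
# expects, stores columns contiguously:              0 3 1 4 2 5
# Transposing and copying a C-ordered array yields exactly that byte layout:
print np.all(a.T.copy().ravel() == np.asfortranarray(a).ravel(order='K'))
# True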