Функция getrs cuSolver над pycuda не работает должным образом
Я пытаюсь сделать оболочку Pycuda, вдохновленную библиотекой scikits-cuda, для некоторых операций, представленных в новой библиотеке cuSolver от Nvidia. Я хочу решить линейную систему вида AX=B с помощью факторизации LU, чтобы выполнить это сначала с использованием метода cublasSgetrfBatched от scikits-cuda, который дает мне факторизацию LU; затем с этой факторизацией я хочу решить систему, используя cusolverDnSgetrs из cuSolve, который я хочу обернуть, когда я выполняю вычисление, возвращающее состояние 3, матрицы, которые предлагают дать мне ответ, не меняются, НО * devInfo равен нулю, при просмотре документации ксольвера говорится:
CUSOLVER_STATUS_INVALID_VALUE = Неподдерживаемое значение или параметр были переданы в функцию (например, отрицательный размер вектора).
libcusolver.cusolverDnSgetrs.restype=int
libcusolver.cusolverDnSgetrs.argtypes=[_types.handle,
ctypes.c_char,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
"""
handle is the handle pointer given by calling cusolverDnCreate() from cuSolver
LU is the LU factoriced matrix given by cublasSgetrfBatched() from scikits
P is the pivots matrix given by cublasSgetrfBatched()
B is the right hand matix from AX=B
"""
def cusolverSolveLU(handle,LU,P,B):
rows_LU ,cols_LU = LU.shape
rows_B, cols_B = B.shape
B_gpu = gpuarray.to_gpu(B.astype('float32'))
info_gpu = gpuarray.zeros(1, np.int32)
status=libcusolver.cusolverDnSgetrs(
handle, 'n', rows_LU, cols_B,
int(LU.gpudata), cols_LU,
int(P.gpudata), int(B_gpu.gpudata),
cols_B, int(info_gpu.gpudata))
print info_gpu
print status
handle= cusolverCreate() #get the initialization of cusolver
LU, P = cublasLUFactorization(...)
B = np.asarray(np.random.rand(3, 3), np.float32)
cusolverSolveLU(handle,LU,P,B)
Выход:
[0]
3
Что я делаю не так?
1 ответ
Вот полный рабочий пример того, как использовать библиотеку; результат сравнивается с полученным с помощью встроенного решателя numpy:
import ctypes
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
CUSOLVER_STATUS_SUCCESS = 0
libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')
libcusolver.cusolverDnCreate.restype = int
libcusolver.cusolverDnCreate.argtypes = [ctypes.c_void_p]
def cusolverDnCreate():
handle = ctypes.c_void_p()
status = libcusolver.cusolverDnCreate(ctypes.byref(handle))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
return handle.value
libcusolver.cusolverDnDestroy.restype = int
libcusolver.cusolverDnDestroy.argtypes = [ctypes.c_void_p]
def cusolverDnDestroy(handle):
status = libcusolver.cusolverDnDestroy(handle)
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
libcusolver.cusolverDnSgetrf_bufferSize.restype = int
libcusolver.cusolverDnSgetrf_bufferSize.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
def cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork):
status = libcusolver.cusolverDnSgetrf_bufferSize(handle, m, n,
int(A.gpudata),
n, ctypes.pointer(Lwork))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
libcusolver.cusolverDnSgetrf.restype = int
libcusolver.cusolverDnSgetrf.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p]
def cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo):
status = libcusolver.cusolverDnSgetrf(handle, m, n, int(A.gpudata),
lda,
int(Workspace.gpudata),
int(devIpiv.gpudata),
int(devInfo.gpudata))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
libcusolver.cusolverDnSgetrs.restype = int
libcusolver.cusolverDnSgetrs.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
def cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo):
status = libcusolver.cusolverDnSgetrs(handle, trans, n, nrhs,
int(A.gpudata), lda,
int(devIpiv.gpudata), int(B.gpudata),
ldb, int(devInfo.gpudata))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
if __name__ == '__main__':
m = 3
n = 3
a = np.asarray(np.random.rand(m, n), np.float32)
a_gpu = gpuarray.to_gpu(a.T.copy())
lda = m
b = np.asarray(np.random.rand(m, n), np.float32)
b_gpu = gpuarray.to_gpu(b.T.copy())
ldb = m
handle = cusolverDnCreate()
Lwork = ctypes.c_int()
cusolverDnSgetrf_bufferSize(handle, m, n, a_gpu, lda, Lwork)
Workspace = gpuarray.empty(Lwork.value, dtype=np.float32)
devIpiv = gpuarray.zeros(min(m, n), dtype=np.int32)
devInfo = gpuarray.zeros(1, dtype=np.int32)
cusolverDnSgetrf(handle, m, n, a_gpu, lda, Workspace, devIpiv, devInfo)
if devInfo.get()[0] != 0:
raise RuntimeError('error!')
CUBLAS_OP_N = 0
nrhs = n
devInfo = gpuarray.zeros(1, dtype=np.int32)
cusolverDnSgetrs(handle, CUBLAS_OP_N, n, nrhs, a_gpu, lda, devIpiv, b_gpu, ldb, devInfo)
x_cusolver = b_gpu.get().T
cusolverDnDestroy(handle)
x_numpy = np.linalg.solve(a, b)
print np.allclose(x_numpy, x_cusolver)