如何从传递的数组中正确构造 CUSP coo 矩阵
How to properly construct CUSP coo matrix from passed arrays
我正在尝试将 CUSP 集成到现有的 Fortran 代码中。现在我只是在尝试为 thrust / CUSP 进行基本设置,以从 Fortran 中输入数组并使用它们构建一个 CUSP 矩阵(现在是 coo 格式)。
由于这个线程,我已经能够得到一个像 C 例程这样的包装器来编译成一个库,并且 link 它与 Fortran 代码一起使用:unresolved-references-using-ifort-with-nvcc-and-cusp
而且我可以验证 Fortran 是否正确地输入了数组指针,这要归功于前一个线程的帮助:
不幸的是,我仍然无法让 CUSP 使用这些来生成和打印矩阵。
代码和输出如下所示:
输出
$ ./fort_cusp_test
testing 1 2 3
n: 3, nnz: 9
i, row_i, col_j, val_v
0, 1, 1, 1.0000e+00
1, 1, 2, 2.0000e+00
2, 1, 3, 3.0000e+00
3, 2, 1, 4.0000e+00
4, 2, 2, 5.0000e+00
5, 2, 3, 6.0000e+00
6, 3, 1, 7.0000e+00
7, 3, 2, 8.0000e+00
8, 3, 3, 9.0000e+00
initialized row_i into thrust
initialized col_j into thrust
initialized val_v into thrust
defined CUSP integer array view for row_i and col_j
defined CUSP float array view for val_v
loaded row_i into a CUSP integer array view
loaded col_j into a CUSP integer array view
loaded val_v into a CUSP float array view
defined CUSP coo_matrix view
Built matrix A from CUSP device views
sparse matrix <3, 3> with 9 entries
libc++abi.dylib: terminating with uncaught exception of type thrust::system::system_error: invalid argument
Program received signal SIGABRT: Process abort signal.
Backtrace for this error:
#0 0x10d0fdff6
#1 0x10d0fd593
#2 0x7fff8593ff19
Abort trap: 6
fort_cusp_test.f90
program fort_cuda_test
implicit none
! interface
! subroutine test_coo_mat_print_(row_i,col_j,val_v,n,nnz) bind(C)
! use, intrinsic :: ISO_C_BINDING, ONLY: C_INT,C_FLOAT
! implicit none
! integer(C_INT),value :: n, nnz
! integer(C_INT) :: row_i(:), col_j(:)
! real(C_FLOAT) :: val_v(:)
! end subroutine test_coo_mat_print_
! end interface
integer*4 n
integer*4 nnz
integer*4, target :: rowI(9),colJ(9)
real*4, target :: valV(9)
integer*4, pointer :: row_i(:)
integer*4, pointer :: col_j(:)
real*4, pointer :: val_v(:)
n = 3
nnz = 9
rowI = (/ 1, 1, 1, 2, 2, 2, 3, 3, 3/)
colJ = (/ 1, 2, 3, 1, 2, 3, 1, 2, 3/)
valV = (/ 1, 2, 3, 4, 5, 6, 7, 8, 9/)
row_i => rowI
col_j => colJ
val_v => valV
write(*,*) "testing 1 2 3"
call test_coo_mat_print (rowI,colJ,valV,n,nnz)
end program fort_cuda_test
cusp_runner.cu
#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>
#if defined(__cplusplus)
extern "C" {
#endif
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {
int n, nnz;
n = *N;
nnz = *NNZ;
printf("n: %d, nnz: %d\n",n,nnz);
printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
for(int i=0;i<n;i++) {
printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
}
//if ( false ) {
//wrap raw input pointers with thrust::device_ptr
thrust::device_ptr<int> wrapped_device_I(row_i);
printf("initialized row_i into thrust\n");
thrust::device_ptr<int> wrapped_device_J(col_j);
printf("initialized col_j into thrust\n");
thrust::device_ptr<float> wrapped_device_V(val_v);
printf("initialized val_v into thrust\n");
//use array1d_view to wrap individual arrays
typedef typename cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
printf("defined CUSP integer array view for row_i and col_j\n");
typedef typename cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView;
printf("defined CUSP float array view for val_v\n");
DeviceIndexArrayView row_indices(wrapped_device_I, wrapped_device_I + nnz);
printf("loaded row_i into a CUSP integer array view\n");
DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + nnz);
printf("loaded col_j into a CUSP integer array view\n");
DeviceValueArrayView values(wrapped_device_V, wrapped_device_V + nnz);
printf("loaded val_v into a CUSP float array view\n");
//combine array1d_views into coo_matrix_view
typedef cusp::coo_matrix_view<DeviceIndexArrayView,DeviceIndexArrayView,DeviceValueArrayView> DeviceView;
printf("defined CUSP coo_matrix view\n");
//construct coo_matrix_view from array1d_views
DeviceView A(n,n,nnz,row_indices,column_indices,values);
printf("Built matrix A from CUSP device views\n");
cusp::print(A);
printf("Printed matrix A\n");
//}
}
#if defined(__cplusplus)
}
#endif
生成文件
Test:
nvcc -Xcompiler="-fPIC" -shared cusp_runner.cu -o cusp_runner.so -I/Developer/NVIDIA/CUDA-6.5/include/cusp
gfortran -c fort_cusp_test.f90
gfortran fort_cusp_test.o cusp_runner.so -L/Developer/NVIDIA/CUDA-6.5/lib -lcudart -o fort_cusp_test
clean:
rm *.o *.so fort_cusp_test
cusp_runner.cu的功能版本:
#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>
#if defined(__cplusplus)
extern "C" {
#endif
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {
int n, nnz;
n = *N;
nnz = *NNZ;
printf("n: %d, nnz: %d\n",n,nnz);
printf("printing input (row_i, col_j, val_v)\n");
printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
for(int i=0;i<nnz;i++) {
printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
}
printf("initializing thrust device vectors\n");
thrust::device_vector<int> device_I(row_i,row_i+nnz);
printf("device_I initialized\n");
thrust::device_vector<int> device_J(col_j,col_j+nnz);
printf("device_J initialized\n");
thrust::device_vector<float> device_V(val_v,val_v+nnz);
printf("device_V initialized\n");
cusp::coo_matrix<int, float, cusp::device_memory> A(n,n,nnz);
printf("initialized empty CUSP coo_matrix on device\n");
A.row_indices = device_I;
printf("loaded device_I into A.row_indices\n");
A.column_indices = device_J;
printf("loaded device_J into A.column_indices\n");
A.values = device_V;
printf("loaded device_V into A.values\n");
cusp::print(A);
printf("Printed matrix A\n");
//}
}
#if defined(__cplusplus)
}
#endif
您的 thrust/CUSP 处理指针的辅助代码完全不正确。这个:
thrust::device_ptr<int> wrapped_device_I(row_i);
并不像您认为的那样。您有效地完成的是将主机地址转换为设备地址。除非您使用 CUDA 托管内存,否则这是非法的,而且我在这段代码中看不到任何证据。您要做的是在开始之前分配内存并将 Fortran 数组复制到 GPU。做类似的事情:
thrust::device_ptr<int> wrapped_device_I = thrust::device_malloc<int>(nnz);
thrust::copy(row_i, row_i + nnz, wrapped_device_I);
[免责声明:完全未经测试,使用风险自负]
对于每个 COO 向量。但是,我建议将 test_coo_mat_print_
的 GPU 设置部分中的大部分代码替换为 thrust::vector
实例。除了更易于使用之外,当内存超出范围时,您还可以释放内存,从而大大降低工程内存泄漏的可能性。所以像:
thrust::device_vector<int> device_I(row_i, row_i + nnz);
一个电话搞定一切。
作为最后的提示,如果您正在开发多语言代码库,请将它们设计为使每种语言的代码完全独立并具有自己的本机测试代码。如果您在这种情况下这样做,您会发现 C++ 部分无法独立于您遇到的任何 Fortran 问题工作。它会使调试变得更加简单。
我正在尝试将 CUSP 集成到现有的 Fortran 代码中。现在我只是在尝试为 thrust / CUSP 进行基本设置,以从 Fortran 中输入数组并使用它们构建一个 CUSP 矩阵(现在是 coo 格式)。 由于这个线程,我已经能够得到一个像 C 例程这样的包装器来编译成一个库,并且 link 它与 Fortran 代码一起使用:unresolved-references-using-ifort-with-nvcc-and-cusp
而且我可以验证 Fortran 是否正确地输入了数组指针,这要归功于前一个线程的帮助:
不幸的是,我仍然无法让 CUSP 使用这些来生成和打印矩阵。 代码和输出如下所示:
输出
$ ./fort_cusp_test
testing 1 2 3
n: 3, nnz: 9
i, row_i, col_j, val_v
0, 1, 1, 1.0000e+00
1, 1, 2, 2.0000e+00
2, 1, 3, 3.0000e+00
3, 2, 1, 4.0000e+00
4, 2, 2, 5.0000e+00
5, 2, 3, 6.0000e+00
6, 3, 1, 7.0000e+00
7, 3, 2, 8.0000e+00
8, 3, 3, 9.0000e+00
initialized row_i into thrust
initialized col_j into thrust
initialized val_v into thrust
defined CUSP integer array view for row_i and col_j
defined CUSP float array view for val_v
loaded row_i into a CUSP integer array view
loaded col_j into a CUSP integer array view
loaded val_v into a CUSP float array view
defined CUSP coo_matrix view
Built matrix A from CUSP device views
sparse matrix <3, 3> with 9 entries
libc++abi.dylib: terminating with uncaught exception of type thrust::system::system_error: invalid argument
Program received signal SIGABRT: Process abort signal.
Backtrace for this error:
#0 0x10d0fdff6
#1 0x10d0fd593
#2 0x7fff8593ff19
Abort trap: 6
fort_cusp_test.f90
program fort_cuda_test
implicit none
! interface
! subroutine test_coo_mat_print_(row_i,col_j,val_v,n,nnz) bind(C)
! use, intrinsic :: ISO_C_BINDING, ONLY: C_INT,C_FLOAT
! implicit none
! integer(C_INT),value :: n, nnz
! integer(C_INT) :: row_i(:), col_j(:)
! real(C_FLOAT) :: val_v(:)
! end subroutine test_coo_mat_print_
! end interface
integer*4 n
integer*4 nnz
integer*4, target :: rowI(9),colJ(9)
real*4, target :: valV(9)
integer*4, pointer :: row_i(:)
integer*4, pointer :: col_j(:)
real*4, pointer :: val_v(:)
n = 3
nnz = 9
rowI = (/ 1, 1, 1, 2, 2, 2, 3, 3, 3/)
colJ = (/ 1, 2, 3, 1, 2, 3, 1, 2, 3/)
valV = (/ 1, 2, 3, 4, 5, 6, 7, 8, 9/)
row_i => rowI
col_j => colJ
val_v => valV
write(*,*) "testing 1 2 3"
call test_coo_mat_print (rowI,colJ,valV,n,nnz)
end program fort_cuda_test
cusp_runner.cu
#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>
#if defined(__cplusplus)
extern "C" {
#endif
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {
int n, nnz;
n = *N;
nnz = *NNZ;
printf("n: %d, nnz: %d\n",n,nnz);
printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
for(int i=0;i<n;i++) {
printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
}
//if ( false ) {
//wrap raw input pointers with thrust::device_ptr
thrust::device_ptr<int> wrapped_device_I(row_i);
printf("initialized row_i into thrust\n");
thrust::device_ptr<int> wrapped_device_J(col_j);
printf("initialized col_j into thrust\n");
thrust::device_ptr<float> wrapped_device_V(val_v);
printf("initialized val_v into thrust\n");
//use array1d_view to wrap individual arrays
typedef typename cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
printf("defined CUSP integer array view for row_i and col_j\n");
typedef typename cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView;
printf("defined CUSP float array view for val_v\n");
DeviceIndexArrayView row_indices(wrapped_device_I, wrapped_device_I + nnz);
printf("loaded row_i into a CUSP integer array view\n");
DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + nnz);
printf("loaded col_j into a CUSP integer array view\n");
DeviceValueArrayView values(wrapped_device_V, wrapped_device_V + nnz);
printf("loaded val_v into a CUSP float array view\n");
//combine array1d_views into coo_matrix_view
typedef cusp::coo_matrix_view<DeviceIndexArrayView,DeviceIndexArrayView,DeviceValueArrayView> DeviceView;
printf("defined CUSP coo_matrix view\n");
//construct coo_matrix_view from array1d_views
DeviceView A(n,n,nnz,row_indices,column_indices,values);
printf("Built matrix A from CUSP device views\n");
cusp::print(A);
printf("Printed matrix A\n");
//}
}
#if defined(__cplusplus)
}
#endif
生成文件
Test:
nvcc -Xcompiler="-fPIC" -shared cusp_runner.cu -o cusp_runner.so -I/Developer/NVIDIA/CUDA-6.5/include/cusp
gfortran -c fort_cusp_test.f90
gfortran fort_cusp_test.o cusp_runner.so -L/Developer/NVIDIA/CUDA-6.5/lib -lcudart -o fort_cusp_test
clean:
rm *.o *.so fort_cusp_test
cusp_runner.cu的功能版本:
#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>
#if defined(__cplusplus)
extern "C" {
#endif
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {
int n, nnz;
n = *N;
nnz = *NNZ;
printf("n: %d, nnz: %d\n",n,nnz);
printf("printing input (row_i, col_j, val_v)\n");
printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
for(int i=0;i<nnz;i++) {
printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
}
printf("initializing thrust device vectors\n");
thrust::device_vector<int> device_I(row_i,row_i+nnz);
printf("device_I initialized\n");
thrust::device_vector<int> device_J(col_j,col_j+nnz);
printf("device_J initialized\n");
thrust::device_vector<float> device_V(val_v,val_v+nnz);
printf("device_V initialized\n");
cusp::coo_matrix<int, float, cusp::device_memory> A(n,n,nnz);
printf("initialized empty CUSP coo_matrix on device\n");
A.row_indices = device_I;
printf("loaded device_I into A.row_indices\n");
A.column_indices = device_J;
printf("loaded device_J into A.column_indices\n");
A.values = device_V;
printf("loaded device_V into A.values\n");
cusp::print(A);
printf("Printed matrix A\n");
//}
}
#if defined(__cplusplus)
}
#endif
您的 thrust/CUSP 处理指针的辅助代码完全不正确。这个:
thrust::device_ptr<int> wrapped_device_I(row_i);
并不像您认为的那样。您有效地完成的是将主机地址转换为设备地址。除非您使用 CUDA 托管内存,否则这是非法的,而且我在这段代码中看不到任何证据。您要做的是在开始之前分配内存并将 Fortran 数组复制到 GPU。做类似的事情:
thrust::device_ptr<int> wrapped_device_I = thrust::device_malloc<int>(nnz);
thrust::copy(row_i, row_i + nnz, wrapped_device_I);
[免责声明:完全未经测试,使用风险自负]
对于每个 COO 向量。但是,我建议将 test_coo_mat_print_
的 GPU 设置部分中的大部分代码替换为 thrust::vector
实例。除了更易于使用之外,当内存超出范围时,您还可以释放内存,从而大大降低工程内存泄漏的可能性。所以像:
thrust::device_vector<int> device_I(row_i, row_i + nnz);
一个电话搞定一切。
作为最后的提示,如果您正在开发多语言代码库,请将它们设计为使每种语言的代码完全独立并具有自己的本机测试代码。如果您在这种情况下这样做,您会发现 C++ 部分无法独立于您遇到的任何 Fortran 问题工作。它会使调试变得更加简单。