I am trying to integrate CUSP into an existing Fortran code. Right now I am just trying to get the basic set up for thrust / CUSP to feed in arrays from Fortran and use them to construct a CUSP matrix (coo format right now). I have been able to get a wrapper like C routine to compile into a library and link it with the Fortran code thanks to this thread: unresolved-references-using-ifort-with-nvcc-and-cusp
And I can verify that Fortran is correctly feeding in array pointers thanks to help from a previous thread: Generating CUSP coo_matrix from passed FORTRAN arrays
Unfortunately, I still can not get CUSP to use these to generate and print a matrix. The code and output are shown below:
output
$ ./fort_cusp_test
testing 1 2 3
n: 3, nnz: 9
i, row_i, col_j, val_v
0, 1, 1, 1.0000e+00
1, 1, 2, 2.0000e+00
2, 1, 3, 3.0000e+00
3, 2, 1, 4.0000e+00
4, 2, 2, 5.0000e+00
5, 2, 3, 6.0000e+00
6, 3, 1, 7.0000e+00
7, 3, 2, 8.0000e+00
8, 3, 3, 9.0000e+00
initialized row_i into thrust
initialized col_j into thrust
initialized val_v into thrust
defined CUSP integer array view for row_i and col_j
defined CUSP float array view for val_v
loaded row_i into a CUSP integer array view
loaded col_j into a CUSP integer array view
loaded val_v into a CUSP float array view
defined CUSP coo_matrix view
Built matrix A from CUSP device views
sparse matrix <3, 3> with 9 entries
libc++abi.dylib: terminating with uncaught exception of type thrust::system::system_error: invalid argument
Program received signal SIGABRT: Process abort signal.
Backtrace for this error:
#0 0x10d0fdff6
#1 0x10d0fd593
#2 0x7fff8593ff19
Abort trap: 6
fort_cusp_test.f90
program fort_cuda_test
implicit none
! interface
! subroutine test_coo_mat_print_(row_i,col_j,val_v,n,nnz) bind(C)
! use, intrinsic :: ISO_C_BINDING, ONLY: C_INT,C_FLOAT
! implicit none
! integer(C_INT),value :: n, nnz
! integer(C_INT) :: row_i(:), col_j(:)
! real(C_FLOAT) :: val_v(:)
! end subroutine test_coo_mat_print_
! end interface
integer*4 n
integer*4 nnz
integer*4, target :: rowI(9),colJ(9)
real*4, target :: valV(9)
integer*4, pointer :: row_i(:)
integer*4, pointer :: col_j(:)
real*4, pointer :: val_v(:)
n = 3
nnz = 9
rowI = (/ 1, 1, 1, 2, 2, 2, 3, 3, 3/)
colJ = (/ 1, 2, 3, 1, 2, 3, 1, 2, 3/)
valV = (/ 1, 2, 3, 4, 5, 6, 7, 8, 9/)
row_i => rowI
col_j => colJ
val_v => valV
write(*,*) "testing 1 2 3"
call test_coo_mat_print (rowI,colJ,valV,n,nnz)
end program fort_cuda_test
cusp_runner.cu
#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>
#if defined(__cplusplus)
extern "C" {
#endif
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {
int n, nnz;
n = *N;
nnz = *NNZ;
printf("n: %d, nnz: %d\n",n,nnz);
printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
for(int i=0;i<n;i++) {
printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
}
//if ( false ) {
//wrap raw input pointers with thrust::device_ptr
thrust::device_ptr<int> wrapped_device_I(row_i);
printf("initialized row_i into thrust\n");
thrust::device_ptr<int> wrapped_device_J(col_j);
printf("initialized col_j into thrust\n");
thrust::device_ptr<float> wrapped_device_V(val_v);
printf("initialized val_v into thrust\n");
//use array1d_view to wrap individual arrays
typedef typename cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
printf("defined CUSP integer array view for row_i and col_j\n");
typedef typename cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView;
printf("defined CUSP float array view for val_v\n");
DeviceIndexArrayView row_indices(wrapped_device_I, wrapped_device_I + nnz);
printf("loaded row_i into a CUSP integer array view\n");
DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + nnz);
printf("loaded col_j into a CUSP integer array view\n");
DeviceValueArrayView values(wrapped_device_V, wrapped_device_V + nnz);
printf("loaded val_v into a CUSP float array view\n");
//combine array1d_views into coo_matrix_view
typedef cusp::coo_matrix_view<DeviceIndexArrayView,DeviceIndexArrayView,DeviceValueArrayView> DeviceView;
printf("defined CUSP coo_matrix view\n");
//construct coo_matrix_view from array1d_views
DeviceView A(n,n,nnz,row_indices,column_indices,values);
printf("Built matrix A from CUSP device views\n");
cusp::print(A);
printf("Printed matrix A\n");
//}
}
#if defined(__cplusplus)
}
#endif
Makefile
Test:
nvcc -Xcompiler="-fPIC" -shared cusp_runner.cu -o cusp_runner.so -I/Developer/NVIDIA/CUDA-6.5/include/cusp
gfortran -c fort_cusp_test.f90
gfortran fort_cusp_test.o cusp_runner.so -L/Developer/NVIDIA/CUDA-6.5/lib -lcudart -o fort_cusp_test
clean:
rm *.o *.so fort_cusp_test
Functional version of cusp_runner.cu:
#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>
#if defined(__cplusplus)
extern "C" {
#endif
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {
int n, nnz;
n = *N;
nnz = *NNZ;
printf("n: %d, nnz: %d\n",n,nnz);
printf("printing input (row_i, col_j, val_v)\n");
printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
for(int i=0;i<nnz;i++) {
printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
}
printf("initializing thrust device vectors\n");
thrust::device_vector<int> device_I(row_i,row_i+nnz);
printf("device_I initialized\n");
thrust::device_vector<int> device_J(col_j,col_j+nnz);
printf("device_J initialized\n");
thrust::device_vector<float> device_V(val_v,val_v+nnz);
printf("device_V initialized\n");
cusp::coo_matrix<int, float, cusp::device_memory> A(n,n,nnz);
printf("initialized empty CUSP coo_matrix on device\n");
A.row_indices = device_I;
printf("loaded device_I into A.row_indices\n");
A.column_indices = device_J;
printf("loaded device_J into A.column_indices\n");
A.values = device_V;
printf("loaded device_V into A.values\n");
cusp::print(A);
printf("Printed matrix A\n");
//}
}
#if defined(__cplusplus)
}
#endif
Your thrust/CUSP side code for handling pointers is totally incorrect. This:
doesn't do what you think it does. What you have effectively done is cast a host address into a device address. That is illegal unless you are working with CUDA managed memory, and I see no evidence of that in this code. What you want to do is allocate memory and copy the Fortran arrays to the GPU before starting. Do something like:
[disclaimer: totally untested, use at own risk]
For each of the COO vectors. I would, however, suggest replacing most of the code in the GPU setup section of
test_coo_mat_print_
withthrust::vector
instances instead. Apart from being easier to use, you get free memory deallocation when they fall out of scope so that there is far less possibility of engineering memory leaks. So something like:takes care of everything in a single call.
As a final tip, if you are developing multi-language codebases, design them such that the code in each language is fully independent and has its own native test code. If you had done that in this case, you would have discovered that the C++ part was not working independently of whatever Fortran problems you were having. It would have made debugging much simpler.