Below is a simple program, taken from the MAGMA documentation, that inverts a large matrix on a single GPU card.
#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
int main() {
//double *d_lA[MagmaMaxGPUs];
magma_init (); // initialize Magma
magma_queue_t queue=NULL;
magma_int_t dev=0;
magma_queue_create(dev ,&queue );
double gpu_time , *dwork; // dwork - workspace
magma_int_t ldwork; // size of dwork
magma_int_t *piv , info; // piv - array of indices of inter -
magma_int_t m = 20000; // changed rows; a - mxm matrix
magma_int_t mm=m*m; // size of a, r, c
double *a; // a- mxm matrix on the host
double *d_a; // d_a - mxm matrix a on the device
double *d_r; // d_r - mxm matrix r on the device
double *d_c; // d_c - mxm matrix c on the device
magma_int_t ione = 1;
magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
magma_int_t err;
const double alpha = 1.0; // alpha =1
const double beta = 0.0; // beta=0
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
// allocate matrices
err = magma_dmalloc_cpu( &a , mm ); // host memory for a
err = magma_dmalloc( &d_a , mm ); // device memory for a
err = magma_dmalloc( &d_r , mm ); // device memory for r
err = magma_dmalloc( &d_c , mm ); // device memory for c
err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
// generate random matrix a // for piv
lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
printf("here1\n");
magma_dsetmatrix( m, m, a, m, d_a , m, queue); // copy a -> d_a
magmablas_dlacpy(MagmaFull , m, m, d_a , m, d_r ,m,queue);//d_a ->d_r
// find the inverse matrix: d_a*X=I using the LU factorization
// with partial pivoting and row interchanges computed by
// magma_dgetrf_gpu; row i is interchanged with row piv(i);
// d_a -mxm matrix; d_a is overwritten by the inverse
gpu_time = magma_sync_wtime(NULL);
printf("here2\n");
//magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
magma_dgetrf_gpu(m, m, d_a, m, piv, &info);
magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
printf("here3\n");
gpu_time = magma_sync_wtime(NULL)-gpu_time;
magma_dgemm(MagmaNoTrans ,MagmaNoTrans ,m,m,m,alpha ,d_a ,m,
d_r ,m,beta ,d_c ,m,queue); // multiply a^-1*a
printf("magma_dgetrf_gpu + magma_dgetri_gpu time: %7.5f sec.\
\n",gpu_time );
magma_dgetmatrix( m, m, d_c , m, a, m, queue); // copy d_c ->a
printf("upper left corner of a^-1*a:\n");
magma_dprint( 4, 4, a, m ); // part of a^-1*a
free(a); // free host memory
free(piv); // free host memory
magma_free(d_a); // free device memory
magma_free(d_r); // free device memory
magma_free(d_c); // free device memory
magma_queue_destroy(queue); // destroy queue
magma_finalize (); // finalize Magma
return 0;
}
Everything works fine, and the runtime is as short as expected.
If you look at the code above, you will see that I have commented out two lines:
//double *d_lA[MagmaMaxGPUs];
and
//magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
Uncommenting these two lines and replacing magma_dgetrf_gpu(m, m, d_a, m, piv, &info);
with magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
is meant to distribute the computation across two GPU cards (two RTX A6000 cards).
I have also set MAGMA_NUM_GPUS=2,
as described in the MAGMA documentation.
This gives the following code:
#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
int main() {
double *d_lA[MagmaMaxGPUs];
magma_init (); // initialize Magma
magma_queue_t queue=NULL;
magma_int_t dev=0;
magma_queue_create(dev ,&queue );
double gpu_time , *dwork; // dwork - workspace
magma_int_t ldwork; // size of dwork
magma_int_t *piv , info; // piv - array of indices of inter -
magma_int_t m = 20000; // changed rows; a - mxm matrix
magma_int_t mm=m*m; // size of a, r, c
double *a; // a- mxm matrix on the host
double *d_a; // d_a - mxm matrix a on the device
double *d_r; // d_r - mxm matrix r on the device
double *d_c; // d_c - mxm matrix c on the device
magma_int_t ione = 1;
magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
magma_int_t err;
const double alpha = 1.0; // alpha =1
const double beta = 0.0; // beta=0
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
// allocate matrices
err = magma_dmalloc_cpu( &a , mm ); // host memory for a
err = magma_dmalloc( &d_a , mm ); // device memory for a
err = magma_dmalloc( &d_r , mm ); // device memory for r
err = magma_dmalloc( &d_c , mm ); // device memory for c
err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
// generate random matrix a // for piv
lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
printf("here1\n");
magma_dsetmatrix( m, m, a, m, d_a , m, queue); // copy a -> d_a
magmablas_dlacpy(MagmaFull , m, m, d_a , m, d_r ,m,queue);//d_a ->d_r
// find the inverse matrix: d_a*X=I using the LU factorization
// with partial pivoting and row interchanges computed by
// magma_dgetrf_gpu; row i is interchanged with row piv(i);
// d_a -mxm matrix; d_a is overwritten by the inverse
gpu_time = magma_sync_wtime(NULL);
printf("here2\n");
magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
printf("here3\n");
gpu_time = magma_sync_wtime(NULL)-gpu_time;
magma_dgemm(MagmaNoTrans ,MagmaNoTrans ,m,m,m,alpha ,d_a ,m,
d_r ,m,beta ,d_c ,m,queue); // multiply a^-1*a
printf("magma_dgetrf_gpu + magma_dgetri_gpu time: %7.5f sec.\
\n",gpu_time );
magma_dgetmatrix( m, m, d_c , m, a, m, queue); // copy d_c ->a
printf("upper left corner of a^-1*a:\n");
magma_dprint( 4, 4, a, m ); // part of a^-1*a
free(a); // free host memory
free(piv); // free host memory
magma_free(d_a); // free device memory
magma_free(d_r); // free device memory
magma_free(d_c); // free device memory
magma_queue_destroy(queue); // destroy queue
magma_finalize (); // finalize Magma
return 0;
}
At the beginning of the execution both GPUs seem to be running, but I quickly get the following errors:
$ ./main_magma_double_example.exe
here1
here2
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:183
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:183
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:192
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:193
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:192
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:193
here3
CUDA runtime error: an illegal memory access was encountered (700) in magma_sync_wtime at /home/fab/magma-2.6.1/control/magma_timer.cpp:98
magma_dgetrf_gpu + magma_dgetri_gpu time: 1.07300 sec.
CUBLAS error: memory mapping error (11) in main at example_double_MAGMA_NVIDIA.cpp:57
upper left corner of a^-1*a:
[
0.1206 0.4128 0.9920 0.4738
0.6438 0.1080 0.1855 0.9998
0.0623 0.0777 0.2275 0.1513
0.4903 0.1876 0.8492 0.3984
];
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:62
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:63
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:64
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:65
We can see that the inversion is wrong: the printed block of a^-1*a is not the identity matrix. I don't know where these error messages come from.
Compilation done by Makefile.inc :
# Build settings for the MAGMA example (included by the Makefile).

# Compiler driver and optimization level.
CXX = nvcc -O3
# Directory searched for the MKL libraries at link time (used via -L below).
LAPACK = /opt/intel/oneapi/mkl/latest/lib/intel64
# MAGMA installation prefix; its include/ and lib/ subdirectories are used below.
MAGMA = /usr/local/magma
# CUDA header and library directories.
INCLUDE_CUDA=/usr/local/cuda/include
LIBCUDA=/usr/local/cuda/lib64
# Compile-only flags (-c) plus the MAGMA and CUDA include paths.
# NOTE(review): -lpthread is a linker option and presumably has no effect on a -c step - confirm.
CXXFLAGS = -c -I${MAGMA}/include -I${INCLUDE_CUDA} -lpthread
# Link flags: MKL interface library, CUDA driver/runtime/cuBLAS, MAGMA, pthread.
# NOTE(review): only mkl_intel_lp64 is listed; MKL normally also needs a threading
# layer and mkl_core unless MAGMA already pulls them in - verify against the MKL link advisor.
LDFLAGS = -L${LAPACK} -lmkl_intel_lp64 -L${LIBCUDA} -lcuda -lcudart -lcublas -L${MAGMA}/lib -lmagma -lpthread
# Single source file and the name of the resulting executable.
SOURCES = example_double_MAGMA_NVIDIA.cpp
EXECUTABLE = main_magma_double_example.exe
What can I try in order to use both GPU cards at the same time?
PS: I should mention that the two GPU cards are connected by an NVLink bridge.
Update
Starting from the example in the MAGMA documentation that uses magma_dgetrf_mgpu, I tried to adapt it to compute the inverse of the matrix a:
#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
#define min(a,b) (((a)<(b))?(a):(b))
int main( int argc , char** argv)
{
magma_init (); // initialize Magma
int num_gpus = 2;
magma_setdevice (0);
magma_queue_t queues[num_gpus ];
for( int dev = 0; dev < num_gpus; ++dev ) {
magma_queue_create( dev , &queues[dev] );
}
magma_int_t err;
real_Double_t cpu_time ,gpu_time;
magma_int_t m = 8192, n = 8192; // a,r - mxn matrices
magma_int_t mm = m*n;
magma_int_t nrhs =100; // b - nxnrhs , c - mxnrhs matrices
magma_int_t *ipiv; // array of indices of interchanged rows
magma_int_t n2=m*n; // size of a,r
magma_int_t nnrhs=n*nrhs; // size of b
magma_int_t mnrhs=m*nrhs; // size of c
double *a, *r; // a,r - mxn matrices on the host
double *b, *c;// b - nxnrhs , c - mxnrhs matrices on the host
double *dwork; // dwork - workspace
magmaDouble_ptr d_la[num_gpus ];
double alpha =1.0, beta =0.0; // alpha=1,beta=0
magma_int_t ldwork; // size of dwork
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
//4.3 LU decomposition and solving general linear systems 282
magma_int_t n_local;
magma_int_t ione = 1, info;
magma_int_t i, min_mn=min(m,n), nb;
magma_int_t ldn_local;// mxldn_local - size of the part of a
magma_int_t ISEED [4] = {0,0,0,1}; // on i-th device
nb =magma_get_dgetrf_nb(m,n); // optim.block size for dgetrf
// allocate memory on cpu
ipiv=( magma_int_t *) malloc(min_mn*sizeof(magma_int_t ));
// host memory for ipiv
err = magma_dmalloc_cpu (&a,n2); // host memory for a
err = magma_dmalloc_pinned (&r,n2); // host memory for r
err = magma_dmalloc_pinned (&b,nnrhs); // host memory for b
err = magma_dmalloc_pinned (&c,mnrhs); // host memory for c
// allocate device memory on num_gpus devices
for(i=0; i<num_gpus; i++){
n_local = ((n/nb)/ num_gpus )*nb;
if (i < (n/nb)% num_gpus)
n_local += nb;
else if (i == (n/nb)% num_gpus)
n_local += n%nb;
ldn_local = (( n_local +31)/32)*32;
magma_setdevice(i);
err = magma_dmalloc (&d_la[i],m*ldn_local ); // device memory
} // on i-th device
magma_setdevice (0);
lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
// copy the corresponding parts of the matrix r to num_gpus
magma_dsetmatrix_1D_col_bcyclic( num_gpus , m, n, nb , a, m, d_la , m, queues );
// MAGMA
// LU decomposition on num_gpus devices with partial pivoting
// and row interchanges , row i is interchanged with row ipiv(i)
gpu_time = magma_sync_wtime(NULL);
magma_dgetrf_mgpu( num_gpus, m, n, d_la, m, ipiv, &info);
magma_dgetri_gpu(m, a, m, ipiv, dwork, ldwork, &info);
gpu_time = magma_sync_wtime(NULL)-gpu_time;
printf("magma_dgetrf_mgpu time: %7.5f sec.\n",gpu_time );
// print part of the solution from dgetrf_mgpu and dgetrs
printf("upper left corner of a^-1*a:\n");
magma_dprint( 4, 4, a, m); // magma_dgetrf_mgpu + dgetrs
free(ipiv); // free host memory
free(a); // free host memory
magma_free_pinned(r); // free host memory
magma_free_pinned(b); // free host memory
magma_free_pinned(c); // free host memory
for(i=0; i<num_gpus; i++){
magma_free(d_la[i] ); // free device memory
}
for( int dev = 0; dev < num_gpus; ++dev ) {
magma_queue_destroy( queues[dev] );
}
magma_finalize ();
}
As before, it compiles fine, but I get the following errors at execution:
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:173
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:163
...
I think I have initialized the variable d_la correctly,
but there still seem to be coding errors.
What can I try next?