I am trying to run the following code solving a linear system using the Jacobi algorithm. The goal is to use openMP to accelerate the algorithm. When I run the code with more than one thread I get a bus error.
Here the C - openMP code:
////////////////////////////////////////////////////////////////////////////////
// jacobi.c --- TP2 : resolution d'un systeme lineaire par la methode de jacobi
//
// Auteur : Jeremie Gaidamour (CNRS/IDRIS) <[email protected]>
// Cr�� le : Tue Jul 09 17:33:23 2013
// Dern. mod. par : Jeremie Gaidamour (CNRS/IDRIS) <[email protected]>
// Dern. mod. le : Mon Aug 26 14:57:55 2013
////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
// Dimension par defaut de la taille des matrices
#ifndef VAL_N
#define VAL_N 1201
#endif
#ifndef VAL_D
#define VAL_D 800
#endif
// Initialisation aleatoire d'un tableau
void random_number(double* array, int size) {
for(int i=0; i<size; i++) {
// Generation d'un nombre dans l'intervalle [0, 1[
double r = (double)rand() / (double)(RAND_MAX - 1);
array[i] = r;
}
}
int main() {
int n=VAL_N, diag=VAL_D;
int i, j, iteration=0;
double a[n][n];
double x[n], x_courant[n], b[n];
double norme;
struct timeval t_elapsed_0, t_elapsed_1;
double t_elapsed;
clock_t t_cpu_0, t_cpu_1;
double t_cpu;
#ifdef _OPENMP
int nb_taches;
#pragma omp parallel
{ nb_taches = omp_get_num_threads(); }
fprintf(stdout, "\n\n Execution jacobi en parallele avec %d threads\n", nb_taches);
#endif // _OPENMP
// Initialisation de la matrice et du second membre
srand(421); // pour des resultats reproductibles
random_number(&a[0][0], n*n);
random_number(&b[0], n);
// On muscle la diagonale principale de la matrice
for(int i=0; i<n; i++) {
a[i][i] += diag;
}
// Solution initiale
for(int i=0; i<n; i++) {
x[i] = 1.0;
}
printf("double size: %zu bytes\n", sizeof(double));
printf("Address of a: %p\n", (void*)a);
// Temps CPU de calcul initial.
t_cpu_0 = clock();
// Temps elapsed de reference.
gettimeofday(&t_elapsed_0, NULL);
// Resolution par la methode de Jacobi
while(1) {
iteration++;
#pragma omp parallel for schedule(runtime)
for(int i=0; i<n; i++) {
x_courant[i] = 0;
for(j=0; j<i; j++) {
x_courant[i] += a[j][i]*x[j];
}
for(j=i+1; j<n; j++) {
x_courant[i] += a[j][i]*x[j];
}
x_courant[i] = (b[i] - x_courant[i])/a[i][i];
}
// Test de convergence
{
double absmax = 0;
for(int i=0; i<n; i++) {
double curr = fabs(x[i] - x_courant[i]);
if (curr > absmax)
absmax = curr;
}
norme = absmax / n;
}
if( (norme <= DBL_EPSILON) || (iteration >= n) ) break;
// Copie de x_courant dans x
memcpy(x, x_courant, n*sizeof(double));
}
// Temps elapsed final
gettimeofday(&t_elapsed_1, NULL);
t_elapsed = (t_elapsed_1.tv_sec - t_elapsed_0.tv_sec) + (t_elapsed_1.tv_usec - t_elapsed_0.tv_usec) / (double)1000000;
// Temps CPU de calcul final
t_cpu_1 = clock();
t_cpu = (t_cpu_1 - t_cpu_0) / (double)CLOCKS_PER_SEC;
// Impression du resultat
fprintf(stdout, "\n\n"
" Taille du systeme : %5d\n"
" Iterations : %4d\n"
" Norme : %10.3E\n"
" Temps elapsed : %10.3E sec.\n"
" Temps CPU : %10.3E sec.\n",
n, iteration, norme, t_elapsed, t_cpu
);
return EXIT_SUCCESS;
}
This code is compiled and run on a Mac Studio with M1 max Apple Silicon with 64 GB of memory.
Here the compilation command line:
gcc -Xclang -fopenmp -I/opt/homebrew/opt/libomp/include/ -L/opt/homebrew/opt/libomp/lib/ -lomp jacobi.c -o multi
The stack size limit must be expanded to at least 16 MB using ulimit -s 16384.
Here the execution command line:
export OMP_SCHEDULE="STATIC,10" OMP_NUM_THREADS=2
./multi
An analysis with lldb shows that the program attempt to access memory that is out of bound :
Execution jacobi en parallele avec 2 threads
double size: 8 bytes
Address of a: 0x16f2fdad0
Process 5813 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=2, address=0x16fe00de8)
frame #0: 0x0000000100003bc8 multi`.omp_outlined._debug__.3(.global_tid.=0x000000016f2f660c, .bound_tid.=0x000000016f2f6608, n=0x000000016fdfef10, vla=1201, x_courant=0x000000016f2f8fb0, j=0x000000016fdfef04, vla=1201, vla=1201, a=0x000000016f2fdad0, vla=1201, x=0x000000016f2fb540, vla=1201, b=0x000000016f2f6a20) at jacobi.c:91:24
88 for(int i=0; i<n; i++) {
89 x_courant[i] = 0;
90 for(j=0; j<i; j++) {
-> 91 x_courant[i] += a[j][i]*x[j];
92 }
93 for(j=i+1; j<n; j++) {
94 x_courant[i] += a[j][i]*x[j];
(lldb) fr v j
(int &) j = 0x000000016fdfef04 (&j = 1011)
(lldb) fr v i
(int) i = 0
In fact, a contains 1201x1201 doubles which means that its data are stored for the address 0x16f2fdad0 to the address 0x16fdfedd7 but the program attempted to read a at the address 0x16fe00de8. Nonetheless, i and j are in bounds.
I can't explain this behavior and I'll be happy to consider any lead to address that issue.