I need to solve a bordered block system. There are two versions here: a serial version that works properly, and a parallel version (with MPI) that doesn't work, and I can't figure out why. Could someone help me find the problem? This is the error I get:
===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions.
Serial version:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "res.h"
// Number of blocks
#define N 128
// Size of each block
#define BS 256
// Leading Dimension
#define LD (BS * BS)
/*
*
*/
int main (int argc, char ** argv) {
// Start the timer
clock_t t_end, t_start = clock();
double time;
size_t ii, jj;
/*
* Allocate all the memory needed for the computation and populate the
* matrices (in order) A, B, C, As, bi, bs.
*/
double *A = calloc (LD * N, sizeof(*A));
double *B = calloc (LD * N, sizeof(*B));
double *C = calloc (LD * N, sizeof(*C));
double *As = calloc (LD, sizeof(*As));
double *x = calloc (BS * N, sizeof(*x));
double *xs = calloc (BS, sizeof(*xs));
double *B_tmp = calloc (BS * N, sizeof(*B_tmp));
double *b = calloc (BS * N, sizeof(*b));
double *bs = calloc (BS, sizeof(*bs));
double *Y = calloc (LD * N, sizeof(*Y));
double *y = calloc (BS * N, sizeof(*y));
double *D = calloc (LD * N, sizeof(*D));
double *d = calloc (BS * N, sizeof(*d));
double *A_cap = calloc (LD, sizeof(*A_cap));
double *A_tmp = calloc (LD, sizeof(*A_tmp));
double *b_cap = calloc (BS, sizeof(*b_cap));
double *b_tmp = calloc (BS, sizeof(*b_tmp));
double *xi = calloc (BS * (N+1), sizeof(*xi)); // N block solutions plus xs (see the final loop)
double *xs_tmp = calloc (BS, sizeof(*xs_tmp));
for (ii = 0; ii < N; ii++){
genera_Ai(&A[LD*ii], BS, ii+1);
genera_Bi(&B[LD*ii], BS, ii+1);
genera_Bi(&C[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+2);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
/* Step #1
* Compute the LU factorization of all the matrices A0, ..., AN-1.
* Each matrix Ai has size BS x BS (256 x 256), i.e. LD entries.
*/
for (ii = 0; ii < N; ii++)
LU (&A[LD*ii], BS);
/* Find the matrices Di = Ai^-1 * Bi and the vectors di = Ai^-1 * bi using
the LU factorization with forward elimination and back substitution */
for (ii = 0; ii < N; ii++) {
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
}
for (ii = 0; ii < N; ii++) {
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
}
/* Step #2
* Compute A_cap and b_cap: first the matrix-matrix product, then the
* subtraction from As | bs (for A_cap and b_cap respectively).
*
* A_tmp = A_tmp + (Ci * Di) ---> A_cap = As - A_tmp
* b_tmp = b_tmp + (Ci * di) ---> b_cap = bs - b_tmp
*/
// #2.1 - Matrix-matrix product, i.e. A_tmp = A_tmp + (Ci * Di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, A_tmp, BS);
// #2.2 - Compute A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
// #2.3 - Matrix-vector product, i.e. b_tmp = b_tmp + (Ci * di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, b_tmp, 1);
// #2.4 - Compute b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #3
* Solve the system A_cap * xs = b_cap, where:
*
* A_cap = (As - Ci * Ai^-1 * Bi)
* b_cap = (bs - Ci * Ai^-1 * bi)
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
/*
* Step #4
*
* Compute the remaining components of the solution vector x by solving
* N linear systems Ai * xi = bi - Bi * xs.
*/
for (ii = 0; ii < N; ii++) {
// #4.1 Compute Bi*xs = B_tmp
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &B_tmp[BS*ii], 1);
// #4.2 Compute bi - B_tmp = B_tmp
for (jj = 0; jj < BS; jj++)
B_tmp[BS*ii + jj] = b[BS*ii + jj] - B_tmp[BS*ii + jj];
// #4.3 Solve the system Ai*xi = B_tmp, storing the solution in x
elim_avanti(&A[LD*ii], BS, &B_tmp[BS*ii], 1, xs_tmp);
sost_indietro(&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
// Save all the solutions in the final vector xi
for (ii = 0; ii <= N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Stop the timer
t_end = clock();
time = (t_end - t_start) / (double) CLOCKS_PER_SEC;
printf("Time is: %f\n", time);
free(A); free(B); free(C); free(As);
free(x); free(xs); free(b); free(bs);
free(Y); free(y); free(D); free(d);
free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
free(xs_tmp); free(xi); free(B_tmp);
return 0;
}
Parallel version:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>
#include "res.h"
// Blocks
#define N 128
// Block Size
#define BS 256
// Leading Dimension
#define LD (BS * BS)
// The master node
#define MASTER 0
/*
*
*/
int main (int argc, char ** argv) {
size_t ii, jj;
int my_rank, num_nodes;
double t_start, t_end;
double *A, *A_recv, *B, *B_recv, *C, *C_recv;
double *As, *x, *xs, *b, *bs;
double *Y, *y, *D, *d, *D_tmp, *d_tmp;
double *A_cap, *A_tmp, *b_cap, *b_tmp;
double *xi, *xs_tmp;
double *x_res;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);
/**************************** From Master *******************************/
if (my_rank == MASTER) {
t_start = MPI_Wtime();
A = calloc (LD * N, sizeof(*A));
B = calloc (LD * N, sizeof(*B));
C = calloc (LD * N, sizeof(*C));
As = calloc (LD, sizeof(*As));
x = calloc (BS * N, sizeof(*x));
xs = calloc (BS, sizeof(*xs));
b = calloc (BS * N, sizeof(*b));
bs = calloc (BS, sizeof(*bs));
Y = calloc (LD * N, sizeof(*Y));
y = calloc (BS * N, sizeof(*y));
D = calloc (LD * N, sizeof(*D));
d = calloc (BS * N, sizeof(*d));
A_cap = calloc (LD, sizeof(*A_cap));
A_tmp = calloc (LD, sizeof(*A_tmp));
b_cap = calloc (BS, sizeof(*b_cap));
b_tmp = calloc (BS, sizeof(*b_tmp));
d_tmp = calloc (BS, sizeof(*d_tmp));
xi = calloc (BS * N, sizeof(*xi));
xs_tmp = calloc (BS, sizeof(*xs_tmp));
x_res = calloc (BS, sizeof(*x_res));
// Initialize the matrices
for (ii = 0; ii < N; ii++){
genera_Ai(&A[LD*ii], BS, ii+1);
genera_Bi(&B[LD*ii], BS, ii+1);
genera_Bi(&C[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+1);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
// Each node will receive (LD * N / num_nodes) entries
A_recv = calloc (LD * N / num_nodes, sizeof(*A_recv));
B_recv = calloc (LD * N / num_nodes, sizeof(*B_recv));
C_recv = calloc (LD * N / num_nodes, sizeof(*C_recv));
// Send A matrix
MPI_Scatter (A, LD * N, MPI_DOUBLE,
A_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send B matrix
MPI_Scatter (B, LD * N, MPI_DOUBLE,
B_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send C matrix
MPI_Scatter (C, LD * N, MPI_DOUBLE,
C_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
}
/**************************** From Workers *******************************/
/* Step #1
* Each processor performs the LU decomposition of its matrices Ai,
* generating the two factors Li and Ui.
*/
for (ii = 0; ii < N / num_nodes; ii++)
LU (&A[LD*ii], BS);
/* Step #2
* Each processor computes the term di = Ai^-1 * bi for each of its
* matrices, taking advantage of the LU decomposition from step #1,
* and then accumulates the sum of the products Ci * di.
*/
// 2.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
// 2.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
// #2.3 - Matrix-vector product, i.e. d_tmp = d_tmp + (Ci * di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);
MPI_Barrier (MPI_COMM_WORLD);
/* Step #3
* Through a "Reduce" communication with sum over the terms d_tmp, the
* master processor obtains the sum of Ci * Ai^-1 * bi for i from 0 to
* N-1, from which it computes b_cap.
*/
// 3.1 - sum d_tmp
MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #3.2 - Compute b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #4
* Each processor computes the term Di = Ai^-1 * Bi by solving the
* linear systems Ai * xj = Bij.
*/
// 4.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
// 4.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
// #4.3 - Matrix-matrix product, i.e. D_tmp = D_tmp + (Ci * Di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);
/* Step #5
* Through a "Reduce" communication with sum over the terms D_tmp, the
* master processor obtains the sum of Ci * Ai^-1 * Bi for i from 0 to
* N-1, from which it computes A_cap.
*/
// 5.1 - sum all D_tmp
MPI_Reduce (&D_tmp, &A_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #5.2 - Compute A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
/* Step #6
* The master processor solves the system A_cap * xs = b_cap
* to determine xs.
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
MPI_Barrier (MPI_COMM_WORLD);
/* Step #7
* The solution xs is communicated to all processors via a broadcast.
*/
if (my_rank == MASTER)
MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
/* Step #8
* Each processor solves its linear systems Ai*xi = bi - Bi*xs.
*/
for (ii = 0; ii < N / num_nodes; ii++) {
// #8.1 Compute Bi*xs = x
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);
// #8.2 Compute bi - x = x
for (jj = 0; jj < BS; jj++)
x[BS*ii + jj] = b[BS*ii + jj] - x[BS*ii + jj];
// #8.3 Solve the system Ai*xi = x
elim_avanti (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
sost_indietro (&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
/* Step #9
* The components xi are combined into a single vector through a
* "Gather" communication and, together with xs, form the solution
* of the initial problem.
*/
MPI_Gather (xi, BS, MPI_DOUBLE,
x_res, BS, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
for (ii = 0; ii < N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Wait for all nodes before printing the result
MPI_Barrier (MPI_COMM_WORLD);
// Stop timer
if (my_rank == MASTER) {
t_end = MPI_Wtime();
printf("Time is %f\n", t_end - t_start);
}
free(A); free(B); free(C); free(As);
free(x); free(xs); free(b); free(bs);
free(Y); free(y); free(D); free(d);
free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
free(xs_tmp); free(xi);
MPI_Finalize();
return 0;
}
This is res.h: http://bpaste.net/show/yAQCCwFsZlQCCiw7GKC9/ - it is correct and was provided by the teacher.
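For reference, written out explicitly, the scheme the code implements (this only restates the comments in the code) is:

A_cap = As - sum(i = 0 .. N-1) Ci * Ai^-1 * Bi
b_cap = bs - sum(i = 0 .. N-1) Ci * Ai^-1 * bi

then A_cap * xs = b_cap is solved for xs, and finally Ai * xi = bi - Bi * xs for i = 0, ..., N-1.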
I'm sorry for my bad English and the Italian comments; I hope the code is clear anyway. My goal is to parallelize the serial version with MPI; the number of processors is always a multiple of 4. Thanks in advance.
You posted this elsewhere:
As the error message says, you have aliased the arguments of MPI_Reduce. This is because you are passing the address of a pointer rather than its value; the pointer value is the correct way to provide a data buffer to MPI, at least when you are allocating your buffers dynamically.
I think that changing this

MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

to

MPI_Reduce (d_tmp, b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

will solve the problem (and likewise for the MPI_Reduce of D_tmp into A_tmp). If you really like &, you can do the following:

MPI_Reduce (&d_tmp[0], &b_tmp[0], LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
but this is pointless.
You might find that your other MPI calls are erroneous if you are using &ptr instead of ptr when a void* argument is requested.
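To make the distinction concrete, here is a minimal self-contained sketch (not your program: the count n and the data are invented for illustration). A dynamically allocated double* is already the buffer address MPI expects, so you pass it as-is, and every rank calls the collective:

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main (int argc, char **argv) {
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    const int n = 4;                        /* illustrative count, unrelated to your BS/N */
    double *local = malloc(n * sizeof(*local));
    double *sum = NULL;
    if (rank == 0)
        sum = malloc(n * sizeof(*sum));     /* receive buffer only matters on the root */
    for (int i = 0; i < n; i++)
        local[i] = rank + 1.0;
    /* Pass the pointer values (local, sum), not their addresses (&local, &sum):
       &local has type double** and MPI would access memory through the wrong address. */
    MPI_Reduce(local, sum, n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("sum[0] = %f\n", sum[0]);    /* expect size*(size+1)/2 */
    free(local);
    free(sum);
    MPI_Finalize();
    return 0;
}

Note also that MPI_Reduce, MPI_Scatter and MPI_Bcast are collective operations: every rank of the communicator must call them, with counts that refer to a single process. In your code the scatters and the broadcast are executed only inside the if (my_rank == MASTER) branch, and the scatter count is LD * N even though each rank only has room for LD * N / num_nodes entries, so those calls need a second look too.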