MPI bordered block system/matrix with LU decomposition


I need to solve a bordered block system. Below are two versions: a serial version that works properly and a parallel (MPI) version that does not, and I cannot figure out why. Could someone help me work out what is wrong? The parallel version terminates with:

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestion

Serial version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>

#include "res.h"

// Number of blocks
#define N 128

// Size of each block
#define BS 256

// Leading Dimension
#define LD (BS * BS)

/*
 *
 */
int main (int argc, char ** argv) {

    // Start the timer
    clock_t t_end, t_start = clock();

    double time;
    size_t ii, jj;

    /*
     * Allocate all the memory needed for the computation and populate
     * the matrices (in order) A, B, C, As, bi, bs.
     */
    double *A = calloc (LD * N, sizeof(*A));
    double *B = calloc (LD * N, sizeof(*B));
    double *C = calloc (LD * N, sizeof(*C));
    double *As = calloc (LD, sizeof(*As));

    double *x = calloc (BS * N, sizeof(*x));    
    double *xs = calloc (BS, sizeof(*xs));
    double *B_tmp = calloc (BS * N, sizeof(*B_tmp));

    double *b = calloc (BS * N, sizeof(*b));
    double *bs = calloc (BS, sizeof(*bs));

    double *Y = calloc (LD * N, sizeof(*Y));
    double *y = calloc (BS * N, sizeof(*y));
    double *D = calloc (LD * N, sizeof(*D));
    double *d = calloc (BS * N, sizeof(*d));

    double *A_cap = calloc (LD, sizeof(*A_cap));
    double *A_tmp = calloc (LD, sizeof(*A_tmp));
    double *b_cap = calloc (BS, sizeof(*b_cap));
    double *b_tmp = calloc (BS, sizeof(*b_tmp));

    double *xi = calloc (BS * (N+1), sizeof(*xi));   // N block solutions plus xs
    double *xs_tmp = calloc (BS, sizeof(*xs_tmp));

    for (ii = 0; ii < N; ii++){
        genera_Ai(&A[LD*ii], BS, ii+1);
        genera_Bi(&B[LD*ii], BS, ii+1);
        genera_Bi(&C[LD*ii], BS, ii+1);
    }

    genera_Ai(As, BS, N+2);

    for (ii = 0; ii < N; ii++) {
        for(jj = 0; jj < BS; jj++) 
            b[BS*ii + jj] = 1;
    }

    for (ii = 0; ii < BS; ii++) 
        bs[ii] = 1;

    /* Step #1
     * Compute the LU factorization of all the matrices A0, ..., A(N-1).
     * Each matrix Ai has size BS * BS (256 x 256), i.e. LD entries.
     */
    for (ii = 0; ii < N; ii++) 
        LU (&A[LD*ii], BS); 
    /* Find the matrices Di = Ai^-1 * Bi and di = Ai^-1 * bi using the LU
    factorization, forward elimination and back substitution. */
    for (ii = 0; ii < N; ii++) {
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);
    }
    for (ii = 0; ii < N; ii++) {
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);
    }

    /* Step #2
     * Compute A_cap and b_cap: first the matrix-matrix products, then the
     * subtraction from As | bs (for A_cap and b_cap respectively).
     *
     *  A_tmp = A_tmp + (Ci * Di) ---> A_cap = As - A_tmp
     *  b_tmp = b_tmp + (Ci * di) ---> b_cap = bs - b_tmp
     */
    // #2.1 - Matrix-matrix product: A_tmp = A_tmp + (Ci * Di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, A_tmp, BS);
    // #2.2 - Compute A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    // #2.3 - Matrix-vector product: b_tmp = b_tmp + (Ci * di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, b_tmp, 1);
    // #2.4 - Compute b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];


    /* Step #3
     * Solve the system A_cap * xs = b_cap, where:
     *
     * A_cap = (As - Ci * Ai^-1 * Bi)
     * b_cap = (bs - Ci * Ai^-1 * bi)
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    /*
     * Step #4
     *
     * Compute the remaining components of the solution vector x by solving
     * the N linear systems Ai * xi = bi - Bi * xs.
     */
    for (ii = 0; ii < N; ii++) {
        // #4.1 Compute B_tmp = Bi * xs
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &B_tmp[BS*ii], 1);

        // #4.2 Compute B_tmp = bi - B_tmp
        for (jj = 0; jj < BS; jj++)
            B_tmp[BS*ii + jj] = b[BS*ii + jj] - B_tmp[BS*ii + jj];

        // #4.3 Solve the system Ai * xi = B_tmp
        elim_avanti(&A[LD*ii], BS, &B_tmp[BS*ii], 1, xs_tmp);
        sost_indietro(&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
    }

    // Store all the solutions in the final vector xi
    for (ii = 0; ii <= N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Stop the timer
    t_end = clock();

    time = (t_end - t_start) / (double) CLOCKS_PER_SEC;
    printf("Time is: %f\n", time);

    free(A); free(B); free(C); free(As);
    free(x); free(xs); free(b); free(bs); free(B_tmp);
    free(Y); free(y);  free(D); free(d);
    free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
    free(xs_tmp); free(xi);

    return  0;
}

Parallel version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "res.h"

// Blocks
#define N 128

// Block Size
#define BS 256

// Leading Dimension
#define LD (BS * BS)

// The master node
#define MASTER 0

// Globals

/*
 *
 */
int main (int argc, char ** argv) {

    size_t ii, jj;
    int my_rank, num_nodes;
    double t_start, t_end;
    double *A, *A_recv, *B, *B_recv, *C, *C_recv;
    double *As, *x, *xs, *b, *bs;
    double *Y, *y, *D, *d, *D_tmp, *d_tmp;
    double *A_cap, *A_tmp, *b_cap, *b_tmp;
    double *xi, *xs_tmp;    
    double *x_res;

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);

    /**************************** From Master *******************************/
    if (my_rank == MASTER) {

        t_start = MPI_Wtime();

        A = calloc (LD * N, sizeof(*A));
        B = calloc (LD * N, sizeof(*B));
        C = calloc (LD * N, sizeof(*C));
        As = calloc (LD, sizeof(*As));
        x = calloc (BS * N, sizeof(*x));
        xs = calloc (BS, sizeof(*xs));
        b = calloc (BS * N, sizeof(*b));
        bs = calloc (BS, sizeof(*bs));
        Y = calloc (LD * N, sizeof(*Y));
        y = calloc (BS * N, sizeof(*y));
        D = calloc (LD * N, sizeof(*D));
        d = calloc (BS * N, sizeof(*d));
        A_cap = calloc (LD, sizeof(*A_cap));
        A_tmp = calloc (LD, sizeof(*A_tmp));
        b_cap = calloc (BS, sizeof(*b_cap));
        b_tmp = calloc (BS, sizeof(*b_tmp));
        d_tmp = calloc (BS, sizeof(*d_tmp));
        xi = calloc (BS * N, sizeof(*xi));
        xs_tmp = calloc (BS, sizeof(*xs_tmp));
        x_res = calloc (BS, sizeof(*x_res));

        // Initialize the matrices
        for (ii = 0; ii < N; ii++){
            genera_Ai(&A[LD*ii], BS, ii+1);
            genera_Bi(&B[LD*ii], BS, ii+1);
            genera_Bi(&C[LD*ii], BS, ii+1);
        }

        genera_Ai(As, BS, N+1);

        for (ii = 0; ii < N; ii++) {
            for(jj = 0; jj < BS; jj++) 
                b[BS*ii + jj] = 1;
        }

        for (ii = 0; ii < BS; ii++) 
            bs[ii] = 1;

        // Each node will receive (LD * N / num_nodes) entries
        A_recv = calloc (LD * N / num_nodes, sizeof(*A_recv));
        B_recv = calloc (LD * N / num_nodes, sizeof(*B_recv));
        C_recv = calloc (LD * N / num_nodes, sizeof(*C_recv));

        // Send A matrix
        MPI_Scatter (A, LD * N, MPI_DOUBLE, 
            A_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);

        // Send B matrix     
        MPI_Scatter (B, LD * N, MPI_DOUBLE, 
            B_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);

        // Send C matrix
        MPI_Scatter (C, LD * N, MPI_DOUBLE, 
            C_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);
    }

    /**************************** From Workers *******************************/
    /* Step #1
     * Each processor computes the LU decomposition of its matrices Ai,
     * producing the two factors Li and Ui.
     */
    for (ii = 0; ii < N / num_nodes; ii++) 
        LU (&A[LD*ii], BS);

    /* Step #2
     * Each processor computes the term di = Ai^-1 * bi for each of its
     * matrices Ai, reusing the LU decomposition from step #1, and then
     * accumulates the products Ci * di.
     */
    // 2.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);

    // 2.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);

     // #2.3 - Matrix-vector product: d_tmp = d_tmp + (Ci * di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);

    MPI_Barrier (MPI_COMM_WORLD);

    /* Step #3
     * Through a sum-reduction over the terms Ci * di, the master processor
     * obtains the sum of Ci * Ai^-1 * bi for i from 0 to N-1 and uses it
     * to compute b_cap.
     */
    // 3.1 - sum d_tmp
    MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #3.2 - Calc b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];


    /* Step #4
     * Each processor computes the term Ci * Ai^-1 * Bi by solving
     * the n linear systems Ai * xj = Bij.
     */
    // 4.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);

    // 4.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);

    // #4.3 - Matrix-matrix product: D_tmp = D_tmp + (Ci * Di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);

    /* Step #5
     * Through a sum-reduction over the terms Ci * Di, the master processor
     * obtains the sum of Ci * Ai^-1 * Bi for i from 0 to N-1 and uses it
     * to compute A_cap.
     */
    // 5.1 - sum all D_tmp
    MPI_Reduce (&D_tmp, &A_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #5.2 - calc A_cap, that is A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    /* Step #6
     * The master processor solves the system A_cap * xs = b_cap
     * to determine xs.
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    MPI_Barrier (MPI_COMM_WORLD);

    /* Step #7
     * The solution xs is communicated to all processors via a broadcast.
     */
    if (my_rank == MASTER)
        MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    /* Step #8
     * The j-th processor solves its linear systems Ai * xi = bi - Bi * xs.
     */
    for (ii = 0; ii < N / num_nodes; ii++) {
        // #8.1 Compute x = Bi * xs
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);

        // #8.2 Compute x = bi - x
        for (jj = 0; jj < BS; jj++)
            x[BS*ii + jj] = b[BS*ii + jj] - x[BS*ii + jj];

        // #8.3 Solve the system Ai * xi = x
        elim_avanti   (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
        sost_indietro (&A[LD*ii], BS, xs_tmp,    1, &x[BS*ii]);
    }

    /* Step #9
     * The components xi are combined into a single vector via a gather;
     * together with xs they form the solution of the initial problem.
     */
    MPI_Gather (xi, BS, MPI_DOUBLE,
         x_res, BS, MPI_DOUBLE,
             MASTER, MPI_COMM_WORLD);

    for (ii = 0; ii < N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Wait for all nodes before printing the result
    MPI_Barrier (MPI_COMM_WORLD);

    // Stop timer
    if (my_rank == MASTER) {
        t_end = MPI_Wtime();
        printf("Time is %f\n", t_end - t_start);
    }

    free(A); free(B); free(C); free(As);
    free(x); free(xs); free(b); free(bs);
    free(Y); free(y); free(D); free(d);
    free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
    free(xs_tmp); free(xi);

    MPI_Finalize();
    return  0;
}

This is res.h http://bpaste.net/show/yAQCCwFsZlQCCiw7GKC9/ (it is correct and was provided by the teacher).
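
For reference, these are the prototypes I think res.h declares, reconstructed from how the functions are called above (the real header may differ in names or extra parameters):

/* Prototypes inferred from the calls in the code; the actual res.h may differ. */
void genera_Ai     (double *A, int n, int seed);   // fill an n x n block
void genera_Bi     (double *B, int n, int seed);   // fill an n x n block
void LU            (double *A, int n);             // in-place LU factorization
void elim_avanti   (double *A, int n, double *B, int nrhs, double *Y);  // forward elimination: L * Y = B
void sost_indietro (double *A, int n, double *Y, int nrhs, double *X);  // back substitution:  U * X = Y
void my_gemm       (int m, int n, int k, double *A, int lda, double alpha,
                    double *B, int ldb, double *C, int ldc);            // C = C + alpha * A * B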

Sorry for my English; the code should be clear. My goal is to parallelize the serial version with MPI, and the number of processes is always a multiple of 4 (the sketch below shows the distribution I have in mind). Thanks in advance.
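
A minimal sketch of that distribution, assuming a run with 4 processes (the real run may use any multiple of 4):

#include <stdio.h>

#define N  128          // number of blocks, as above
#define BS 256          // block size
#define LD (BS * BS)    // doubles per block

int main (void) {
    int num_nodes = 4;                              // assumed process count
    int blocks_per_rank  = N / num_nodes;           // 32 blocks per rank
    int doubles_per_rank = LD * blocks_per_rank;    // 2097152 doubles of A (and of B, C) per rank
    // rank r should own the global blocks [r * blocks_per_rank, (r+1) * blocks_per_rank)
    printf("each rank: %d blocks, %d doubles per matrix\n",
           blocks_per_rank, doubles_per_rank);
    return 0;
}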

There are 2 answers below.

Answer 1:

You posted this elsewhere:

Fatal error in PMPI_Reduce: Internal MPI error!, error stack:
PMPI_Reduce(1217)..............: MPI_Reduce(sbuf=0x7fffffffda88, rbuf=0x7fffffffda78, count=8388608, MPI_DOUBLE, MPI_SUM, root=0, MPI_COMM_WORLD) failed
MPIR_Reduce_impl(1029).........: 
MPIR_Reduce_intra(825).........: 
MPIR_Reduce_redscat_gather(300): 
MPIR_Localcopy(357)............: memcpy arguments alias each other, dst=0x7fffffffda78 src=0x7fffffffda88 len=67108864
[Inferior 1 (process 4210) exited with code 01]

As the error message says, you have aliased the arguments of MPI_Reduce. This happens because you are passing the address of a pointer rather than its value; passing the pointer value is the correct way to provide a data buffer to MPI, at least when the buffer is allocated dynamically.

I think that changing this

MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

to

MPI_Reduce (d_tmp, b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

will solve the problem. If you really like &, you can do the following:

MPI_Reduce (&d_tmp[0], &b_tmp[0], LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

but this is pointless.

You might find that your other MPI calls are erroneous as well if you are passing &ptr instead of ptr wherever a void * buffer argument is expected.
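
For example, with dynamically allocated buffers the pattern should look like this (a minimal sketch; count and the buffer names are placeholders):

int count = 1024;                                    // placeholder size
double *sendbuf = calloc (count, sizeof(*sendbuf));
double *recvbuf = calloc (count, sizeof(*recvbuf));

// Correct: the pointer value already points at the data.
MPI_Reduce (sendbuf, recvbuf, count, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

// Wrong for heap buffers: &sendbuf is the address of the pointer variable
// itself (on the stack), not the address of the data it points to.
// MPI_Reduce (&sendbuf, &recvbuf, count, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);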

Answer 2:

I found the solution.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "res.h"

// Number of Blocks
#define N 128

// Block Size
#define BS 256

// Leading Dimension
#define LD (BS * BS)

// The master node
#define MASTER 0

/*
 *
 */
int main (int argc, char ** argv) {

    size_t ii, jj;
    int my_rank, num_nodes;
    double t_start, t_end;
    double *A, *A_data, *B, *B_data, *C, *C_data;
    double *As, *x, *xs, *b, *bs;
    double *Y, *y, *D, *d, *D_tmp, *d_tmp;
    double *A_cap, *A_tmp, *b_cap, *b_tmp;
    double *xi, *xs_tmp;    
    double *x_res;

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);

    A = calloc (LD * N / num_nodes, sizeof(*A));
    B = calloc (LD * N / num_nodes, sizeof(*B));
    C = calloc (LD * N / num_nodes, sizeof(*C));
    Y = calloc (LD * N, sizeof(*Y));
    y = calloc (BS * N, sizeof(*y));
    D = calloc (LD * N, sizeof(*D));
    d = calloc (BS * N, sizeof(*d));
    b = calloc (BS * N, sizeof(*b));
    As = calloc (LD, sizeof(*As));
    x = calloc (BS * N, sizeof(*x));
    xs = calloc (BS, sizeof(*xs));
    bs = calloc (BS, sizeof(*bs));
    A_cap = calloc (LD, sizeof(*A_cap));
    A_tmp = calloc (LD, sizeof(*A_tmp));
    b_cap = calloc (BS, sizeof(*b_cap));
    b_tmp = calloc (BS, sizeof(*b_tmp));
    d_tmp = calloc (BS, sizeof(*d_tmp));
    D_tmp = calloc (LD, sizeof(*D_tmp));
    xi = calloc (BS * N, sizeof(*xi));
    xs_tmp = calloc (BS, sizeof(*xs_tmp));
    x_res = calloc (BS * num_nodes, sizeof(*x_res));    

    if (my_rank == MASTER) {

        t_start = MPI_Wtime();

        A_data = calloc (LD * N, sizeof(*A_data));
        B_data = calloc (LD * N, sizeof(*B_data));
        C_data = calloc (LD * N, sizeof(*C_data));

        // Initialize the matrices
        for (ii = 0; ii < N; ii++){
            genera_Ai(&A_data[LD*ii], BS, ii+1);
            genera_Bi(&B_data[LD*ii], BS, ii+1);
            genera_Bi(&C_data[LD*ii], BS, ii+1);
        }

        genera_Ai(As, BS, N+1);

        for (ii = 0; ii < N; ii++) {
            for(jj = 0; jj < BS; jj++) 
                b[BS*ii + jj] = 1;
        }   

        for (ii = 0; ii < BS; ii++) 
            bs[ii] = 1;
    }

    // Scatter the A matrix to all processes
    MPI_Scatter(A_data, LD * N / num_nodes, MPI_DOUBLE, 
                A,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);  

    // Scatter the B matrix to all processes
    MPI_Scatter(B_data, LD * N / num_nodes, MPI_DOUBLE, 
                B,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);

    // Scatter the C matrix to all processes
    MPI_Scatter(C_data, LD * N / num_nodes, MPI_DOUBLE, 
                C,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);

    /* Step #1
     * Each processor computes the LU decomposition of its matrices Ai,
     * producing the two factors Li and Ui.
     */
    for (ii = 0; ii < N / num_nodes; ii++) 
        LU (&A[LD*ii], BS);

    /* Step #2
     * Each processor computes the term di = Ai^-1 * bi for each matrix Ai
     * assigned to it, using the LU decomposition from step #1, and then
     * accumulates the products Ci * di.
     */
    // 2.1
    for (ii = 0; ii < N / num_nodes; ii++) {
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);
    }

    // 2.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);

     // #2.3 - Matrix-vector product: d_tmp = d_tmp + (Ci * di)
    for (ii = 0; ii < N / num_nodes; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);

    /* Step #3
     * Through a sum-reduction over the terms Ci * di, the master processor
     * gets the sum of Ci * Ai^-1 * bi, with i from 0 to N-1, and with it
     * computes b_cap.
     */
    // 3.1 - sum of d_tmp
    MPI_Reduce (d_tmp, b_tmp, BS, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #3.2 - Compute b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];

    /* Step #4
     * Each processor computes the term Ci * Ai^-1 * Bi by solving
     * the n linear systems Ai * xj = Bij.
     */
    // 4.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);

    // 4.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);

    // #4.3 - Matrix-matrix product: D_tmp = D_tmp + (Ci * Di)
    for (ii = 0; ii < N / num_nodes; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);

    /* Step #5
     * Through a sum-reduction over the terms Ci * Di, the master processor
     * gets the sum of Ci * Ai^-1 * Bi, with i from 0 to N-1, and with it
     * computes A_cap.
     */
    // 5.1 - sum of D_tmp
    MPI_Reduce (D_tmp, A_tmp, LD, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);  // reduce the whole BS x BS block

    // #5.2 - Compute A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    /* Step #6
     * The master processor solves the linear system A_cap * xs = b_cap
     * to determine xs.
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    /* Step #7
     * The solution xs is communicated to all processors via a broadcast.
     */
    MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    /* Step #8
     * The j-th processor solves its linear systems Ai * xi = bi - Bi * xs.
     */
    for (ii = 0; ii < N / num_nodes; ii++) {
        // #8.1 Compute x = Bi * xs
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);

        // #8.2 Compute x = bi - x
        for (jj = 0; jj < BS; jj++)
            x[BS*ii + jj] = b[BS*ii + jj] - x[BS*ii + jj];

        // #8.3 Solve the system Ai * xi = x
        elim_avanti   (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
        sost_indietro (&A[LD*ii], BS, xs_tmp,    1, &x[BS*ii]);
    }

    /* Step #9
     * The components xi are combined into a single vector via a gather;
     * together with xs they form the solution of the initial problem.
     */
    MPI_Gather (xi,    BS, MPI_DOUBLE,
                x_res, BS, MPI_DOUBLE,
                MASTER, MPI_COMM_WORLD);

    for (ii = 0; ii < N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Stop timer
    if (my_rank == MASTER) {
        t_end = MPI_Wtime();
        printf("Time is %f\n", t_end - t_start);
        free(A_data); free(B_data); free(C_data);
    }

    free(A); free(A_cap); free(A_tmp); 
    free(D); free(D_tmp); free(d_tmp); free(d);
    free(b); free(b_cap); free(b_tmp);
    free(B); free(C); 
    free(Y); free(y); 
    free(As); free(bs);
    free(x); free(xs); free(xs_tmp); free(x_res);

    MPI_Finalize();
    return  0;
}
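
For completeness, this is how I compile and run it (the file names are placeholders, and I assume the teacher's routines are in a res.c next to res.h; adjust to your MPI installation):

mpicc -O2 -o bordered parallel.c res.c -lm
mpirun -np 4 ./bordered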

Thanks to Maxim for the special advice :)