I need some help with understanding of parallel computing in c++. I've wrote classic implementation of Gauss method for solving linear equations:
int Gauss::Solve(SimpleGraph<double> matr, std::vector<double>& answer) {
const int n = matr.get_rows();
const int m = matr.get_cols() - 1;
// straight way
std::vector<int> where(m, -1);
for (int row = 0, col = 0; row < n && col < m; ++col) {
// pivot searching
try {
matr.SwapRows(row, FindPivotRow(matr, row, col));
} catch (...) {
continue;
}
where[col] = row;
// making triangle matrix
for (int i = row + 1; i != n; ++i) {
auto coef = matr[i][col] / matr[row][col];
for (int j = col; j <= m; ++j) {
matr[i][j] -= matr[row][j] * coef;
if (std::abs(matr[i][j]) < EPS) matr[i][j] = 0;
}
}
++row;
}
// way back
answer.assign(m, 0);
for (int row = n - 1; row >= 0; --row) {
// check sum of coefs near 'x'
double sum = 0;
for (int col = m - 1; col >= 0; --col)
sum += matr[row][col];
if (std::abs(sum) < EPS && std::abs(matr[row][m]) > EPS) {
return NONE;
}
// works because matrix is triangle now
for (int col = m; col >= row /* 0 */; --col)
matr[row][col] /= matr[row][row];
// making diagonal matrix
for (int i = row - 1; i >= 0; --i) {
double K = matr[i][row] / matr[row][row];
for (int j = m; j >= 0; --j) {
matr[i][j] = matr[i][j] - matr[row][j] * K;
}
}
}
// here we have diagonal main matrix, so answers are free members
for (int i = 0; i != m; ++i)
if (where[i] == -1) return LOT;
for (int i = 0; i != n; ++i) {
answer[i] = matr[i][n];
}
return ONE;
}
Method returns ONE if there are only one solution, NONE if no solution exists, and LOT if there are a lot of solutions.
SimpleGraph is just a wrapper around vector with overloaded operator[].
Function FindPivotRow throws exception if there are no pivot element under current row and pivot element is zero --> so we can continue from next row.
I need to integrate parallel computing here using std::thread and std::mutex. I know that the main idea of threads is to do independent calculations in different threads.
As Gauss method has 2 main parts (2 ways), I tried to make them parallel:
struct StraightWay {
SimpleGraph<double>& matr_ref;
std::vector<int>& where_ref;
std::mutex& matr_mtx_ref;
void operator()(int n, int m) {
matr_mtx_ref.lock();
for (int row = 0, col = 0; row < n && col < m; ++col) {
try {
matr_ref.SwapRows(row, FindPivotRow(matr_ref, row, col));
} catch (...) {
continue;
}
where_ref[col] = row;
for (int i = row + 1; i != n; ++i) {
double coef = matr_ref[i][col] / matr_ref[row][col];
for (int j = 0; j <= m; ++j) {
matr_ref[i][j] -= matr_ref[row][j] * coef;
if (std::abs(matr_ref[i][j]) < Gauss::EPS) matr_ref[i][j] = 0;
}
}
++row;
}
matr_mtx_ref.unlock();
}
};
struct BackwardWay {
SimpleGraph<double>& matr_ref;
std::mutex& matr_mtx_ref;
void operator()(int n, int m) {
matr_mtx_ref.lock();
for (int row = n - 1; row >= 0; --row) {
double sum = 0;
for (int col = m - 1; col >= 0; --col)
sum += matr_ref[row][col];
if (std::abs(sum) < Gauss::EPS && std::abs(matr_ref[row][m]) > Gauss::EPS) {
/* idk what to do here */
/* return NONE; */
}
for (int col = m; col >= row /* 0 */; --col)
matr_ref[row][col] /= matr_ref[row][row];
for (int i = row - 1; i >= 0; --i) {
double K = matr_ref[i][row] / matr_ref[row][row];
for (int j = m; j >= 0; --j) {
matr_ref[i][j] = matr_ref[i][j] - matr_ref[row][j] * K;
}
}
}
matr_mtx_ref.unlock();
}
};
int Gauss::ParallelSolve(SimpleGraph<double> matr, std::vector<double>& answer) {
const int n = matr.get_rows();
const int m = matr.get_cols() - 1;
std::mutex matr_mtx;
std::vector<int> where(m, -1);
StraightWay strw{matr, where, matr_mtx};
BackwardWay bckw{matr, matr_mtx};
std::thread straight_way(strw, n, m);
std::thread backward_way(bckw, n, m);
straight_way.join();
backward_way.join();
for (int i = 0; i != m; ++i)
if (where[i] == -1) return LOT;
answer.assign(n, 0);
for (int i = 0; i != n; ++i) {
answer[i] = matr[i][n];
}
return ONE;
}
So, struct StraigthWay is a copy of the first part of classic implementation, and struct BackwardWay is the same as the second part of classic solution.
std::mutex matr_mtx doesn't allow to change 'matr' at the same time from different threads.
As a result, the execution time for classic solution is much better than the parallel one.
When deleting mutex, the answer is wrong (ofc) but the exec time is good.
I think when I create a critical zone with mutex, it is just copies the behavior of classic solution and waste resources on creating threads and their processing.
My main question is How to make gauss algorithm parallel?