I'm trying to optimise the following parallel loop by using threadprivate variables: the Eigen vector `evalInput` could easily be thread-local and reused across the iterations/tasks that a thread executes, instead of being copied from `input` on every iteration:
// Baseline loop: iterations are distributed dynamically over the threads, and
// every iteration builds its own copy of `input` before evaluating.
#pragma omp parallel for schedule(dynamic)
for (int j = 0; j < input.rows(); ++j)
{
// Per-iteration copy of the whole input vector; this is the copy the question
// wants to make once per thread instead.
Eigen::VectorXf evalInput = input;
// Placeholder: modify one entry of evalInput here, and restore it after the
// evaluation below.
output.col(j) = factors * evalInput;
}
I tried to get evalInput to be shared per-thread with threadprivate and copyin, but even the most basic example does not work for me:
// Attempt: keep one evalInput per thread by declaring a static variable
// threadprivate and initialising every thread's copy through copyin.
static Eigen::VectorXf evalInput = input;
#pragma omp threadprivate(evalInput)
#pragma omp parallel for schedule(dynamic) copyin(evalInput)
for (int j = 0; j < input.rows(); ++j)
{
// Perturb entry j, evaluate, then reset entry j from `input` so the thread's
// copy equals `input` again before its next iteration.
evalInput[j] += 1.0;
output.col(j) = factors * evalInput;
evalInput[j] = input[j];
}
Depending on the circumstances, gcc 13.2.1 reports either `error: ‘threadprivate’ ‘evalInput’ has incomplete type` or `error: ‘evalInput’ declared ‘threadprivate’ after first use`.
However, notably, it does work in clang.
I prepared four simplified test cases, each changing one thing relative to the previous one, and using plain `std::vector` instead of Eigen (so ignore that there is no point in using threadprivate here):
- `parallelBasic` is the basic implementation to optimise (as an example, not what I actually want to optimise), using a templated `Scalar`.
- `parallelOpt` adds the threadprivate variable, yielding the "incomplete type" error.
- `parallelOptFloat` then drops the `Scalar` template, yielding the "declared ‘threadprivate’ after first use" error.
- `parallelVar` finally adds an arbitrary, unused parameter of a templated type, and for some reason DOES compile.
Compiling it with `g++ -o main -c main.cpp -Iinclude -fopenmp -lgomp` fails, whereas compiling it with `clang++` works:
#include <omp.h>
#include <vector>
/**
 * @brief Baseline case: computes output[j] = factors[j] * input[j] for every
 *        element of @p input, with the iterations shared among OpenMP threads.
 *
 * @tparam Scalar  Element type of the three vectors.
 * @param input    Values to scale; its size determines the number of iterations.
 * @param factors  Per-element multipliers; read at every index below input.size().
 * @param output   Receives the products; it is not resized here, so it must
 *                 already hold at least input.size() elements.
 */
template<typename Scalar>
void parallelBasic(const std::vector<Scalar> &input, const std::vector<Scalar> &factors, std::vector<Scalar> &output)
{
    using Index = typename std::vector<Scalar>::size_type;

    // Hoisted, signed trip count: avoids comparing a signed loop index against
    // the unsigned size() and does not truncate for sizes above INT_MAX, while
    // keeping the loop index a signed integer type.
    const long long count = static_cast<long long>(input.size());

#pragma omp parallel for schedule(dynamic)
    for (long long j = 0; j < count; ++j)
    {
        const Index idx = static_cast<Index>(j);
        output[idx] = factors[idx] * input[idx];
    }
}
// Reproducer, step 1: same element-wise product as parallelBasic, but the input
// is read through a function-local static vector that is declared threadprivate.
// The type of that static depends on the template parameter Scalar.
// `factors` and `output` are indexed at every j < input.size(); `output` is not
// resized here.
template<typename Scalar>
void parallelOpt(const std::vector<Scalar> &input, const std::vector<Scalar> &factors, std::vector<Scalar> &output)
{ // Does not work with gcc 13.2.1: "‘threadprivate’ ‘evalInput’ has incomplete type"
static std::vector<Scalar> evalInput;
#pragma omp threadprivate(evalInput)
// Assigned outside the parallel region, i.e. only the calling thread's copy is
// filled here; the copyin clause below is what hands the value to the team.
evalInput = input;
#pragma omp parallel for schedule(dynamic) copyin(evalInput)
for (int j = 0; j < input.size(); ++j)
{
output[j] = factors[j] * evalInput[j];
}
}
// Reproducer, step 2: identical body to parallelOpt, but as a plain (non-template)
// function with the element type fixed to float.
// `factors` and `output` are indexed at every j < input.size(); `output` is not
// resized here.
void parallelOptFloat(const std::vector<float> &input, const std::vector<float> &factors, std::vector<float> &output)
{ // Scalar template removed; also does not work with gcc 13.2.1: "‘evalInput’ declared ‘threadprivate’ after first use"
static std::vector<float> evalInput;
#pragma omp threadprivate(evalInput)
// Assigned outside the parallel region, i.e. only the calling thread's copy is
// filled here; the copyin clause below is what hands the value to the team.
evalInput = input;
#pragma omp parallel for schedule(dynamic) copyin(evalInput)
for (int j = 0; j < input.size(); ++j)
{
output[j] = factors[j] * evalInput[j];
}
}
// Reproducer, step 3: same body and float vectors as parallelOptFloat, but the
// function is a template again through an extra parameter `value` of type Scalar.
// `value` is never read in the body; it only makes the function a template.
// `factors` and `output` are indexed at every j < input.size(); `output` is not
// resized here.
template<typename Scalar>
void parallelVar(const std::vector<float> &input, const std::vector<float> &factors, std::vector<float> &output, Scalar value)
{ // Works (compiles with gcc); the only change is the added, unused parameter
static std::vector<float> evalInput;
#pragma omp threadprivate(evalInput)
// Assigned outside the parallel region, i.e. only the calling thread's copy is
// filled here; the copyin clause below is what hands the value to the team.
evalInput = input;
#pragma omp parallel for schedule(dynamic) copyin(evalInput)
for (int j = 0; j < input.size(); ++j)
{
output[j] = factors[j] * evalInput[j];
}
}
// Driver for the reproducer: runs the two variants that compile and keeps the
// two failing variants commented out so the file builds.
int main()
{
    // Turn off dynamic adjustment of the number of threads.
    omp_set_dynamic(0);

    // Three value-initialised vectors of 100 floats each.
    std::vector<float> input(100);
    std::vector<float> factors(100);
    std::vector<float> output(100);

    parallelBasic<float>(input, factors, output);

    // Re-enable either call below to see the gcc diagnostics:
    //parallelOpt<float>(input, factors, output);
    //parallelOptFloat(input, factors, output);

    parallelVar(input, factors, output, 1);

    return 0;
}
Thank you for any help.