I am trying to parallelize FFT convolution in ArrayFire over multiple CPU threads:
#include <algorithm>
#include <cstdio>
#include <exception>
#include <iostream>
#include <vector>
#include <arrayfire.h>
#include <omp.h>
using namespace af;
/// Prints the leading elements of `f` to std::cout as a comma-separated list.
///
/// At most `N` elements are written (fewer when `f` is shorter). Elements are
/// separated by ", " and the last one is followed by a flushed newline.
/// Nothing at all is written when there are no elements to show.
///
/// @param f  values to print
/// @param N  upper bound on the number of elements printed (default 10)
void printarray(const std::vector<float>& f, size_t N=10)
{
    // Number of elements that will actually be written.
    const size_t shown = std::min(N, f.size());
    size_t idx = 0;
    while (idx < shown) {
        std::cout << f[idx];
        ++idx;
        if (idx == shown) {
            // Last element: terminate the line (and flush, as std::endl does).
            std::cout << std::endl;
        } else {
            std::cout << ", ";
        }
    }
}
using namespace std;
/// Demo driver: every OpenMP thread builds its own af::array copies of the
/// same host data, runs af::fftConvolve in AF_CONV_EXPAND mode and copies the
/// result back into a thread-local std::vector.
///
/// NOTE(review): whether the per-thread calls really execute concurrently is
/// decided inside the ArrayFire backend and cannot be told from this code.
///
/// @return 0 when every thread finished, 1 when any thread caught an exception.
int main() {
    std::vector<float> vec{2.0f, 1.0f};
    std::cout << "vec: " << std::endl;
    printarray(vec);
    std::vector<float> kernel(10000000, 5.0f);
    std::cout << "kernel: " << std::endl;
    printarray(kernel);

    // Shared failure flag; only written inside the `report` critical section
    // and only read after the implicit barrier that ends the parallel region.
    bool failed = false;

#pragma omp parallel
    {
#pragma omp master
        {
#pragma omp critical(report)
            std::cout << "Threads: " << omp_get_num_threads() << std::endl;
        }

        // OpenMP requires an exception thrown inside a parallel region to be
        // caught by the same thread inside that region, so the handler lives
        // here rather than around the pragma.
        try {
            af::array af_in(vec.size(), vec.data());
            af::array af_kernel(kernel.size(), kernel.data());
            af::array tmp = af::fftConvolve(af_in, af_kernel, AF_CONV_EXPAND);

            // Size the destination once and let ArrayFire copy straight into
            // it: no temporary host buffer to free, no repeated reallocation.
            std::vector<float> out(tmp.bytes() / sizeof(float));
            tmp.host(out.data());

            const int thr_num = omp_get_thread_num();
#pragma omp critical(report)
            std::cout << "Thread " << thr_num << " finished" << std::endl;
        } catch (const std::exception& e) {
            // af::exception derives from std::exception, so this also covers
            // allocation failures from the large host vectors.
#pragma omp critical(report)
            {
                fprintf(stderr, "%s\n", e.what());
                failed = true;
            }
        }
    }

    // Report failure through the exit status instead of rethrowing out of main.
    return failed ? 1 : 0;
}
This minimal example program, `example.cpp`, can be compiled using `g++ example.cpp -lafcpu -fopenmp`. However, it runs the convolutions sequentially rather than in parallel. This can be checked, for example, by observing the CPU load while the program runs or by measuring the runtime:
$ time OMP_NUM_THREADS=1 ./a.out
vec:
2, 1
kernel:
5, 5, 5, 5, 5, 5, 5, 5, 5, 5
Threads: 1
Thread 0 finished
real 0m1,745s
user 0m1,654s
sys 0m0,069s
$ time OMP_NUM_THREADS=8 ./a.out
vec:
2, 1
kernel:
5, 5, 5, 5, 5, 5, 5, 5, 5, 5
Threads: 8
Thread 2 finished
Thread 5 finished
Thread 6 finished
Thread 1 finished
Thread 0 finished
Thread 3 finished
Thread 7 finished
Thread 4 finished
real 0m11,944s
user 0m14,552s
sys 0m0,544s
I guess there must be some locking mechanism inside the function `af::fftConvolve` that prevents parallel execution, even though I construct separate `af::array` variables in the individual threads.
How can I parallelize these `af::fftConvolve` convolutions on the CPU?