I just started to play with Boost.Compute, to see how much speed it can bring to us, I wrote a simple program:
#include <iostream>
#include <vector>
#include <algorithm>
#include <boost/foreach.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/platform.hpp>
#include <boost/compute/algorithm.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional/math.hpp>
#include <boost/compute/types/builtin.hpp>
#include <boost/compute/function.hpp>
#include <boost/chrono/include.hpp>
namespace compute = boost::compute;
int main()
{
// generate random data on the host
std::vector<float> host_vector(16000);
std::generate(host_vector.begin(), host_vector.end(), rand);
BOOST_FOREACH (auto const& platform, compute::system::platforms())
{
std::cout << "====================" << platform.name() << "====================\n";
BOOST_FOREACH (auto const& device, platform.devices())
{
std::cout << "device: " << device.name() << std::endl;
compute::context context(device);
compute::command_queue queue(context, device);
compute::vector<float> device_vector(host_vector.size(), context);
// copy data from the host to the device
compute::copy(
host_vector.begin(), host_vector.end(), device_vector.begin(), queue
);
auto start = boost::chrono::high_resolution_clock::now();
compute::transform(device_vector.begin(),
device_vector.end(),
device_vector.begin(),
compute::sqrt<float>(), queue);
auto ans = compute::accumulate(device_vector.begin(), device_vector.end(), 0, queue);
auto duration = boost::chrono::duration_cast<boost::chrono::milliseconds>(boost::chrono::high_resolution_clock::now() - start);
std::cout << "ans: " << ans << std::endl;
std::cout << "time: " << duration.count() << " ms" << std::endl;
std::cout << "-------------------\n";
}
}
std::cout << "====================plain====================\n";
auto start = boost::chrono::high_resolution_clock::now();
std::transform(host_vector.begin(),
host_vector.end(),
host_vector.begin(),
[](float v){ return std::sqrt(v); });
auto ans = std::accumulate(host_vector.begin(), host_vector.end(), 0);
auto duration = boost::chrono::duration_cast<boost::chrono::milliseconds>(boost::chrono::high_resolution_clock::now() - start);
std::cout << "ans: " << ans << std::endl;
std::cout << "time: " << duration.count() << " ms" << std::endl;
return 0;
}
And here's the sample output on my machine (win7 64-bit):
====================Intel(R) OpenCL====================
device: Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz
ans: 1931421
time: 64 ms
-------------------
device: Intel(R) HD Graphics 4600
ans: 1931421
time: 64 ms
-------------------
====================NVIDIA CUDA====================
device: Quadro K600
ans: 1931421
time: 4 ms
-------------------
====================plain====================
ans: 1931421
time: 0 ms
My question is: why is the plain (non-opencl) version faster?
I can see one possible reason for the big difference. Compare the CPU and the GPU data flow:-
Given this, it appears that the Intel chip is just a bit rubbish at general compute, the NVidia is probably suffering from the extra data copying and setting up the GPU to do the calculation.
You should try the same program but with a much more complex operation - sqrt and sum are too simple to overcome the extra overhead of using the GPU. You could try calculating Mandlebrot points for instance.
In your example, moving the lambda into the accumulate would be faster (one pass over memory vs. two passes)