As explained in the docs, one should pay attention when using auto
type deduction with eigen. e.g. the following function segfaults for me, and I'm okay with that:
struct MultiplyAdd{
Eigen::Matrix<double, DIMS, DIMS> mult;
Eigen::Matrix<double, DIMS, 1> add;
auto counterExample(Eigen::Matrix<double, DIMS, -1> const& data) const {
return (mult.sinh()*data).colwise() + add;
}
};
As far as I understand the docs above, I can either spell out the return type as an actual matrix (and not a lazy evaluated expression) or use eval()
.
#include <benchmark/benchmark.h>
#include <Eigen/Dense>
constexpr int DIMS = 10;
struct MultiplyAdd{
Eigen::Matrix<double, DIMS, DIMS> mult;
Eigen::Matrix<double, DIMS, 1> add;
Eigen::Matrix<double, DIMS, -1> transform(Eigen::Matrix<double, DIMS, -1> const& data) const {
return ((mult*data).colwise() + add).eval();
}
};
struct classic_worker{
static auto dowork(Eigen::Matrix<double, DIMS, -1> const& data, MultiplyAdd const& trafo) {
const auto flags = ((2.*trafo.transform(data)).row(2).array() > 1.7f).eval();
return flags;
}
};
struct allinline{
static auto dowork(Eigen::Matrix<double, DIMS, -1> const& data, MultiplyAdd const& trafo) {
const auto flags = ((2.*((trafo.mult * data).colwise() + trafo.add)).row(2).array() > 1.7f).eval();
return flags;
}
};
template <typename STRUCT>
static void quicktest(benchmark::State &state) {
Eigen::Matrix<double, DIMS, -1> data = Eigen::MatrixXd::Random(DIMS, state.range(0));
Eigen::Matrix<double, DIMS, DIMS> m = Eigen::MatrixXd::Random(DIMS, DIMS);
Eigen::Matrix<double, DIMS, 1> a = Eigen::MatrixXd::Random(DIMS, 1);
MultiplyAdd trafo{m,a};
for (auto _ : state) {
benchmark::DoNotOptimize(STRUCT::dowork(data, trafo));
}
}
// clang-format off
BENCHMARK_TEMPLATE(quicktest, classic_worker )->UseRealTime()->DenseRange(20,320, 50);
BENCHMARK_TEMPLATE(quicktest, allinline )->UseRealTime()->DenseRange(20,320, 50);
// clang-format on
BENCHMARK_MAIN();
In this longer example I have two implementations implementations of my MultiplyAdd
and compare them with google-benchmark. In one case I do my operations in the transform
method that I can put in a central header, in the other case I do all the lowlevel math on the "call site". In the example I only use row 2 of the computation. It appears
./iter_5 --benchmark_filter='.*(classic|all).*320.*' --benchmark_repetitions=10 --benchmark_report_aggregates_only
...
-----------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------------------------------
quicktest<classic_worker>/320/real_time_mean 3974 ns 3974 ns 10
quicktest<classic_worker>/320/real_time_median 3944 ns 3944 ns 10
quicktest<classic_worker>/320/real_time_stddev 121 ns 121 ns 10
quicktest<allinline>/320/real_time_mean 3308 ns 3308 ns 10
quicktest<allinline>/320/real_time_median 3285 ns 3285 ns 10
quicktest<allinline>/320/real_time_stddev 68.2 ns 68.2 ns 10
the all-at-call-site version is ~20% faster (with all the caveats of micro benchmarks).
My naive interpretation given the above link is that I'm avoiding lazy evaluation and needlessly compute all rows. (The speed difference appears independent of my choice for DIMS
between 3 and 100, so I suspect my interpretation is wrong.)
What is the correct pattern to write functions that return eigen matrix types (in the docs I only saw mentions of function arguments)?
EDIT: I have a new example that's closer in observations to my real case
#include <benchmark/benchmark.h>
#include <Eigen/Dense>
struct Trafo{
Eigen::Quaternion<double> mult;
Eigen::Matrix<double, 3, 1> add;
inline Eigen::Matrix<double, 3, -1> transform(Eigen::Matrix<double, 3, -1> const& data) const noexcept {
return ((mult.toRotationMatrix()*data).colwise() + add).eval();
}
inline auto counterExample(Eigen::Matrix<double, 3, -1> const& data) const noexcept {
return (mult.toRotationMatrix()*data).colwise() + add;
}
};
struct classic_worker{
static auto dowork(Eigen::Matrix<double, 3, -1> const& data, Trafo const& trafo) {
const auto flags = ((2.*trafo.transform(data)).row(2).array() > 1.7f).eval();
return flags;
}
};
struct counterExample_worker{
static auto dowork(Eigen::Matrix<double, 3, -1> const& data, Trafo const& trafo) {
const auto flags = ((2.*trafo.counterExample(data)).row(2).array() > 1.7f).eval();
return flags;
}
};
struct allinline{
static auto dowork(Eigen::Matrix<double, 3, -1> const& data, Trafo const& trafo) {
const auto flags = ((2.*((trafo.mult.toRotationMatrix() * data).colwise() + trafo.add)).row(2).array() > 1.7f).eval();
return flags;
}
};
template <typename STRUCT>
static void quicktest(benchmark::State &state) {
Eigen::Matrix<double, 3, -1> data = Eigen::MatrixXd::Random(3, state.range(0));
Eigen::Matrix<double, 4, 1> random = Eigen::MatrixXd::Random(4, 1);
Eigen::Quaternion<double> m{random(0), random(1), random(2), random(3)};
Eigen::Matrix<double, 3, 1> a = Eigen::MatrixXd::Random(3, 1);
Trafo trafo{m,a};
for (auto _ : state) {
benchmark::DoNotOptimize(STRUCT::dowork(data, trafo));
}
}
// clang-format off
BENCHMARK_TEMPLATE(quicktest, counterExample_worker )->UseRealTime()->DenseRange(20,320, 50);
BENCHMARK_TEMPLATE(quicktest, classic_worker )->UseRealTime()->DenseRange(20,320, 50);
BENCHMARK_TEMPLATE(quicktest, allinline )->UseRealTime()->DenseRange(20,320, 50);
// clang-format on
BENCHMARK_MAIN();
There are three ways how to do my computation:
- a function (
counterExample
called bycounterExample_worker
) that doesn't calleval
and thus lets a call site take care of evaluations - though this might (and seems to) lead to inefficient multiple evaluations - a function (
transform
called byclassic_worker
) that callseval
and thus evaluates its expression (which might not be needed) - all code at call site (done by
allinline
)
Compiled with g++8 and -O3 -march=native
(and all in the same translation unit, so the compiler can do all the inlining it wants) I see the following timing table
quicktest<counterExample_worker>/320/real_time_mean 52612 ns 52612 ns 10
quicktest<counterExample_worker>/320/real_time_median 55218 ns 55217 ns 10
quicktest<counterExample_worker>/320/real_time_stddev 8501 ns 8501 ns 10
quicktest<classic_worker>/320/real_time_mean 622 ns 622 ns 10
quicktest<classic_worker>/320/real_time_median 619 ns 619 ns 10
quicktest<classic_worker>/320/real_time_stddev 6.89 ns 6.89 ns 10
quicktest<allinline>/320/real_time_mean 428 ns 428 ns 10
quicktest<allinline>/320/real_time_median 426 ns 426 ns 10
quicktest<allinline>/320/real_time_stddev 5.68 ns 5.67 ns 10
It appears, not calling eval
is a bad idea here. However doing the transformation in its own function also comes at a price wrt writing everything in one line. So the question is rather: Is there a way to put the transformation here into separate functions without runtime slowdown? I do notice that (unfortunately) the results in this example depend heavily on the compiler and compilation flags (with g++-8 -msse3
the difference between allinline
and classic_worker
vanishes, clang++-10
appears to provide a faster allinline
with -msse3
and with -march=native
…)