I have been experiment with using references for speed improvements. The (not working) trivial example below, probably won't see any improvements. However, I think by not copying the data into a separate (cost) function, should save some time for a non trivial example.
As for now I have the example in an R-project as three c++ files:
header
#ifndef ExampleInternal_H
#define ExampleInternal_H
namespace ExampleInternal{
#include <RcppArmadillo.h>
#include <RcppParallel.h>
#include <Rcpp.h>
void myfuncA(arma::rowvec &vec_in, arma::colvec& data){
vec_in.at(1) = vec_in.at(0)*arma::accu(data);
}
struct PARALLEL_WORKER : RcppParallel::Worker{
arma::mat &input_output;
const arma::colvec &data_in;
PARALLEL_WORKER(arma::mat &input_output, const arma::colvec &data_in);
void operator()(std::size_t begin, std::size_t end);
};
}
#endif
Function
#include <RcppArmadillo.h>
#include <RcppParallel.h>
#include "ExampleInternal.h"
using namespace ExampleInternal;
// [[Rcpp::export]]
arma::mat Parallelfunc(int Len_in, const arma::colvec data_in){
arma::mat input(Len_in, 2, arma::fill::zeros);
for(unsigned int i = 0; i < Len_in; i ++){
input.at(i, 0) =i;
}
ExampleInternal::PARALLEL_WORKER worker(input, data_in);
parallelFor(0, Len_in, worker);
return input;
}
Parallel Worker
#include <RcppArmadillo.h>
#include <RcppParallel.h>
#include "ExampleInternal.h"
using namespace RcppParallel;
using namespace ExampleInternal;
namespace ExampleInternal{
PARALLEL_WORKER::PARALLEL_WORKER(arma::mat &input_output, const arma::colvec &data_in) : input_output(input_output), data_in(data_in) {}
void PARALLEL_WORKER::operator()(std::size_t begin, std::size_t end){
for(unsigned int k = begin; k < end; k ++){
ExampleInternal::myfuncA(input_output.row(k), data_in);
}
}
}