I am using the ArrayFire library for signal processing, and I am curious about how to make my code more efficient. I read the vectorization guide in the docs, but I just ended up using the gfor construct. Is it possible to improve the efficiency? Is there a better way to vectorize this? (I hope there is :)
Note: I am targeting CUDA performance.
Here is the code that I am trying to improve:
#include <arrayfire.h>
#include <stdio.h>
#include <af/util.h>
// Problem dimensions. These never change at run time, so they are
// compile-time constants rather than mutable globals.
static constexpr int proc_size = 1024;
// Number of independent products computed by fn(); also the leading
// dimension of S and R and the third dimension of B.
static constexpr int fft_size = proc_size * 4;
// Inner dimension of each product (columns of S, rows of each B slice).
static constexpr int staves = 288;
// Number of output columns per product (columns of each B slice and of R).
static constexpr int beams = 256;
// Working arrays shared between main() (which fills them) and fn() (which
// reads S and B and writes R). They are default-constructed here and only
// receive real data once main() runs.
// NOTE(review): objects with static storage duration are constructed before
// main() and destroyed after it returns; confirm that the ArrayFire runtime
// in use tolerates af::array objects with that lifetime.
namespace
{
af::array S; // left-hand operand: one row per product
af::array B; // right-hand operand: one 2-D slice per product
af::array R; // result: one row per product
} // namespace
void fn()
{
gfor ( af::seq i, fft_size )
R( i , af::span ) = matmul( S( i , af::span ) , B( af::span , af::span , i ) );
}
/// Program entry point: selects the device, fills the input arrays with
/// random complex data, times fn() and prints the result.
///
/// Returns 0 on success and 1 if ArrayFire reports an error (the error text
/// is written to stderr).
int main(int, char **)
{
    try
    {
        // Select and report the device before any array is created, so every
        // allocation below lands on the intended device and any failure is
        // caught by the handler at the bottom.
        af::setDevice(0);
        af::info();

        // Random complex inputs. A single randn call already fills all of S;
        // generating random rows again inside a gfor is unnecessary, and the
        // ArrayFire gfor guide advises generating random data outside gfor.
        S = af::randn(fft_size, staves, c32);
        B = af::randn(staves, beams, fft_size, af::dtype::c32);

        // Zero-initialised destination for the products.
        R = af::constant(af::cfloat{0, 0}, fft_size, beams);

        const double elapsed = af::timeit(fn);
        printf("Took %f secs.\n", elapsed);
    }
    catch (const af::exception &ex)
    {
        fprintf(stderr, "%s\n", ex.what());
        // Report failure through the exit status instead of rethrowing out of
        // main, which would end in std::terminate.
        return 1;
    }
    return 0;
}