I am a little confused about how to use the CULA device interface. Right now, I am using the CULA interface in a .cpp file and generating some random numbers in a .cu file.
cu file:
...
__global__ void kernel( double * A,double * B, curandState * globalState, int Asize, int Bsize )
{
// generate random numbers
...
// Host-side wrapper: allocates two host arrays, has the GPU fill matching
// device arrays via `kernel`, and copies the results back to the host.
//
// A_host, B_host : out-parameters; on return *A_host / *B_host point to
//                  malloc'd host buffers holding Asize / Bsize doubles.
//                  They are not freed in the code shown here, so the caller
//                  presumably owns them and must free() them -- confirm.
// Asize, Bsize   : number of doubles in each array.
//
// The device allocations visible here (A_dev, B_dev, devStates) are released
// before returning, so only host memory is handed back to the caller.
// `N` and `gpuErrchk` are defined outside this excerpt.
void kernel_wrapper(
double ** const A_host,
double ** const B_host,
const int Asize ,
const int Bsize )
{
...
// create random states: one curandState per thread of the setup launch below
curandState * devStates;
gpuErrchk( cudaMalloc( &devStates, N * sizeof(curandState) ) );
// allocate host memory (handed back to the caller through the out-parameters)
// NOTE(review): the malloc results are not checked in the visible code.
*A_host = (double*) malloc( Asize * sizeof(double) );
*B_host = (double*) malloc( Bsize * sizeof(double) );
// allocate device memory that the kernel writes into
double * A_dev, * B_dev;
gpuErrchk( cudaMalloc( (void**) &A_dev, Asize * sizeof(double) ) );
gpuErrchk( cudaMalloc( (void**) &B_dev, Bsize * sizeof(double) ) );
// setup seeds: a single block of N threads, seeded from the wall clock,
// so results differ from run to run
// NOTE(review): no launch error check is visible for this launch -- it may
// be in the elided lines below; confirm.
setup_kernel<<<1,N>>>( devStates, unsigned( time(NULL)) );
...
// generate random numbers (launched with a single block of a single thread)
kernel<<<1,1>>>( A_dev, B_dev, devStates, Asize, Bsize );
// a kernel launch returns no status itself: peek for launch errors, then
// synchronize so that errors raised during execution surface here
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
// copy result from device to host
gpuErrchk( cudaMemcpy( *A_host, A_dev, Asize * sizeof(double), cudaMemcpyDeviceToHost ) );
gpuErrchk( cudaMemcpy( *B_host, B_dev, Bsize * sizeof(double), cudaMemcpyDeviceToHost ) );
// clean up device memory; the host buffers stay alive for the caller
gpuErrchk( cudaFree( A_dev ) );
gpuErrchk( cudaFree( B_dev ) );
gpuErrchk( cudaFree( devStates ) );
return;
}
cpp file:
...
// Declaration of the wrapper that is implemented in the .cu file.
extern void kernel_wrapper(double** A,double** B, int Asize ,int Bsize);
...
// Host pointers; kernel_wrapper allocates the buffers with malloc and fills
// them with the values copied back from the GPU.
culaDouble* A;
culaDouble* B;
kernel_wrapper( &A, &B, Asize, Bsize );
...
// Host-interface CULA call: A and B are host pointers here.
// NOTE(review): the argument order looks like LAPACK dgels
// (trans, m, n, nrhs, a, lda, b, ldb) -- confirm against the CULA reference.
// NOTE(review): no free() of A / B is visible in this excerpt -- confirm
// that the caller releases them.
status = culaDgels('N',N,N, NRHS, A, N, B, N);
So, I am allocating host memory in the .cu file and passing it to the .cpp file.
What do I have to change if I want to use the CULA device interface instead?
I can't figure out how to manage the memory transfers.
I don't know CULA. However, after a brief look at the reference guide (which I suggest consulting before asking on SO), it appears that you can use the CULA device functions just like the host functions, except that you have to pass device memory pointers to them.
and in your cpp:
That's it. You don't even need host memory as long as everything remains in device memory.
Finally, may I suggest that you take a look at the CUDA Programming Guide? I think it will help you understand the difference between host and device memory, and how "memory transfers" to and from a CUDA device work.