/* Pack the j-th row of phi into send_buf on the device, exchange it with the
 * neighboring rank via device-resident buffers (host_data/use_device), and
 * unpack the received row into phi[j+1].
 *
 * NOTE: all data clauses must use OpenACC array-section syntax
 * "buf[:count]"; the original "recv_buf[NX_GLOB+2*NGHOST]" (no colon)
 * names the SINGLE element at that index — one past the end of the
 * array — not the whole buffer, which can cause the runtime present-check
 * to fail or fault.
 *
 * NOTE(review): passing device pointers to MPI_Sendrecv requires a
 * CUDA-aware MPI build — confirm the MPI library supports it, otherwise a
 * segfault (signal 11) is expected even with correct directives. */
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST], recv_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];

/* Expose the device addresses of the buffers to the host-side MPI call. */
#pragma acc host_data use_device(send_buf, recv_buf)
{
  /* count iend+1 covers indices 0..iend; only ibeg..iend were packed —
   * presumably the leading ghost entries are don't-care; verify. */
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0, recv_buf, iend+1, MPI_DOUBLE, procR[1], 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}

/* Unpack the received halo row into phi[j+1] on the device. */
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST], recv_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
I need this portion of the code to run on the GPU, and I would also like the MPI message to involve only the device. For this purpose, I understand that `#pragma acc host_data use_device()`
should be ideal, but the compiler does not seem to recognize it (checked with -Minfo=accel) and I get an error when running the application:
BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
PID 36444 RUNNING AT Inspiron-7501
EXIT CODE: 139
CLEANING UP REMAINING PROCESSES
YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
Removing it, the code works correctly, but it is slow:
/* Fallback variant: stage the exchange through host memory with
 * "update self"/"update device" around a host-pointer MPI_Sendrecv.
 * Slower than the host_data version (two extra host<->device copies per
 * exchange), but works with non-CUDA-aware MPI.
 *
 * FIX: both the present clauses and the "update device" clause must use
 * array-section syntax "buf[:count]"; the original
 * "recv_buf[NX_GLOB+2*NGHOST]" (no colon) refers to the single
 * out-of-bounds element at that index, so the received data never
 * actually reached the device buffer as intended. */
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST], recv_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];

/* Copy the packed row device -> host before the MPI call. */
#pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0, recv_buf, iend+1, MPI_DOUBLE, procR[1], 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
/* Copy the received row host -> device (whole section, not one element). */
#pragma acc update device(recv_buf[:NX_GLOB+2*NGHOST])

/* Unpack the received halo row into phi[j+1] on the device. */
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST], recv_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];