I've attempted to parallelize a set of conditional statements, but the output stops matching the existing implementation after the first loop containing the kernel executes. (mapI is an int array of length 135; the mismatch appears at index 60 of the second loop, after 195 total accesses to mapI.) I've verified that all arrays pass to and from the kernel correctly by comparing them against the existing implementation, and I'm baffled as to why this computation returns the wrong result when it works for the first loop of the code. All OpenCL overhead functions return CL_SUCCESS.
cl_mem Q1cl, Q3cl;
Q1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q1, &err);
Q3cl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q3, &err);
nxcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, nx, &err);
nycl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, ny, &err);
mapIcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Ninflow), mapI, &err);
mapFcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Nfar), mapF, &err);
fluxQ1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, &err);
err = clSetKernelArg(kernel[7], 0, sizeof(cl_mem), (void*)&mapIcl);
err = clSetKernelArg(kernel[7], 1, sizeof(cl_mem), (void*)&nxcl);
err = clSetKernelArg(kernel[7], 2, sizeof(cl_mem), (void*)&nycl);
err = clSetKernelArg(kernel[7], 3, sizeof(cl_mem), (void*)&mapFcl);
err = clSetKernelArg(kernel[7], 4, sizeof(cl_mem), (void*)&Q1cl);
err = clSetKernelArg(kernel[7], 5, sizeof(cl_mem), (void*)&Q3cl);
err = clSetKernelArg(kernel[7], 6, sizeof(cl_mem), (void*)&fluxQ1cl);
globalWorkSize[0] = Ninflow; //Old implementation, now NEL
globalWorkSize[1] = Nfar; //Old implementation, now NFACES*NFQ
err = clEnqueueNDRangeKernel(queue[0], kernel[7], 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue[0], fluxQ1cl, CL_TRUE, 0, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, 0, NULL, NULL);
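(The err checks are elided above for brevity. For anyone reproducing this, every call is verified with a pattern like the following; CHECK is a hypothetical helper, not part of my actual code:)

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Hypothetical helper: abort with a message if an OpenCL call did not
   return CL_SUCCESS. */
#define CHECK(err, what)                                                  \
    do {                                                                  \
        if ((err) != CL_SUCCESS) {                                        \
            fprintf(stderr, "%s failed with error %d\n", what, (int)(err)); \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/* e.g. */
Q1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                      sizeof(double)*um->Npts*um->Nel, Q1, &err);
CHECK(err, "clCreateBuffer Q1cl");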
Kernel Code:
__kernel void umBC(__global int* mapI,
__global double* nx,
__global double* ny,
__global int* mapF,
__global double* Q1,
__global double* Q3,
__global double* fluxQ1)
{
    int id, idF;
    double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1} };
    double t_1 = 1. / 9.;
    double uf = 0.;
    double vf = 0.;
    int globalx = get_global_id(0);
    int globaly = get_global_id(1);

    id = mapI[globalx];
    fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)*((Q1[id]-Q3[id] - 2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[id] + e[1][1]*ny[id])) + 0.;

    uf = 0.01;
    vf = 0.;
    idF = mapF[globaly];
    fluxQ1[idF] = ((e[1][0]*nx[idF] + e[1][1]*ny[idF]) < 0)*((Q1[idF]-Q3[idF] - 2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[idF] + e[1][1]*ny[idF])) + 0.;
}
Edit: Below is the working implementation; thank you again doqtor and Lee for your help. To implement this I needed to change how mapI and mapF are laid out so that they match the sizing of fluxQ.
__kernel void umBC(__global int* mapI,
__global double* nx,
__global double* ny,
__global int* mapF,
__global double* Q1,
__global double* Q3,
__global double* fluxQ1)
{
    double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1} };
    double t_1 = 1. / 9.;
    double uf = 0.;
    double vf = 0.;
    int globalx = get_global_id(0); // NEL
    int globaly = get_global_id(1); // NFACES*NFQ
    int idx = globalx*NFACES*NFQ + globaly;        // one work item per fluxQ1 element
    double en = e[1][0]*nx[idx] + e[1][1]*ny[idx]; // e . n at this face point

    // Inflow boundary: mapI holds the sentinel everywhere except inflow points
    if (mapI[idx] != NEL*NFACES*NFQ+1000) {
        fluxQ1[idx] = 0.0;
        if (en < 0) {
            fluxQ1[idx] = (Q1[idx] - Q3[idx] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.) * en;
        }
    }

    uf = 0.01;
    vf = 0.;

    // Far-field boundary: mapF holds the sentinel everywhere except far-field points
    if (mapF[idx] != NEL*NFACES*NFQ+1000) {
        fluxQ1[idx] = 0.0;
        if (en < 0) {
            fluxQ1[idx] = (Q1[idx] - Q3[idx] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.) * en;
        }
    }
}
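For context, here is a sketch of how the resized mapI could be built on the host, under my assumptions about the original layout (the sentinel NEL*NFACES*NFQ+1000 marks every entry that is not an inflow point):

/* Sketch, assuming the original mapI held um->Ninflow indices into fluxQ1.
   The padded array has one entry per fluxQ1 element; everything that is
   not an inflow point holds the sentinel tested in the kernel. */
int total    = um->Nel * um->Nfaces * um->Nfq;
int sentinel = total + 1000;                 /* NEL*NFACES*NFQ + 1000 */
int *mapIpad = (int*)malloc(sizeof(int) * total);
for (int i = 0; i < total; i++)
    mapIpad[i] = sentinel;                   /* default: not a boundary point */
for (int i = 0; i < um->Ninflow; i++)
    mapIpad[mapI[i]] = mapI[i];              /* mark inflow face points */
/* mapF is padded the same way from its um->Nfar entries */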
You get wrong results because your kernel is not implemented correctly. @Lee already said that; I will try to explain further why.
For simplicity, let's consider executing the kernel over a 2x2 global range.
Now, the (globalx, globaly) for the four work items will be:

(0, 0), (0, 1), (1, 0), (1, 1)

Each work item executes both writes in the kernel, so the stores to fluxQ1 are:

work item (0, 0): fluxQ1[mapI[0]] and fluxQ1[mapF[0]]
work item (0, 1): fluxQ1[mapI[0]] and fluxQ1[mapF[1]]
work item (1, 0): fluxQ1[mapI[1]] and fluxQ1[mapF[0]]
work item (1, 1): fluxQ1[mapI[1]] and fluxQ1[mapF[1]]

Even assuming that mapI[0], mapI[1], mapF[0] and mapF[1] return unique ids, two work items write to the same location in parallel, which obviously can't give you correct results. This gets worse if mapI and mapF can return the same ids. Some of the results come out correct by luck.
Changing the number of global or local work items won't help, and using the select builtin or not won't help either. You need to ensure work items don't write to the same location at the same time!
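The general pattern, which the edited kernel above follows, is to give each output element exactly one owning work item and evaluate the condition per element. A minimal illustrative sketch (hypothetical names, not the poster's code):

__kernel void one_writer_per_element(__global const int* is_boundary,
                                     __global double* out)
{
    int idx = get_global_id(0);   // exactly one work item owns out[idx]
    if (is_boundary[idx]) {       // condition evaluated per element
        out[idx] = 0.0;           // no other work item ever writes this slot
    }
}

Because no two work items share an idx, the stores can never race; the conditional only decides whether the owning work item writes at all.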