I have to program the Floyd algorithm using OpenCL, it works fine but only with n<268. when n>=268 i have an "Access violation reading location" when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).
Here is my code:
graphe is an adjacency matrix, and distances is the distances matrix
int n;
printf("enter n value: ");
scanf("%d", &n);
printf("\n");
int n2 = n * n;
int matSize = n2 * sizeof(int*);
int* graphe = malloc(sizeof(int) * n2);
int* distances = malloc(sizeof(int) * n2);
//mat[i,j] => mat[i*n + j]
if (graphe == NULL)
printf("malloc failed\n");
init_graphe(graphe, n);
copy(graphe, distances, n);
initialization of opencl variables:
char* programSource = load_kernel("kernel.cl");
cl_int status;
// STEP 1: Discover and initialize the platforms
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
printf("Number of platforms = %d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
char Name[1000];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
printf("Name of platform : %s\n", Name);
fflush(stdout);
// STEP 2: Discover and initialize the devices
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("Number of devices = %d\n", (int)numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
for (int i = 0; i < numDevices; i++) {
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
printf("Name of device %d: %s\n\n", i, Name);
}
// STEP 3: Create a context
fflush(stdout);
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
// STEP 4: Create a command queue
fflush(stdout);
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
// STEP 5: Create device buffers
fflush(stdout);
cl_mem buffer_graphe;
cl_mem buffer_n;
cl_mem buffer_distances;
cl_mem buffer_k;
buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
fflush(stdout);
// STEP 6: Create and compile the program
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
printf("Compilation\n");
fflush(stdout);
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// STEP 8: Create the kernel
cl_kernel kernel = NULL;
fflush(stdout);
kernel = clCreateKernel(program, "floyd", &status);
size_t globalWorkSize[2] = { n, n };
size_t localWorkSize[3] = { 20,20 };
Execution of the kernel:
clock_t start = clock();
int k;
for (k = 0; k < n; k++) {
status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
clFinish(cmdQueue);
status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
clFinish(cmdQueue);
}
and the kernel:
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
int i = get_global_id(0);
int j = get_global_id(1);
int ij = i * (*n) + j;
int ik = i * (*n) + (*k);
int kj = (*k) * (*n) + j;
if (distances[ik] + distances[kj] < distances[ij]) {
distances[ij] = distances[ik] + distances[kj];
}
}
You have:
matSize
will be 800 on a 64-bit system. (sizeof(int*)
= 8)distances
array. (sizeof(int)
= 4, typically)matSize
) fromdistances
into your OpenCL buffer. This overflows the end of the array. Whoops.The bug is of course the use of
sizeof(int*)
: you've got an array ofint
s, not an array of pointers, so this should besizeof(int)
, which is what you're correctly doing in themalloc
call. (I can't quite fathom why you're not usingmatSize
there.) Although what you should probably be using isCLint
, or one of the explicitly-sized types (int32_t
in this case), because types in OpenCL kernels have very specific definitions which may or may not match those in host C code.Additional Notes:
ij
(written) for one of the work-items will be equal toik
(read) for the others in the row? Similar deal withij
andkj
.distances
buffer between iterations, if you're not modifying it on the host. Neither doesgraphe
need re-writing every time if it's not changing.k
andn
without a buffer.status = clSetKernelArg(kernel, 1, sizeof(n), &n);
works fine if you change your kernel signature's argument toint n
. (no dereference needed in the kernel then.)clFinish
calls, and you can change the buffer writes to be non-blocking once you've moved them outside the loop. This might give you an additional small speedup.