getting wrong output for Parallel Floyd Warshall Algorithm in OpenCL

361 Views Asked by At
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

/*#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else*/
#include <CL/cl.h>
//#endif

#define DATA_SIZE 16

using namespace std;

const char *ProgramSource =
"__kernel void floydWarshallPass(__global uint * pathDistanceBuffer,const unsigned int numNodes, __global uint * result, const unsigned int pass)\n"\
"{\n"\
    "int xValue = get_global_id(0);\n"\
    "int yValue = get_global_id(1);\n"\
    "int k = pass;\n"\
    "int oldWeight = pathDistanceBuffer[yValue * 4 + xValue];\n"\
    "int tempWeight = (pathDistanceBuffer[yValue * 4 + k] + pathDistanceBuffer[k * 4 + xValue]);\n"\
    "if (tempWeight < oldWeight)\n"\
    "{\n"\
        "pathDistanceBuffer[yValue * 4 + xValue] = tempWeight;\n"\
        "result[yValue * 4 + xValue] = tempWeight;\n"\
    "}\n"\
"}\n"\
"\n";


int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;  
cl_mem inputA, inputB, output;
cl_int numNodes;
size_t global;

float inputDataA[16] = {0,2,3,4,5,0,7,8,9,10,0,12,13,14,15,0};
float results[16]={0};

int i,j;
numNodes = 16;



if(clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS)
{
    printf("Unable to get platform id\n");
    return 1;
}


// try to get a supported GPU device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
return 1;
}

// context properties list - must be terminated with 0
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;

// create a context with the GPU device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

// create command queue using the context and device
command_queue = clCreateCommandQueue(context, device_id, 0, &err);

// create a program from the kernel source code
program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err);

// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}

// specify which kernel from the program to execute
kernel = clCreateKernel(program, "floydWarshallPass", &err);

// create buffers for the input and ouput

inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);

// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);

// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);

global=DATA_SIZE;

// enqueue the kernel command for execution
for(cl_uint sh=0; sh<16; sh++)
{
clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *)&sh);
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float)*DATA_SIZE, results, 0, NULL, NULL);

//clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
//clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes);
//clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
clFinish(command_queue);

}
clFinish(command_queue);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL);

// print the results
printf("output: ");

for(i=0;i<16; i++)
{
printf("%f ",results[i]);
}

// cleanup - release OpenCL resources
clReleaseMemObject(inputA);
//clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);

return 0;

}

I am getting -0.00000 output for every node.

P.S i am running my code on CL_DEVICE_TYPE_CPU because on GPU it is giving error that cannot get device id.

Please give some guidance on how to get correct output.

1

There are 1 best solutions below

0
On

I think your question is a little too broad, you should have narrowed your code a little bit. I'll try to help you with some errors I found on your code, but I didn't debug or compile it, so those issues I describe here are only something for you to start looking at.

  • Why are you calling get_global_id with parameter 1 on your kernel? Back at your clEnqueueNDRangeKernel you specified that your work-items dimension is only one, so your get_global_id is querying for a non-existing dimension. If you want to translate a single dimension coordinate into two coordinate, you should use a transformation such as below:
int id = get_global_id(0);
int x = id % size->width;
int y = id / size->height;
  • Pay attention when you use sizeof(float) to measure the size of the data types: they may be not of the same size inside the OpenCL implementation. Use sizeof(cl_float) instead.

  • Maybe you are not getting any GPU because you don't have the proper drivers installed on your computer. Go to the GPU vendor website and look for runtime drivers for OpenCL.

Take a look at those pages from the OpenCL specification