I'm trying to implement Floyd Warshall algorithm using cuda but I'm having syncrhornization problem. This is my code:
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
This is my initial variables:
#define GRAPH_SIZE 2000
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16 // Each block have 16 * 16 = 256 threads
#define BLOCKS_PER_GRAPH_SIDE GRAPH_SIZE / THREADS_PER_BLOCK_SIDE
This is how I'm generating the graph:
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 40;
if (r > 20) {
r = INF;
}
D(i, j) = r;
}
}
}
}
When I set GRAPH_SIZE to a smaller number like 100 the result is incorrect.
I have written the algorithm sequentially on the cpu like the code bellow:
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
And I run and test it like this:
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
}
else {
fprintf(stderr, "SUCCESS!\n");
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
Can anyone give me an ideia how to solve this?
The main problem I could find seems to be that your grid sizing is not done correctly.
With N=2000 and thread block side dimension of 16, that happens to be whole-number divisible. But if you reduce N to 100, it is not.
We can fix that by "rounding up" your grid dimensions:
And adding a thread-check to your kernel:
Here's a modified code that seems to run correctly for me:
(I've modified your test case slightly as it was producing an output matrix that was mostly zero. )