#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define CL_TARGET_OPENCL_VERSION 210
#include "CL/cl.h"
/*
 * OpenCL C source of the test kernel; it is compiled at run time by the host.
 *
 * Each work-item takes its global id i in dimension 0 and stores
 * y[i] = a * x[i].  Despite the name SAXPY, the previous contents of y are
 * not added in.  The kernel performs no bounds check, so the global work
 * size must not exceed the number of elements in either buffer.
 *
 * Declared as pointer-to-const because it points at a string literal and
 * because clCreateProgramWithSource() takes a `const char **`.
 */
const char *program_src = "__kernel void SAXPY (__global float* x, __global float* y, float a)\n"
"{\n"
"const int i = get_global_id (0);\n"
"y [i] = a * x [i];\n"
"}\n";
int main() {
cl_platform_id platform_ids[16];
cl_uint platform_count;
if (clGetPlatformIDs(16, &platform_ids, &platform_count) != CL_SUCCESS) {
return EXIT_FAILURE;
}
printf("%i cl platform(s) found\n", platform_count);
if (platform_count == 0) {
return EXIT_FAILURE;
}
printf("choosing platform 0...\n");
cl_device_id device_ids[16];
cl_int device_count;
if (clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_ALL, 16, &device_ids, &device_count) != CL_SUCCESS) {
return EXIT_FAILURE;
}
printf("%i cl device(s) found on platform 0\n", device_count);
if (device_count == 0) {
return EXIT_FAILURE;
}
cl_device_id device = device_ids[0];
printf("** running test **\n");
cl_int cl_fehler;
cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &cl_fehler);
if (ctx == NULL) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clCommandQueue\n");
cl_fehler = CL_SUCCESS;
cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &cl_fehler);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
// Replace 1 mit Zahlen von Gärate IDs
printf("Am clCreateProgram\n");
cl_fehler = CL_SUCCESS;
cl_program program = clCreateProgramWithSource(ctx, 1, &program_src, NULL, &cl_fehler);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clBuildProgram\n");
cl_fehler = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clCreateKernel\n");
cl_fehler = CL_SUCCESS;
cl_kernel kernel = clCreateKernel(program, "SAXPY", &cl_fehler);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clCreateBuffer\n");
cl_fehler = CL_SUCCESS;
cl_mem eingabe_buffer = clCreateBuffer(ctx, CL_MEM_READ_ONLY, sizeof(cl_float) * 10, NULL, &cl_fehler);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clCreateBuffer\n");
cl_fehler = CL_SUCCESS;
cl_mem ausgabe_buffer = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float) * 10, NULL, &cl_fehler);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
cl_float eingabe_data[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
printf("Am clEnqueueWriteBuffer\n");
cl_fehler = clEnqueueWriteBuffer(queue, eingabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &eingabe_data, 0, NULL, NULL);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clSetKernelArg\n");
cl_fehler = clSetKernelArg(kernel, 0, sizeof(cl_mem), &eingabe_buffer);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clSetKernelArg\n");
cl_fehler = clSetKernelArg(kernel, 1, sizeof(cl_mem), &ausgabe_buffer);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clSetKernelArg\n");
cl_float f = 2.0;
cl_fehler = clSetKernelArg(kernel, 2, sizeof(cl_float), &f);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
printf("Am clEnqueueNDRangeKernel\n");
const size_t globalWorkSize[3] = { 10, 0, 0 };
cl_fehler = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
clFinish(queue);
printf("Am clEnqueueReadBuffer\n");
cl_float ausgabe_data[10] = {1, 1, 1, 1, 1, 1, 1 ,1 ,1 ,1 };
cl_fehler = clEnqueueReadBuffer(queue, ausgabe_buffer, CL_TRUE, 0, sizeof(cl_float) * 10, &ausgabe_data[0], 0, NULL, NULL);
if (cl_fehler != CL_SUCCESS) {
printf("Fehler: %i\n", cl_fehler);
return EXIT_FAILURE;
}
// No point in share the rest of the code because the problem is being happening here
}
Hardware:
Trying another author's example, as follows, gives the same mixed results: it sometimes works, but most of the time the kernel hangs at clEnqueueReadBuffer. The kernel does actually run, but 9 out of 10 times the program hangs.
set -e
git clone https://github.com/ULHPC/tutorials.git
cd tutorials/gpu/opencl/code
clang++ exercise3.cpp -lOpenCL -I/opt/rocm/include/ -L/opt/rocm/lib/ -o test
./test
The host runs openSUSE Leap 15.5 x86_64 on an AMD Ryzen 7 5700U with Radeon Graphics (16) @ 1.800GHz (GPU: AMD ATI 04:00.0 Lucienne), with the AMD ROCm drivers installed system-wide.
I am compiling with clang src/main.c -lOpenCL -I/opt/rocm-6.0.0/include/ -L/opt/rocm-6.0.0/lib/ -o test
I also have the rocm-opencl-sdk package installed from here, which is the vendor's (AMD's) official page.