How to transfer data from device memory subbuffer to host-program in OpenCL?

287 Views Asked by At

I’m learning OpenCL from Matthew Scarpino's "OpenCL in action" book. Chapter 3 contains material on the organization of buffers (pp.45-47) and subbuffers (pp.47-48). There is an inaccuracy in the code example for creating a subbuffer set out on page 47, which is disclosed by same questions here and here. I went further and decided to investigate the case of transfer the values stored in the subbuffer back to the host-program.

My host-program is transferring an array of integers iaArray1[5] = { 1, 2, 3, 4, 5 } into kernel. A buffer memObjArray1 is used to do it. From the iaArray1 array, the ipaArray2 array is obtained, storing the values { 3, 4, 5, 6, 7 } as the kernel sums the values of the array with the constant 2. The memObjArray2 output buffer is used to transfer the values of the ipaArray2 array entirely from the device to the host-program. Next, the memObjSubArray subbuffer is formed from the memObjArray2 buffer. An attempt is being made to transfer data from the device memory subbufer memObjSubArray to the host-program.

I believe that the mechanism for transferring data from the kernel to the host program is the same for both the buffer and the subbuffer. To do this, I used the same function clEnqueueReadBuffer(), but the program gave an error message. What am I doing wrong? What function should be used to transfer data from device memory subbuffer to host-program?

Kernel function is follows:

__kernel void good (global int* iaArray1, global int* iaArray2)
{
    int i=get_global_id(0);
    iaArray2[i]=iaArray1[i]+2;
}

Here is the code of my program. There are several simplifications in the presented program. First, the exit branches have been simplified to shorten the code. Secondly, the original program was designed to work with several cl-files, so some of the variables are arrays.

#include <CL\cl.h>
#include <stdio.h>
#include <stdlib.h>

#define PROGRAM_FILE_1 "good.cl"
//#define PROGRAM_FILE_2 "bad.cl"
//#define PROGRAM_FILE_3 "setminusone.cl"
#define NUM_OF_FILES 1

int main(){
    cl_platform_id *platforms;
    cl_uint numOfPlatforms;
    cl_int status;
    cl_device_id *devices;
    cl_uint numOfDevices;
    char caDeviceName[500];
    cl_context context;

    const char * kcpaFileName[NUM_OF_FILES] = { PROGRAM_FILE_1};
    FILE * pProgramHandler;
    char * cpaProgramBuffer[NUM_OF_FILES];
    size_t saProgramSize[NUM_OF_FILES] = { 0};
    cl_uint numOfEnters[NUM_OF_FILES] = { 0};

    cl_program program;
    const char kcaOptions[] = "-cl-finite-math-only -cl-no-signed-zeros";
    size_t sLogSize = 0;
    char * cpProgramLog;

    cl_uint numOfKernels = 0;
    cl_kernel * kernels;
    char caKernelName[20];

    cl_command_queue cmdQueue0;

    printf("Establishing number of available platforms... ");
    status = clGetPlatformIDs(NULL, NULL, &numOfPlatforms);
    if (status < 0){
        printf("FAIL to establish platform(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nEstablised %u platform(s).\nInitializing platform(s)... ", numOfPlatforms);
    platforms = (cl_platform_id *)malloc(numOfPlatforms*sizeof(cl_platform_id));
    status = clGetPlatformIDs(numOfPlatforms, platforms, NULL); //
    if (status < 0){
        printf("FAIL to initialize platform(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nEstablishing devices... ");
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, NULL, NULL, &numOfDevices);
    if (status < 0){
        printf("FAIL to establish device(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nEstablished %u device(s).\nInitializing device(s)... ", numOfDevices);
    devices = (cl_device_id *)malloc(numOfDevices*sizeof(cl_device_id));
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
    if (status < 0){
        printf("FAIL to initialize devices(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.");
    for (int i = 0; i < numOfDevices; i++){
        status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(caDeviceName), caDeviceName, NULL);
        if (status < 0){
            printf("FAIL to read device #%d name!> %d\n", i, status);
            system("PAUSE");
            exit(1);
        }
        printf("\nDevice #%d is \"%s\".", i, caDeviceName);
    }


    printf("\nCreating context... ");
    context = clCreateContext(NULL, numOfDevices, devices, NULL, NULL, &status);
    if (status < 0){
        printf("FAIL to create context!> %d\n", status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nReading source code from file(s)... ");
    for (int i = 0; i < NUM_OF_FILES; i++){
        pProgramHandler = fopen(kcpaFileName[i], "r");
        if (pProgramHandler == NULL){
            printf("FAIL to open file \"%s\"!> %d\n", kcpaFileName[i], status);
            system("PAUSE");
            exit(1);
        }
        fseek(pProgramHandler, 0, SEEK_END);
        saProgramSize[i] = ftell(pProgramHandler);
        rewind(pProgramHandler);
        cpaProgramBuffer[i] = (char*)malloc(sizeof(char)*saProgramSize[i] + 1);
        fread(cpaProgramBuffer[i], sizeof(char), saProgramSize[i], pProgramHandler);
        cpaProgramBuffer[i][saProgramSize[i]] = '\0';
        fclose(pProgramHandler);
        for (int j = 0; j < saProgramSize[i]; j++){
            if ((char)cpaProgramBuffer[i][j] == (char)10){
                numOfEnters[i]++;
            }
        }
        saProgramSize[i] = saProgramSize[i] - numOfEnters[i];
        cpaProgramBuffer[i][saProgramSize[i]] = '\0';
    }
    printf("OK.\nCreating program from source code... ");
    program = clCreateProgramWithSource(context, NUM_OF_FILES, (const char **)cpaProgramBuffer, (const size_t *)saProgramSize, &status);
    if (status < 0){
        printf("FAIL to create program!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nBuilding program... ");
    status = clBuildProgram(program, 1, devices, kcaOptions, NULL, NULL);//, 
    if (status < 0){
        printf("FAIL to build program.\n...Genetating log...");
        for (int i = 0; i < NUM_OF_FILES; i++){
            printf("\nCode from file \"%s\":\n%s", kcpaFileName[i], cpaProgramBuffer[i]);
        }
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &sLogSize);
        cpProgramLog = (char*)malloc(sizeof(char)*sLogSize + 1);
        cpProgramLog[sLogSize] = '\0';
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sLogSize + 1, cpProgramLog, NULL);
        printf("\nLog length is %d.\nLog:\n%s\n> %d\n", sLogSize, cpProgramLog, status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nDetermining number of kernels... ");
    status = clCreateKernelsInProgram(program, NULL, NULL, &numOfKernels);
    if (status < 0){
        printf("FAIL to determine number of kernels!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nDetermined %d kernel(s):", numOfKernels);
    kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*numOfKernels);
    clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);
    for (int i = 0; i < numOfKernels; i++){
        clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, sizeof(caKernelName), caKernelName, NULL);
        printf("\nKernel \"%s\" indexed at %d.", caKernelName, i);
    }

    printf("\nCreating command queue... ");
    cmdQueue0 = clCreateCommandQueue(context, devices[0], NULL, &status);
    if (status < 0){
        printf("FAIL to create command queue!> %d\n", status);
        system("PAUSE");
        exit(1);
    }

    /*Data, buffers and subbuffers*/
    int iaArray1[5] = { 1, 2, 3, 4, 5 };
    printf("\nPrinting out the initial array:\n");
    for (int i = 0; i < 5; i++){
        printf("%d ", iaArray1[i]);
    }
    printf("\nCreating buffers for kernels[0]... ");
    int* ipaArray2 = (int*)malloc(5 * sizeof(int));
    cl_mem memObjArray1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(iaArray1), &iaArray1, &status);
    if (status < 0){
        printf("\nFAIL to create memObjArray1 buffer!> %d \n", status);
        system("PAUSE");
        exit(1);
    }
    cl_mem memObjArray2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(iaArray1), NULL, &status);
    if (status < 0){
        printf("\nFAIL to create memObjArray2 buffer!> %d \n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nSetting arguments for kernels[0]... ");
    status = clSetKernelArg(kernels[0], 0, sizeof(cl_mem), &memObjArray1);
    if (status < 0){
        printf("\nFAIL to set memObjArray1 argument at kernels[0]!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    status = clSetKernelArg(kernels[0], 1, sizeof(cl_mem), &memObjArray2);
    if (status < 0){
        printf("\nFAIL to set memObjArray2 argument at kernels[0]!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nExecuting kernels[0]... ");
    size_t tGlobal_item_size = 5;   //?
    size_t tLocal_item_size = 1;    //?
    status = clEnqueueNDRangeKernel(cmdQueue0, kernels[0], 1, NULL, &tGlobal_item_size, &tLocal_item_size, 0, NULL, NULL);
    if (status < 0){
        printf("\nFAIL to enqueue kernels[0] into cmdQueue0!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjArray2, CL_TRUE, 0, 5 * sizeof(int), ipaArray2, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy results from device to host!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out the result array:\n");
    for (int i = 0; i < 5; i++){
        printf("%d ", ipaArray2[i]);
    }

    printf("\nCreating subbuffer... ");
    int iQuantity = 2;
    int iShift = 2;
    typedef struct _cl_buffer_region{
        size_t size;
        size_t origin;
    } cl_buffer_region;
    cl_buffer_region stRegion;
    stRegion.size = iQuantity * sizeof(int);
    stRegion.origin = iShift * sizeof(int);
    cl_mem memObjSubArray = clCreateSubBuffer(memObjArray2, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &stRegion, &status);
    if (status < 0){
        printf("FAIL to create subbuffer!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory subbuffer to host array... ");
    int* ipaSubArray = (int*)malloc(iQuantity*sizeof(int));
    status = clEnqueueReadBuffer(cmdQueue0, memObjSubArray, CL_TRUE, 0, iQuantity*sizeof(int), ipaSubArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy results from device to host!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out the result array:\n");
    for (int i = 0; i < iQuantity; i++){
        printf("%d ", ipaSubArray[i]);
    }


    printf("OK.\n...Releasing resources... ");
    clReleaseMemObject(memObjArray1);
    clReleaseMemObject(memObjArray2);
    clReleaseMemObject(memObjSubArray);

    clReleaseCommandQueue(cmdQueue0);
    free(kernels);
    clReleaseProgram(program);
    for (int i = 0; i < NUM_OF_FILES; i++){
        free(cpaProgramBuffer[i]);
    }
    clReleaseContext(context);
    free(devices);
    free(platforms);
    printf("OK.\nEnd of program. Bey!\n");
    system("PAUSE");
}

Program execution LOG-file

1

There are 1 best solutions below

0
On

It seems that there is no function of transferring data from a sub-buffer memory object located on the device to the memory of the host program. But the problem of the lack of observability of the sub-buffer data can be solved through the use of buffers and the data copy function clEnqueueCopyBuffer(). Its specification can be found here. Its fourth input argument specifies the offset of the data from the beginning of the source buffer. The fifth input argument specifies the offset of the data in the destination buffer. The sixth argument specifies the amount of data to copy.

An example of clEnqueueCopyBuffer() function call is as follows:

cl_int status = clEnqueueCopyBuffer(cmdQueue0, memObjInput, memObjOutput, 
                                    sizeof(int)*tSrcBufOffset,
                                    sizeof(int)*tDestBufOffset,
                                    sizeof(int)*tQuantityToCopy, 
                                    NULL, NULL, NULL); 

As an example, I wrote a program that replaces part of the buffer data. For the original integer array { 1, 2, 3, 4, 5 }, a buffer is created in device memory. The 2nd and 3rd elements are retrieved from this buffer and displayed on the screen: {2, 3}. Then, in the kernel, values of each element of the buffer is increased by two { 3, 4, 5, 6, 7 }. The result of kernel execution is returned to the host program and displayed. This is followed by the replacement of the 3rd and 4th elements of the buffer with the values stored in the auxiliary buffer: { 3, 4, 2, 3, 7 }.

The essence of the program data flow graphically looks like this:

program data flow essence

The main part of the code is presented below. To execute it just insert it into appropriate code previously given.

...
    /*Data and buffers*/
    /*kernels[0]*/
    
    // two arrays and buffers creation
    int iaInputArray[] = { 1, 2, 3, 4, 5 };
    int iSizeOfArray = 5;
    int* ipaOutputArray = (int*)malloc(iSizeOfArray*sizeof(int));
    cl_mem memObjInput;
    cl_mem memObjOutput;
    cl_mem memObjSubBuffer;
    size_t tGlobal_item_size = iSizeOfArray;    //?
    size_t tLocal_item_size = 1;    //?
    size_t tSrcBufOffset;           //offset in source buffer
    size_t tDstBufOffset;           //offset in destination buffer
    size_t tNumbOfElementsToCopy=2; //number of elements to copy
    int* ipaSubArray = (int*)malloc(tNumbOfElementsToCopy*sizeof(int));

    printf("OK.\nPrinting out initial input array:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ", iaInputArray[i]);
    }
    printf("\nCreating buffer memory objects... ");
    memObjInput = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(iaInputArray), &iaInputArray, &status);
    if (status < 0){
        printf("FAIL to create buffer for input data!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    memObjOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(iaInputArray), NULL, &status);
    if (status < 0){
        printf("FAIL to create buffer for output data!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    memObjSubBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(ipaSubArray), NULL, &status);
    if (status < 0){
        printf("FAIL to create buffer for output data!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nCopying 2nd and 3rd elements of the initial array into sub-buffer... ");
    tSrcBufOffset = 1;
    tDstBufOffset = 0;
    status = clEnqueueCopyBuffer(cmdQueue0, memObjInput, memObjSubBuffer, sizeof(int)*tSrcBufOffset, sizeof(int)*tDstBufOffset, sizeof(int)*tNumbOfElementsToCopy, NULL, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy buffers!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nTransferring copied elements to host-program... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjSubBuffer, CL_TRUE, 0, tNumbOfElementsToCopy*sizeof(int), ipaSubArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out copied elements:\n");
    for (int i = 0; i < tNumbOfElementsToCopy; i++){
        printf("%d ", ipaSubArray[i]);
    }
    printf("\nSetting kernel arguments... ");
    status = clSetKernelArg(kernels[0], 0, sizeof(cl_mem), &memObjInput);
    if (status < 0){
        printf("FAIL to set kernel argument #0!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    status = clSetKernelArg(kernels[0], 1, sizeof(cl_mem), &memObjOutput);
    if (status < 0){
        printf("FAIL to set kernel argument #1!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nExecuting kernel... ");
    status = clEnqueueNDRangeKernel(cmdQueue0, kernels[0], 1, NULL, &tGlobal_item_size, &tLocal_item_size, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to enqueue kernels[0] into cmdQueue0!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjOutput, CL_TRUE, 0, iSizeOfArray*sizeof(int), ipaOutputArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out data obtained from kernel:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ", ipaOutputArray[i]);
    }
    printf("\nChanging 3rd and 4th elements of data in output buffer... ");
    tSrcBufOffset = 0;
    tDstBufOffset = 2;
    status = clEnqueueCopyBuffer(cmdQueue0, memObjSubBuffer, memObjOutput, sizeof(int)*tSrcBufOffset, sizeof(int)*tDstBufOffset, sizeof(int)*tNumbOfElementsToCopy, NULL, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy buffers!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nTransferring results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjOutput, CL_TRUE, 0, iSizeOfArray*sizeof(int), ipaOutputArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out host array data:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ", ipaOutputArray[i]);
    }
    printf("\n...Releasing resources... ");
...

Print screen of program execution: program execution log