How to convert a cudaArray to a Torch tensor?


I am experimenting with Torch and CUDA. Using torch::from_blob() I was able to do the following so far:

#include <cuda_runtime.h>
#include <torch/torch.h>
#include <iostream>
#include <exception>
#include <memory>
#include <math.h>

using std::cout;
using std::endl;
using std::exception;

/*
 * Demonstration of interoperability between CUDA and Torch C++ API using 
 * pinned memory.
 *
 * Using the ENABLE_ERROR variable a change in the result (CUDA) can be
 * introduced through its respective Torch tensor. This will also affect
 * the copied data from GPU to CPU, resulting in an error during assert
 * checks at the end
 */

// Contains the call to the CUDA kernel
void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size);

bool ENABLE_ERROR = false;

int main(int argc, const char* argv[])
{
    // Setup array, here 2^16 = 65536 items
    const int N = 1 << 16;
    size_t bytes = N * sizeof(int);

    // Declare pinned memory pointers
    int* a_cpu, * b_cpu, * c_cpu;

    // Allocate pinned memory for the pointers
    // The memory will be accessible from both CPU and GPU
    // without the requirement to copy data from one device
    // to the other
    cout << "Allocating memory for vectors on CPU" << endl;
    cudaMallocHost(&a_cpu, bytes);
    cudaMallocHost(&b_cpu, bytes);
    cudaMallocHost(&c_cpu, bytes);

    // Init vectors
    cout << "Populating vectors with random integers" << endl;
    for (int i = 0; i < N; ++i)
    {
        a_cpu[i] = rand() % 100;
        b_cpu[i] = rand() % 100;
    }

    // Declare GPU memory pointers
    int* a_gpu, * b_gpu, * c_gpu;

    // Allocate memory on the device
    cout << "Allocating memory for vectors on GPU" << endl;
    cudaMalloc(&a_gpu, bytes);
    cudaMalloc(&b_gpu, bytes);
    cudaMalloc(&c_gpu, bytes);

    // Copy data from the host to the device (CPU -> GPU)
    cout << "Transfering vectors from CPU to GPU" << endl;
    cudaMemcpy(a_gpu, a_cpu, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(b_gpu, b_cpu, bytes, cudaMemcpyHostToDevice);

    // Specify threads per CUDA block (CTA), here 2^10 = 1024 threads
    int NUM_THREADS = 1 << 10;

    // CTAs per grid
    int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;

    // Call CUDA kernel
    cout << "Running CUDA kernels" << endl;
    vector_add(a_gpu, b_gpu, c_gpu, N, NUM_BLOCKS, NUM_THREADS);

    try
    {
        // Convert pinned memory on GPU to Torch tensor on GPU
        auto options = torch::TensorOptions().dtype(torch::kInt).device(torch::kCUDA, 0).pinned_memory(true);
        cout << "Converting vectors and result to Torch tensors on GPU" << endl;
        torch::Tensor a_gpu_tensor = torch::from_blob(a_gpu, { N }, options);
        torch::Tensor b_gpu_tensor = torch::from_blob(b_gpu, { N }, options);
        torch::Tensor c_gpu_tensor = torch::from_blob(c_gpu, { N }, options);

        cout << "Verifying result using Torch tensors" << endl;
        if (ENABLE_ERROR)
        {
            /*
            TEST
            Changing the value of the result should do two things:
             - the GPU memory will be modified
             - the CPU test later on (after the GPU memory is copied to the CPU side) should fail
            */
            cout << "ERROR GENERATION ENABLED! Application will crash during verification of results" << endl;
            cout << "Changing result first element from " << c_gpu_tensor[0];
            c_gpu_tensor[0] = 99999999;
            cout << " to " << c_gpu_tensor[0] << endl;
        }
        else
        {
            assert(c_gpu_tensor.equal(a_gpu_tensor.add(b_gpu_tensor)) == true);
        }
    }
    catch (exception& e)
    {
        cout << e.what() << endl;

        cudaFreeHost(a_cpu);
        cudaFreeHost(b_cpu);
        cudaFreeHost(c_cpu);

        cudaFree(a_gpu);
        cudaFree(b_gpu);
        cudaFree(c_gpu);

        return 1;
    }

    // Copy memory back to the host and also synchronize (implicitly)
    cout << "Synchronizing CPU and GPU. Copying result from GPU to CPU" << endl;
    cudaMemcpy(c_cpu, c_gpu, bytes, cudaMemcpyDeviceToHost);

    // Verify the result on the CPU
    cout << "Verifying result on CPU" << endl;
    for (int i = 0; i < N; ++i)
    {
        assert(c_cpu[i] == a_cpu[i] + b_cpu[i]);
    }

    cudaFreeHost(a_cpu);
    cudaFreeHost(b_cpu);
    cudaFreeHost(c_cpu);

    cudaFree(a_gpu);
    cudaFree(b_gpu);
    cudaFree(c_gpu);

    return 0;
}

with a kernel

__global__ void vector_add_kernel(int* a, int* b, int* c, int N)
{
    // Calculate global thread ID
    int t_id = (blockDim.x * blockIdx.x) + threadIdx.x;

    // Check boundary
    if (t_id < N)
    {
        c[t_id] = a[t_id] + b[t_id];
    }
}

void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size)
{
    vector_add_kernel<<<cuda_grid_size, cuda_block_size>>>(a, b, c, N);
    cudaGetLastError();
}

The code above uses pinned memory (for fast transfers between CPU and GPU) and adds two vectors using the kernel shown. In addition, I convert the GPU memory blocks used for those vectors into libtorch tensors, all while remaining on the GPU, and perform the same operation through the tensors. I even added a small "error" that allowed me to verify that the data I initially allocate (outside of the tensors) is actually changed when the tensors are manipulated.

I have also managed to use cv::Mat's data member, a void pointer to the pixel data of an OpenCV image, with torch::from_blob() successfully, e.g.

auto tensor_input = torch::from_blob(img_torch.data, { 1, img_torch.size().height, img_torch.size().width, 1 }, torch::kFloat32);
tensor_input = tensor_input.permute({ 0, 3, 1, 2 });

for a BGRA (PNG) image that I had to convert to CV_32FC3 in order to use it with my ML model, while also playing around a bit with the tensor's shape (the permute() above).
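
For completeness, the preprocessing in front of that from_blob() call was roughly the following (a sketch with assumed file name and conversion; the channel count in the from_blob() shape has to match whatever format the cv::Mat ends up in):

// Hypothetical preprocessing (names and conversion are illustrative):
// load the BGRA PNG and convert it to the float format the model expects
cv::Mat img = cv::imread("input.png", cv::IMREAD_UNCHANGED); // 8-bit BGRA
cv::Mat img_bgr, img_torch;
cv::cvtColor(img, img_bgr, cv::COLOR_BGRA2BGR);              // drop alpha: 4 -> 3 channels
img_bgr.convertTo(img_torch, CV_32FC3, 1.0 / 255.0);         // uint8 -> float32 in [0, 1]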

I am unable to do this with a cudaArray and would like to know if that is even possible.

The reason why I am using a cudaArray is that, just like in the description of this type, I am storing a texture (in my case a D3D11 2D texture) that I need to process. I am actually able to do that using pure CUDA kernel that I've written myself, while also using cudaSurfaceObject_t, which I doubt I can pass onto libtorch in any shape or form.
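
For context, the surface-object route I use today looks roughly like this (a minimal sketch, error checking omitted; cu_arr stands for the cudaArray mapped from the D3D11 texture):

// Wrap the cudaArray in a surface object so a hand-written kernel can
// read/write it with surf2Dread()/surf2Dwrite()
cudaResourceDesc res_desc = {};
res_desc.resType = cudaResourceTypeArray;
res_desc.res.array.array = cu_arr;

cudaSurfaceObject_t surf = 0;
cudaCreateSurfaceObject(&surf, &res_desc);

// ... launch the processing kernel that operates on surf ...

cudaDestroySurfaceObject(surf);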

I am looking for something in the lines of (pseudo-code):

// Register the D3D11 texture as cudaGraphicsResource* cu_arr_interop and map it with cudaGraphicsMapResources(...)
...

// Map the texture's texels to a CUDA array
cudaArray* cu_arr;
cudaGraphicsSubResourceGetMappedArray(&cu_arr, cu_arr_interop, 0, 0);

// Convert the CUDA array to a Torch tensor
auto options = torch::TensorOptions().dtype(...).device(torch::kCUDA, 0).pinned_memory(true);
auto tensor_in = torch::from_blob((void*)cu_arr, { ... }, options);

// Run ML model
auto tensor_out = module.forward({ tensor_in }).toTensor();

// See result on screen
...

// cudaGraphicsUnmapResources(...)
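
For reference, the registration/mapping boilerplate sketched above is the standard CUDA/D3D11 interop sequence, roughly (a sketch with assumed names; tex stands for the ID3D11Texture2D* and error checking is omitted):

#include <cuda_d3d11_interop.h>

cudaGraphicsResource* cu_arr_interop = nullptr;
cudaGraphicsD3D11RegisterResource(&cu_arr_interop, tex, cudaGraphicsRegisterFlagsNone);

cudaGraphicsMapResources(1, &cu_arr_interop, 0);

cudaArray* cu_arr = nullptr;
cudaGraphicsSubResourceGetMappedArray(&cu_arr, cu_arr_interop, 0, 0);

// ... use cu_arr ...

cudaGraphicsUnmapResources(1, &cu_arr_interop, 0);
cudaGraphicsUnregisterResource(cu_arr_interop);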

There is 1 answer below.

BEST ANSWER

Following the comments, I managed to map the data from and to a CUDA array. The intermediate libtorch tensor is fully functional.

Code for CUDA array to libtorch tensor:

cudaError_t cr = cudaSuccess;

// Allocate linear CUDA memory
void* copy = nullptr;
cr = cudaMalloc(&copy, dpitch * height);
if (cr != cudaSuccess)
{
    ...
}

// Copying the input CUDA array to the flat CUDA memory
cr = cudaMemcpy2DFromArray(copy, dpitch, array_read, 0, 0, dpitch, height, cudaMemcpyDeviceToDevice);
if (cr != cudaSuccess)
{
    ...
}

// Setup tensor that maps the flat CUDA memory so that it can be used in libtorch
at::Tensor tensor_in;
auto options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA, 0).pinned_memory(true);
// Map memory as a HEIGHTxWIDTHxCHANNELS tensor that will represent the image with its 4 channels
tensor_in = torch::from_blob(copy, { height, width, 4 }, options);
// Permute so that the channels are the first dimension. This allows extracting the pixel data per channel as a separate tensor
tensor_in = tensor_in.permute({2, 0, 1});
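
Here dpitch, width, height and array_read are assumed to come from the texture/resource description; for a tightly packed 4-channel 8-bit image the pitch is simply:

// Assumed setup (illustrative): array_read is the mapped cudaArray holding the
// RGBA8 texture, width/height are its dimensions
size_t dpitch = width * 4 * sizeof(unsigned char); // bytes per row, tightly packed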

Further conversions depend on the model that will be used for inference on the given tensor. The permutation above allows each channel to be extracted as a separate tensor. In my case I had to do some extra conversions to make the tensors compatible with my model, e.g.

// Extract channels and convert to tensors that are compatible with the expected input for the ML
at::Tensor tensor_in_R, tensor_in_G, tensor_in_B, tensor_in_A;
tensor_in_R = tensor_in[0].div(255.0).unsqueeze(0).unsqueeze(0).to(torch::kFloat32);
tensor_in_G = ...
tensor_in_B = ...
tensor_in_A = ...

The copying back is done with

// Copy tensor to the CUDA output array
cr = cudaMemcpy2DToArray(array_write,
    0, 0,
    tensor_out.data_ptr(),
    dpitch, dpitch,
    height, cudaMemcpyDeviceToDevice);

where dpitch is equal to width * sizeof(unsigned char) * 4. However, this does not work as expected (see the final note below).

The output tensor (the result from the inference) needs to be post-processed: (un)squeezing dimensions where necessary, permuting, converting back to the original data format (e.g. torch::kUInt8 in my case), and so on.

Two steps are very important, namely:

  • Merging - if you are splitting the image and processing each channel separately, you will have to merge the results (here R, G, B and A). In my case I did it with cat(), which concatenates tensors along an existing dimension.

    tensor_out = torch::cat({
        tensor_out_R.unsqueeze(0),
        tensor_out_G.unsqueeze(0),
        tensor_out_B.unsqueeze(0),
        tensor_in_processed[3].unsqueeze(0)
    }).permute({ 2, 0, 1 });
    
  • Flattening - it took me about two days to figure out this quite obvious necessity. The tensor has a different memory layout, so in order to copy it back to the CUDA array it needs to be flattened. The default flatten() squashes the given tensor into a one-dimensional array. By doing so you can even reuse the pitches that were used in cudaMemcpy2DFromArray(), provided the format (incl. dimensions) of your input CUDA array is the same as the output's. The post-processing I ended up with is sketched below.
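
Putting it together, the step before the copy back looked roughly like this (a sketch, assuming the merged result is a CxHxW float tensor in [0, 1]; adapt the permute and scaling to your model's output):

// Back to HxWxC, 8-bit, contiguous and flat, so the memory layout matches the
// row-major buffer that cudaMemcpy2DToArray() reads from
tensor_out = tensor_out.permute({ 1, 2, 0 })
                       .mul(255.0)
                       .clamp(0, 255)
                       .to(torch::kUInt8)
                       .contiguous()
                       .flatten();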

Due to the poor documentation of libtorch with regard to exceptions and error handling overall, I recommend dumping intermediate results from C++ to serialized tensor files. These can then be loaded in Python using

t_from_cpp = list(torch.jit.load('tensor_cpp_dump.pt').parameters())[0]
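
The dump itself on the C++ side is a one-liner with torch::save() (tensor_in here stands for whichever intermediate tensor you want to inspect):

#include <torch/serialize.h>

// Serialize the tensor; the file can then be loaded from Python via torch.jit.load()
torch::save(tensor_in, "tensor_cpp_dump.pt");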

You can use torchvision.transforms with ToPILImage() to visualize the tensor or save it as an image. Checking the shape and experimenting with various conversions of the tensor in Python offers a fast way to reach a solution, which you can then transfer back to C++. By comparison, running inference in C++ gives almost zero feedback when an error occurs, while in PyTorch you will often get a helpful description of what went wrong, including a full trace.


Whenever I see channels being displayed as separate images or some other weird result, my first thought is that the memory is not being aligned/read in the right order. I am almost certain that the double use of dpitch as an argument to cudaMemcpy2DToArray() is the culprit. What values need to go there is still a mystery to me.