I am experimenting with Torch and CUDA. Using torch::from_blob()
I was able to do the following so far:
#include <cuda_runtime.h>
#include <torch/torch.h>
#include <iostream>
#include <exception>
#include <memory>
#include <math.h>
using std::cout;
using std::endl;
using std::exception;
/*
* Demonstration of interoperability between CUDA and Torch C++ API using
* pinned memory.
*
* Using the ENABLE_ERROR variable a change in the result (CUDA) can be
* introduced through its respective Torch tensor. This will also affect
* the copied data from GPU to CPU, resulting in an error during assert
* checks at the end
*/
// Contains the call to the CUDA kernel
void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size);
bool ENABLE_ERROR = false;
int main(int argc, const char* argv[])
{
// Setup array, here 2^16 = 65536 items
const int N = 1 << 16;
size_t bytes = N * sizeof(int);
// Declare pinned memory pointers
int* a_cpu, * b_cpu, * c_cpu;
// Allocate pinned memory for the pointers
// The memory will be accessible from both CPU and GPU
// without the requirements to copy data from one device
// to the other
cout << "Allocating memory for vectors on CPU" << endl;
cudaMallocHost(&a_cpu, bytes);
cudaMallocHost(&b_cpu, bytes);
cudaMallocHost(&c_cpu, bytes);
// Init vectors
cout << "Populating vectors with random integers" << endl;
for (int i = 0; i < N; ++i)
{
a_cpu[i] = rand() % 100;
b_cpu[i] = rand() % 100;
}
// Declare GPU memory pointers
int* a_gpu, * b_gpu, * c_gpu;
// Allocate memory on the device
cout << "Allocating memory for vectors on GPU" << endl;
cudaMalloc(&a_gpu, bytes);
cudaMalloc(&b_gpu, bytes);
cudaMalloc(&c_gpu, bytes);
// Copy data from the host to the device (CPU -> GPU)
cout << "Transfering vectors from CPU to GPU" << endl;
cudaMemcpy(a_gpu, a_cpu, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(b_gpu, b_cpu, bytes, cudaMemcpyHostToDevice);
// Specify threads per CUDA block (CTA), her 2^10 = 1024 threads
int NUM_THREADS = 1 << 10;
// CTAs per grid
int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
// Call CUDA kernel
cout << "Running CUDA kernels" << endl;
vector_add(a_gpu, b_gpu, c_gpu, N, NUM_BLOCKS, NUM_THREADS);
try
{
// Convert pinned memory on GPU to Torch tensor on GPU
auto options = torch::TensorOptions().dtype(torch::kInt).device(torch::kCUDA, 0).pinned_memory(true);
cout << "Converting vectors and result to Torch tensors on GPU" << endl;
torch::Tensor a_gpu_tensor = torch::from_blob(a_gpu, { N }, options);
torch::Tensor b_gpu_tensor = torch::from_blob(b_gpu, { N }, options);
torch::Tensor c_gpu_tensor = torch::from_blob(c_gpu, { N }, options);
cout << "Verifying result using Torch tensors" << endl;
if (ENABLE_ERROR)
{
/*
TEST
Change the value of the result should result in two things:
- the GPU memory will be modified
- the CPU test later on (after the GPU memory is copied to the CPU side) should fail
*/
cout << "ERROR GENERATION ENABLED! Application will crash during verification of results" << endl;
cout << "Changing result first element from " << c_gpu_tensor[0];
c_gpu_tensor[0] = 99999999;
cout << " to " << c_gpu_tensor[0] << endl;
}
else
{
assert(c_gpu_tensor.equal(a_gpu_tensor.add(b_gpu_tensor)) == true);
}
}
catch (exception& e)
{
cout << e.what() << endl;
cudaFreeHost(a_cpu);
cudaFreeHost(b_cpu);
cudaFreeHost(c_cpu);
cudaFree(a_gpu);
cudaFree(b_gpu);
cudaFree(c_gpu);
return 1;
}
// Copy memory to device and also synchronize (implicitly)
cout << "Synchronizing CPU and GPU. Copying result from GPU to CPU" << endl;
cudaMemcpy(c_cpu, c_gpu, bytes, cudaMemcpyDeviceToHost);
// Verify the result on the CPU
cout << "Verifying result on CPU" << endl;
for (int i = 0; i < N; ++i)
{
assert(c_cpu[i] == a_cpu[i] + b_cpu[i]);
}
cudaFreeHost(a_cpu);
cudaFreeHost(b_cpu);
cudaFreeHost(c_cpu);
cudaFree(a_gpu);
cudaFree(b_gpu);
cudaFree(c_gpu);
return 0;
}
with a kernel
__global__ void vector_add_kernel(int* a, int* b, int* c, int N)
{
// Calculate global thread ID
int t_id = (blockDim.x * blockIdx.x) + threadIdx.x;
// Check boundry
if (t_id < N)
{
c[t_id] = a[t_id] + b[t_id];
}
}
void vector_add(int* a, int* b, int* c, int N, int cuda_grid_size, int cuda_block_size)
{
vector_add_kernel << <cuda_grid_size, cuda_block_size >> > (a, b, c, N);
cudaGetLastError();
}
The code above uses pinned memory (for fast transfer between CPU and GPU) and does an addition operation between two vectors using the respective kernel. In addition I convert the GPU memory blocks, that are used for those vectors, to libtorch
tensors, all while remaining on the GPU, and do the same operation but using the tensors. I even added a small "error" that allowed me to verify that the data I initially allocate (without the tensors) is actually being changed when manipulating the tensors.
I also have managed to use cv::Mat
's data
, which is a void
pointer that points at the pixel data of an OpenCV image, with torch::from_blob()
successfully, e.g.
auto tensor_input = torch::from_blob(img_torch.data, { 1, img_torch.size().height, img_torch.size().width, 1 }, torch::kFloat32);
tensor_input = tensor_input.permute({ 0, 3, 1, 2 });
for an BGRA (PNG) image that I had to convert to CV_32FC3
(in order to use with my ML model and play around a bit with the tensor's shape (the permute()
) above.
I am unable to do this with a cudaArray
and would like to know if that is even possible.
The reason why I am using a cudaArray
is that, just like in the description of this type, I am storing a texture (in my case a D3D11 2D texture) that I need to process. I am actually able to do that using pure CUDA kernel that I've written myself, while also using cudaSurfaceObject_t
, which I doubt I can pass onto libtorch
in any shape or form.
I am looking for something in the lines of (pseudo-code):
// Register cudaGraphicsResource* cu_arr_interop using cudaGraphicsMapResources(...)
...
// Map the texture's texels to a CUDA array
cudaArray* cu_arr;
cudaGraphicsSubResourceGetMappedArray(&cu_arr, cu_arr_interop, 0, 0);
// Convert the CUDA array to a Torch tensor
auto options = torch::TensorOptions().dtype(...).device(torch::kCUDA, 0).pinned_memory(true);
auto tensor_in = torch::from_blob((void*)cu_arr, { ... }, options);
// Run ML model
auto tensor_out = module.forward({ tensor_in }).toTensor();
// See result on screen
...
// cudaGraphicsUnmapResources(...)
Following the comments, I managed map the data from and to a CUDA array. The intermediate libtorch tensor is fully functional.
Code for CUDA to libtorch Tensor
Further conversions depend on the model that will be used for the inference for the given tensor. The above permutation allows the extraction of each channel as a separate tensor. In my case I had to do some extra conversions to make the tensors compatible with my model, e.g.
While the copying back is done with
with
dpitch
being equal towidth * sizeof(unsigned char) * 4
this will not work.The output tensor (the result from the inference) needs to be post-processed - (un)squeezing dimensions if necessary, permuting, converting to the original data format (e.g.
torch::kUInt8
in my case) and so on.Two steps are very important, namely:
Merging - if you are splitting the image and processing each channel separately, you will have to merge the result (here R, G, B and A). In my case I did it with
cat()
, which concatenates tensors along an existing dimensions.Flattening - it took me like 2 days to figure this quite obvious necessity. The tensor has different memory layout. In order to copy it back to the CUDA array, it needs to be flattened. The default
flatten()
squishes the given tensor to a one dimensional array. By doing so you can even reuse the pitches that were used incudaMemcpy2DFromArray()
if the format (incl. dimensions) of your input CUDA array are the same as the output.Due to the poor documentation of libtorch in regards to exceptions and error handling overall I recommend to dump intermediate results from C++ to serialized tensor files. These can then be loaded using
You can use
torchvision.transforms
with thePILToImage()
to visualize/save as image the tensor. Checking the shape and experimenting with various conversions for the tensor offers a fast way to get a solution, which you can then transfer in C++. For a comparison running inference in C++ offers zero feedback when an error occurs. In PyTorch you will often times get a nice description on what went wrong including full trace.[![enter image description here][3]][3]
Whenever I see channels being displayed as separate images or some other weird thing, I am always thinking that the way the memory is aligned/being read is not in the right order. I am almost certain that the double use of
dpitch
as arguments incudaMemcpy2DToArray()
is the culprit. What values I need to put here is a mystery.