Error with a captured CUDA graph and asynchronous memory allocations in a loop

2.1k Views

I am trying to implement a CUDA graph experiment. There are three kernels: kernel_0, kernel_1, and kernel_2. They are executed sequentially and have dependencies. Right now I am going to capture only kernel_1. This is my code:


#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

#define N 50000
#define NSTEP 1000
#define NKERNEL 20

using namespace std::chrono;

// Returns the symbolic name (e.g. "cudaErrorInvalidValue") of a runtime-API
// status code. Note: accepts cudaError_t only — driver-API CUresult values
// would need cuGetErrorName instead (see the answer below).
static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}

// Reports a nonzero runtime-API status code with file/line context and
// terminates the process. Used via the checkCudaErrors macro below.
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}

// Overload for driver-API status codes: CUresult and cudaError_t do not agree
// on all values, so the driver's own cuGetErrorName must be used to translate
// them. Exact-match overload resolution prefers this over the template for
// calls such as checkCudaErrors(cuMemFreeAsync(...)).
void check(CUresult result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    const char *error_name = "(UNKNOWN)";  // fallback if the lookup fails
    cuGetErrorName(result, &error_name);
    fprintf(stderr, "CUDA driver error at %s:%d code=%d(%s) \"%s\" \n", file,
            line, static_cast<unsigned int>(result), error_name, func);
    exit(EXIT_FAILURE);
  }
}

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

// Initializes in_d to 1.0 and writes out_d = 1 + in_d, one element per
// thread. Expects a 1-D launch covering at least N threads; surplus threads
// in the final block exit early.
__global__ void shortKernel_0(float * out_d, float * in_d){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) return;  // grid-tail guard
    in_d[idx] = 1.0f;
    out_d[idx] = 1.0f + in_d[idx];
}

// Element-wise out_d = 2 * in_d over N floats; one thread per element,
// threads past N do nothing.
__global__ void shortKernel_1(float * out_d, float * in_d){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < N) {
        out_d[idx] = 2.0f * in_d[idx];
    }
}

// Element-wise out_d = 3 * in_d over N floats; one thread per element,
// threads past N do nothing.
__global__ void shortKernel_2(float * out_d, float * in_d){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) return;  // grid-tail guard
    out_d[idx] = 3.0f * in_d[idx];
}

// Experiment: three dependent kernels per iteration with stream-ordered
// (pool-based, asynchronous) allocations; only kernel_1 is captured into a
// CUDA graph on the first iteration and replayed afterwards.
// NOTE(review): the driver-API calls (cuMemAllocFromPoolAsync /
// cuMemFreeAsync) are not error-checked, which hides the failures that
// compute-sanitizer reports.
void test(){

    size_t size_bytes = N * sizeof(float);
    // Device buffers, allocated and freed every iteration from the pool.
    void * in_d_0;
    void * out_d_0;
    void * out_d_1;
    void * out_d_2;

    int threads = 128;
    // NOTE(review): (N+threads)/threads launches one extra block whenever N is
    // a multiple of threads; harmless given the idx<N guards in the kernels,
    // but the conventional ceil-div is (N+threads-1)/threads.
    int blocks = (N+threads)/threads;
    int iter = 10;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Default device memory pool with an unlimited release threshold, so
    // freed blocks stay cached for the next iteration's allocations.
    CUmemoryPool pool_;
    cuDeviceGetDefaultMemPool(&pool_, 0);
    uint64_t threshold = UINT64_MAX;
    cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    cudaGraph_t graph;
    cudaGraphExec_t instance;
    bool graphCreated=false;  // capture/instantiate once, replay afterwards

    for (int i =0; i < iter; i++){
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
        shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
        if (!graphCreated){
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            // BUG: during stream capture no work executes — this call only
            // records an alloc node, so out_d_1 is NOT assigned on this
            // iteration, yet it is freed below in the same iteration.
            cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
            // BUG: a cuMemFreeAsync recorded during capture becomes a free
            // node and must refer to a graph allocation; in_d_0 was allocated
            // outside the graph, so this is invalid.
            cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
            shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
            cudaStreamEndCapture(stream, &graph);
            checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            checkCudaErrors(cudaGraphUpload(instance, stream));
            graphCreated = true;
        }else{
            checkCudaErrors(cudaGraphLaunch(instance, stream));
        }
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
        shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));

        // Fails with CUDA_ERROR_INVALID_VALUE on the first iteration: out_d_1
        // was never actually allocated there (see the capture note above).
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
    }

    cudaDeviceSynchronize();
    printf("With async malloc done!");  // NOTE(review): missing trailing '\n'
    cudaStreamDestroy(stream);
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(instance);
}

// Entry point: run the graph-capture experiment once.
int main() {
    test();
    return 0;
}

The output from kernel_0 is consumed by kernel_1, and the output from kernel_1 is consumed by kernel_2. However, when I ran with compute-sanitizer, I got some errors. Any idea on this error? Part of the error output is attached:

========= Program hit CUDA_ERROR_INVALID_VALUE (error 1) due to "invalid argument" on CUDA API call to cuMemFreeAsync.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x2ef045]
=========                in /usr/local/cuda/compat/lib.real/libcuda.so.1
=========     Host Frame:test() [0xb221]
=========                in /opt/test-cudagraph/./a.out
=========     Host Frame:main [0xb4b3]
=========                in /opt/test-cudagraph/./a.out
=========     Host Frame:__libc_start_main [0x24083]
=========                in /usr/lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaf6e]
=========                in /opt/test-cudagraph/./a.out
1

There is 1 solution below

0
On BEST ANSWER

1. Figuring out where the error occurs, exactly

To get the "idea", you need to wrap all of your API calls with error checks. Doing so properly is a bit tricky, since the cudaError_t runtime-API status type and the CUresult driver-API status type don't agree on all values, so you would need to overload the error-check function:

// Runtime-API status check: prints file/line diagnostics and terminates the
// process when a cudaError_t is anything other than cudaSuccess.
void check(cudaError_t result, char const *const func,
   const char *const file, int const line)
{
  if (result == cudaSuccess) {
    return;
  }
  fprintf(stderr, "CUDA runtime error at %s:%d code=%d(%s) \"%s\" \n",
          file, line, static_cast<unsigned int>(result),
          _cudaGetErrorEnum(result), func);
  exit(EXIT_FAILURE);
}

// Driver-API status check: translates the CUresult to its symbolic name via
// cuGetErrorName, prints file/line diagnostics, and terminates the process.
void check(CUresult result, char const *const func,
   const char *const file, int const line)
{
  if (result == CUDA_SUCCESS) {
    return;
  }
  const char* error_name = "(UNKNOWN)";  // fallback if the lookup fails
  cuGetErrorName(result, &error_name);
  fprintf(stderr, "CUDA driver error at %s:%d code=%d(%s) \"%s\" \n",
          file, line, static_cast<unsigned int>(result), error_name, func);
  exit(EXIT_FAILURE);
}

when you then wrap all your calls with an error check, running the program gets you:

CUDA driver error at a.cu:102 code=1(CUDA_ERROR_INVALID_VALUE) "cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream)" 

and the line triggering the error is:

checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream));

i.e. the CUDA driver believes out_d_1 is not a valid device pointer for (asynchronous) freeing.

This was the easy part which isn't even that specific to your program.

2. The errors

There are two problems in your code:

  1. On the first pass of your for loop, you capture the graph using stream capture. When capturing a graph this way, no actual work is done during the graph capture process. This means that on the first iteration of the for loop, this line cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream); does nothing. No allocation is performed. out_d_1 is not modified. However during that same for loop iteration, you attempt to free that pointer here: cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);, but on that particular for loop iteration it was never allocated. So the free fails. This explains the cuMemFreeAsync problem related to the usage here: cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);

  2. There is also a problem with the usage of cuMemFreeAsync during the capture process, specifically this line: cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream); We can see that the allocation for that item (in_d_0) that you are attempting to free during graph capture (i.e. during the graph execution) is allocated outside the graph. But this is a no-no. See the documentation for cuMemFreeAsync:

During stream capture, this function results in the creation of a free node and must therefore be passed the address of a graph allocation

3. What can you do about it?

Combining those two items, one possible way to fix your posted code is as follows:

$ cat t2068.cu
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

#define N 50000
#define NSTEP 1000
#define NKERNEL 20

using namespace std::chrono;

// Returns the symbolic name of a runtime-API status code. Accepts
// cudaError_t only; driver-API CUresult values need cuGetErrorName (see
// section 1 of this answer).
static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}

// Reports a nonzero runtime-API status code with file/line context and
// terminates the process. Used via the checkCudaErrors macro below.
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}

// Overload for driver-API status codes, as recommended in section 1 above:
// CUresult and cudaError_t do not agree on all values, so the driver's own
// cuGetErrorName must be used to translate them.
void check(CUresult result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    const char *error_name = "(UNKNOWN)";  // fallback if the lookup fails
    cuGetErrorName(result, &error_name);
    fprintf(stderr, "CUDA driver error at %s:%d code=%d(%s) \"%s\" \n", file,
            line, static_cast<unsigned int>(result), error_name, func);
    exit(EXIT_FAILURE);
  }
}

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

// Fills in_d with 1.0 and computes out_d = 1 + in_d, one thread per element.
// Threads beyond N (last block's tail) return immediately.
__global__ void shortKernel_0(float * out_d, float * in_d){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) return;  // grid-tail guard
    in_d[idx] = 1.0f;
    out_d[idx] = 1.0f + in_d[idx];
}

// Computes out_d = 2 * in_d element-wise over N floats; out-of-range
// threads do nothing.
__global__ void shortKernel_1(float * out_d, float * in_d){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < N) {
        out_d[idx] = 2.0f * in_d[idx];
    }
}

// Computes out_d = 3 * in_d element-wise over N floats; out-of-range
// threads do nothing.
__global__ void shortKernel_2(float * out_d, float * in_d){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) return;  // grid-tail guard
    out_d[idx] = 3.0f * in_d[idx];
}

// Fixed experiment: per-iteration stream-ordered allocations around a graph
// containing kernel_1. Two changes versus the broken version:
//  (1) in_d_0 (allocated outside the graph) is freed outside the capture
//      region — a cuMemFreeAsync recorded during capture must refer to a
//      graph allocation;
//  (2) the instantiated graph is launched on every iteration, including the
//      capture iteration, because capture itself executes no work, so
//      out_d_1 only materializes when the graph actually runs.
void test(){

    size_t size_bytes = N * sizeof(float);
    void * in_d_0;
    void * out_d_0;
    void * out_d_1;  // allocated by the graph; freed by non-graph code below
    void * out_d_2;

    int threads = 128;
    int blocks = (N+threads)/threads;
    int iter = 10;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Default device pool with an unlimited release threshold so freed
    // blocks stay cached across iterations.
    CUmemoryPool pool_;
    cuDeviceGetDefaultMemPool(&pool_, 0);
    uint64_t threshold = UINT64_MAX;
    cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    cudaGraph_t graph;
    cudaGraphExec_t instance;
    bool graphCreated=false;  // capture/instantiate once on iteration 0

    for (int i =0; i < iter; i++){
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
        shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
        // moved the next line outside of the graph region
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
        if (!graphCreated){
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            // Recorded as a graph alloc node; the allocation happens at
            // graph launch, not here during capture.
            cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
            //cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
            shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
            cudaStreamEndCapture(stream, &graph);
            checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            checkCudaErrors(cudaGraphUpload(instance, stream));
            graphCreated = true;
        }
        // modified so that we run the instantiated graph on every iteration
        checkCudaErrors(cudaGraphLaunch(instance, stream));
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
        shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));

        // out_d_1 is a graph allocation: it must be freed before the next
        // graph launch (see the observations below), otherwise the launch
        // fails unless CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH is set.
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
    }

    cudaDeviceSynchronize();
    printf("With async malloc done!\n");
    cudaStreamDestroy(stream);
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(instance);
}

// Entry point: run the fixed graph-capture experiment once.
int main() {
    test();
    return 0;
}
$ nvcc -o t2068 t2068.cu -lcuda
$ compute-sanitizer ./t2068
========= COMPUTE-SANITIZER
With async malloc done!
========= ERROR SUMMARY: 0 errors
$

A reasonable question might be "If freeing a non-graph allocation is not allowed in a graph, why didn't graph capture fail?" I suspect the answer to that is that the graph capture mechanism is not able to determine at the point of graph capture whether your CUdeviceptr will contain an entity that was allocated during graph execution, or not.

You might also want to consider avoiding the de-allocation and re-allocation of other buffers. After all, the buffer sizes are constant over all iterations.

Some observations about this stream ordered memory allocation in graphs:

  • an item allocated outside the graph cannot be freed in the graph
  • an item allocated in the graph can be freed in the graph
  • an item allocated in the graph need not be freed immediately at the end of graph execution, it can be freed later (in non-graph code, as is demonstrated here)
  • an item allocated in a graph should be freed before the graph attempts to allocate it again, but also specifically, before the graph is launched again. Hopefully the reasons for this are obvious; it would be a typical memory leak. However you may get a graph runtime error if you forget this. You can use a control at graph instantiation to auto-free such allocations at the graph launch point:

If any allocations created by [the graph being launched] remain unfreed ... and hGraphExec was not instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, the launch will fail with CUDA_ERROR_INVALID_VALUE.