cudaGraph: Multi-threaded stream capturing causes errors only when run in cuda-memcheck

I have a program in which multiple host threads each try to capture a CUDA graph and execute it. The program produces correct results, but it cannot be run under cuda-memcheck.

When run with cuda-memcheck, the following error appears.

Program hit cudaErrorStreamCaptureInvalidated (error 901) due to "operation failed due to a previous error during capture" on CUDA API call to cudaLaunchKernel.

When only one host thread is used, cuda-memcheck reports no error.

Here is example code, which can be compiled with nvcc 10.2: nvcc -arch=sm_61 -O3 main.cu -o main

#include <iostream>
#include <memory>
#include <algorithm>
#include <cassert>
#include <vector>
#include <thread>
#include <iterator>
#include <cstdio> // for in-kernel printf


#ifndef CUERR

    #define CUERR {                                                            \
        cudaError_t err;                                                       \
        if ((err = cudaGetLastError()) != cudaSuccess) {                       \
            std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "    \
                      << __FILE__ << ", line " << __LINE__ << std::endl;       \
            exit(1);                                                           \
        }                                                                      \
    }

#endif


__global__
void kernel(int id, int num){
    printf("kernel %d, id %d\n", num, id);
}

struct Data{
    bool isValidGraph = false;
    int id = 0;
    int deviceId = 0;
    cudaGraphExec_t execGraph = nullptr;
    cudaStream_t stream = nullptr;
};

void buildGraphViaCapture(Data& data){
    cudaSetDevice(data.deviceId); CUERR;

    if(!data.isValidGraph){
        std::cerr << "rebuild graph\n";

        if(data.execGraph != nullptr){
            cudaGraphExecDestroy(data.execGraph); CUERR;
        }

        assert(data.stream != cudaStreamLegacy);

        cudaStreamCaptureStatus captureStatus;
        cudaStreamIsCapturing(data.stream, &captureStatus); CUERR;

        assert(captureStatus == cudaStreamCaptureStatusNone);
        
        cudaStreamBeginCapture(data.stream, cudaStreamCaptureModeRelaxed); CUERR;

        for(int i = 0; i < 64; i++){
            kernel<<<1,1,0,data.stream>>>(data.id, i);
        }

        cudaGraph_t graph;
        cudaStreamEndCapture(data.stream, &graph); CUERR;
        
        cudaGraphExec_t execGraph;
        cudaGraphNode_t errorNode;
        auto logBuffer = std::make_unique<char[]>(1025);
        std::fill_n(logBuffer.get(), 1025, 0);
        cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, logBuffer.get(), 1025);
        if(status != cudaSuccess){
            if(logBuffer[1024] != '\0'){
                std::cerr << "cudaGraphInstantiate: truncated error message: ";
                std::copy_n(logBuffer.get(), 1025, std::ostream_iterator<char>(std::cerr, ""));
                std::cerr << "\n";
            }else{
                std::cerr << "cudaGraphInstantiate: error message: ";
                std::cerr << logBuffer.get();
                std::cerr << "\n";
            }
            CUERR;
        }            

        cudaGraphDestroy(graph); CUERR;

        data.execGraph = execGraph;

        data.isValidGraph = true;
    }
}

void execute(Data& data){
    buildGraphViaCapture(data);

    assert(data.isValidGraph);

    cudaGraphLaunch(data.execGraph, data.stream); CUERR;
}


void initData(Data& data, int id, int deviceId){
    data.id = id;
    data.deviceId = deviceId;
    cudaStreamCreate(&data.stream); CUERR;
}

void destroyData(Data& data){
    if(data.execGraph != nullptr){
        cudaGraphExecDestroy(data.execGraph); CUERR;
    }
    cudaStreamDestroy(data.stream); CUERR; 
}

int main(){

    std::vector<int> deviceIds{0};

    std::vector<std::thread> threads;

    for(int deviceId : deviceIds){
        for(int k = 0; k < 4; k++){
            // capture k (and deviceId) by value: the loop variables change and
            // go out of scope before the threads necessarily start running
            threads.emplace_back([&, deviceId, k](){

                std::vector<Data> vec(2);

                initData(vec[0], deviceId * 10 + 4*k + 0, deviceId);
                initData(vec[1], deviceId * 10 + 4*k + 1, deviceId);

                int cur = 0;

                for(int iter = 0; iter < 10; iter++){
                    cudaStreamSynchronize(vec[cur].stream); CUERR;
                    execute(vec[cur]); CUERR;
                    cur = 1 - cur;
                }

                cudaStreamSynchronize(vec[0].stream); CUERR;
                cudaStreamSynchronize(vec[1].stream); CUERR;

                destroyData(vec[0]);
                destroyData(vec[1]);

            });
        }
    }

    for(auto& t : threads){
        t.join();
    }



    cudaDeviceReset();
    return 0;
}

Why does the error only appear when multiple threads are used, and why exactly is the capture invalidated?


Edit 20th March 2022:

The error still exists with cuda-memcheck version 11.5.114. However, cuda-memcheck is now deprecated in favor of compute-sanitizer, which no longer reports cudaErrorStreamCaptureInvalidated.

There are 2 answers below.

Answer 1 (score 0):

We ran into this problem as well: even though we are working on different CUDA graph objects, we still get errors. Our (ugly) solution is to wrap cudaStreamBeginCapture and cudaStreamEndCapture in a RAII struct with a static mutex.

It solves the problem for now, but I'm going to inquire further on the CUDA developer forums.
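
A minimal sketch of such a wrapper (the name ScopedGraphCapture and the static mutex are our own choices, not a CUDA API; error checking is omitted):

#include <mutex>

// RAII helper: every capture region in the process is serialized through one
// static mutex, so no two host threads are inside a stream capture at once.
struct ScopedGraphCapture{
    static std::mutex& captureMutex(){
        static std::mutex m;
        return m;
    }

    cudaStream_t stream;
    std::unique_lock<std::mutex> lock;

    explicit ScopedGraphCapture(cudaStream_t s)
        : stream(s), lock(captureMutex()){
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
    }

    // Ends the capture and releases the mutex; the lock is also released
    // automatically by ~unique_lock if end() is never reached.
    cudaGraph_t end(){
        cudaGraph_t graph = nullptr;
        cudaStreamEndCapture(stream, &graph);
        lock.unlock();
        return graph;
    }
};

In buildGraphViaCapture from the question, the cudaStreamBeginCapture/cudaStreamEndCapture pair would then be replaced by constructing a ScopedGraphCapture on the stream and calling end() after enqueueing the kernels.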

Answer 2 (score 2):

CUDA graphs are not thread-safe. The documentation says:

Graph objects (cudaGraph_t, CUgraph) are not internally synchronized and must not be accessed concurrently from multiple threads. API calls accessing the same graph object must be serialized externally.

You need to access the graph object inside a critical section, for example by guarding the capture and instantiation with a mutex, as sketched below.
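
A minimal sketch of what that could look like for the code in the question (it reuses the Data struct and kernel from above; the global std::mutex and the function name are illustrative additions, not part of CUDA, and error checking is omitted):

#include <mutex>

std::mutex graphMutex; // illustrative: one mutex guarding all graph-related API calls

void buildGraphViaCaptureSerialized(Data& data){
    cudaSetDevice(data.deviceId);

    // Critical section: only one thread at a time captures on its stream
    // and instantiates the resulting graph.
    std::lock_guard<std::mutex> lock(graphMutex);

    cudaStreamBeginCapture(data.stream, cudaStreamCaptureModeRelaxed);
    for(int i = 0; i < 64; i++){
        kernel<<<1,1,0,data.stream>>>(data.id, i);
    }
    cudaGraph_t graph;
    cudaStreamEndCapture(data.stream, &graph);

    // CUDA 10/11 signature; error node and log buffer omitted for brevity.
    cudaGraphInstantiate(&data.execGraph, graph, nullptr, nullptr, 0);
    cudaGraphDestroy(graph);
    data.isValidGraph = true;
}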