I am trying to implement a cuda graph experiment. There are three kernels, kernel_0, kernel_1, and kernel_2. They will be executed sequentially and have dependencies. Right now I am going to only capture kernel_1. These are my code:
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define N 50000
#define NSTEP 1000
#define NKERNEL 20
using namespace std::chrono;
static const char *_cudaGetErrorEnum(cudaError_t error) {
return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
__global__ void shortKernel_0(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
in_d[idx] = 1.0;
out_d[idx]=1 + in_d[idx];
}
}
__global__ void shortKernel_1(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) out_d[idx]=2*in_d[idx];
}
__global__ void shortKernel_2(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
out_d[idx]=3*in_d[idx];
}
}
void test(){
size_t size_bytes = N * sizeof(float);
void * in_d_0;
void * out_d_0;
void * out_d_1;
void * out_d_2;
int threads = 128;
int blocks = (N+threads)/threads;
int iter = 10;
cudaStream_t stream;
cudaStreamCreate(&stream);
CUmemoryPool pool_;
cuDeviceGetDefaultMemPool(&pool_, 0);
uint64_t threshold = UINT64_MAX;
cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
cudaGraph_t graph;
cudaGraphExec_t instance;
bool graphCreated=false;
for (int i =0; i < iter; i++){
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
if (!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
cudaStreamEndCapture(stream, &graph);
checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphUpload(instance, stream));
graphCreated = true;
}else{
checkCudaErrors(cudaGraphLaunch(instance, stream));
}
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
}
cudaDeviceSynchronize();
printf("With async malloc done!");
cudaStreamDestroy(stream);
cudaGraphDestroy(graph);
cudaGraphExecDestroy(instance);
}
int main() {
test();
return 0;
}
The output from kernel_0 is consumed by kernel_1. and The output from kernel_1 is consumed by kernel_2.
However, when I ran with compute-sanitizer
, I got some errors. Any idea on this error? Part of error is attached:
========= Program hit CUDA_ERROR_INVALID_VALUE (error 1) due to "invalid argument" on CUDA API call to cuMemFreeAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2ef045]
========= in /usr/local/cuda/compat/lib.real/libcuda.so.1
========= Host Frame:test() [0xb221]
========= in /opt/test-cudagraph/./a.out
========= Host Frame:main [0xb4b3]
========= in /opt/test-cudagraph/./a.out
========= Host Frame:__libc_start_main [0x24083]
========= in /usr/lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaf6e]
========= in /opt/test-cudagraph/./a.out
1. Figuring out where the error occurs, exactly
To get the "idea", you need to wrap all of your API calls with error checks. Doing so properly is a bit tricky, since the
cudaError_t
runtime-API status type and theCUresult
driver-API status type don't agree on all values, so you would need to overload the error-check function:when you then wrap all your calls with an error check, running the program gets you:
and the line triggering the error is:
i.e. the CUDA driver believes
out_d_1
is not a valid device pointer for (asynchronous) freeing.This was the easy part which isn't even that specific to your program.
2. The errors
There are two problems in your code:
On the first pass of your for loop, you capture the graph using stream capture. When capturing a graph this way, no actual work is done during the graph capture process. This means that on the first iteration of the for loop, this line
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
does nothing. No allocation is performed.out_d_1
is not modified. However during that same for loop iteration, you attempt to free that pointer here:cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
, but on that particular for loop iteration it was never allocated. So the free fails. This explains thecuMemFreeAsync
problem related to the usage here:cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
There is also a problem with the usage of
cuMemFreeAsync
during the capture process, specifically this line:cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
We can see that the allocation for that item (in_d_0
) that you are attempting to free during graph capture (i.e. during the graph execution) is allocated outside the graph. But this is a no-no. See the documentation forcuMemFreeAsync
:3. What can you do about it?
Combining those two items, one possible way to fix your posted code is as follows:
A reasonable question might be "If freeing a non-graph allocation is not allowed in a graph, why didn't graph capture fail?" I suspect the answer to that is that the graph capture mechanism is not able to determine at the point of graph capture whether your
CUdeviceptr
will contain an entity that was allocated during graph execution, or not.You might also want to consider avoiding the de-allocation and re-allocation of other buffers. After all, the buffer sizes are constant over all iterations.
Some observations about this stream ordered memory allocation in graphs: