Behavior of cudaGraphInstantiateFlagUseNodePriority

89 Views Asked by At

My understanding of cudaGraphInstantiateFlagUseNodePriority is to prioritize the kernel calls. i.e. we have three independent kernels in cudaGraph first, second & third, and let each kernel waits for 1s and print its name.

If we set kernel graph node priority using cudaGraphKernelNodeSetAttribute (attr-name - cudaLaunchAttributePriority) for each as 0, 1, 2. When the graph is executed, it should honor priority i.e. third should be called followed by second, and followed by first.

Another thing to note is that after setting priority or kernel graph node If I try to confirm using cudaGraphKernelNodeGet Attribute (attr-name - cudaLaunchAttributePriority), I always get priority as 0. It should return the same value set by the Set call as mentioned previously … Right? Please correct me if I am wrong.

I have tried the below sample to understand behavior.

#include <cuda_runtime.h>
#include <vector>
#include <cstdio>
#include <chrono>

#define CUDA_CHECK(error)                                                                            \
    {                                                                                              \
        cudaError_t localError = error;                                                             \
        if (localError != cudaSuccess) {      \
           printf("error: '%s'(%d) from %s at %s:%d\n",  cudaGetErrorString(localError),   \
                   localError, #error, __FUNCTION__, __LINE__);                              \
                exit(0);\
        }                                                                                          \
    }

__global__ void first(uint32_t interval, const uint32_t ticks_per_ms) {
  size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
  if (globaltid == 0) {
    printf("\nfirst..");
    while (interval--) {
      uint64_t start = clock64();
      while (clock64() - start < ticks_per_ms) {
      }
    }
    printf("first\n");
  }
}
__global__ void second(uint32_t interval, const uint32_t ticks_per_ms) {
  size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
  if (globaltid == 0) {
    printf("\nsecond..");
    while (interval--) {
      uint64_t start = clock64();
      while (clock64() - start < ticks_per_ms) {
      }
    }
    printf("second\n");
  }
}
__global__ void third(uint32_t interval, const uint32_t ticks_per_ms) {
  size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
  if (globaltid == 0) {
    printf("\nthird..");
    while (interval--) {
      uint64_t start = clock64();
      while (clock64() - start < ticks_per_ms) {
      }
    }
    printf("third\n");
  }
}

void cudaGraphsManual() {
  cudaStream_t streamForGraph;
  cudaGraph_t graph;
  cudaGraphNode_t kernelNode;
  CUDA_CHECK(cudaStreamCreate(&streamForGraph));
  cudaKernelNodeParams kernelNodeParams = {0};
  CUDA_CHECK(cudaGraphCreate(&graph, 0));

  int ticks_per_ms = 0;
  CUDA_CHECK(cudaDeviceGetAttribute(&ticks_per_ms, cudaDevAttrClockRate, 0));
  uint32_t interval = std::chrono::milliseconds(1000).count();

  void *kernelArgs[2] = {&interval,
                         &ticks_per_ms};

  kernelNodeParams.func = (void *)first;
  kernelNodeParams.gridDim = dim3(1, 1, 1);
  kernelNodeParams.blockDim = dim3(1, 1, 1);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = kernelArgs;
  kernelNodeParams.extra = NULL;

  CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));
  union cudaKernelNodeAttrValue p1; p1.priority = 0;
  CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p1));
  
  union cudaKernelNodeAttrValue p4;
  CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
  printf("\nPriority : %d\n", p4.priority);
  
  kernelNodeParams.func = (void *)second;
  CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph,  NULL, 0, &kernelNodeParams));
  union cudaKernelNodeAttrValue p2; p2.priority = 2;
  CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p2));

  CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
  printf("\nPriority : %d\n", p4.priority);

  kernelNodeParams.func = (void *)third;
  CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph,  NULL, 0, &kernelNodeParams));
  union cudaKernelNodeAttrValue p3; p3.priority = 1;
  CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p3));
  
  CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
  printf("\nPriority : %d\n", p4.priority);
  
  cudaGraphExec_t graphExec;
  CUDA_CHECK(cudaGraphInstantiateWithFlags(&graphExec, graph, cudaGraphInstantiateFlagUseNodePriority));
  CUDA_CHECK(cudaGraphLaunch(graphExec, streamForGraph));
  CUDA_CHECK(cudaStreamSynchronize(streamForGraph));
  CUDA_CHECK(cudaGraphExecDestroy(graphExec));
  CUDA_CHECK(cudaGraphDestroy(graph));
  CUDA_CHECK(cudaStreamDestroy(streamForGraph));
}

int main(int argc, char **argv) {
  cudaGraphsManual();
  return EXIT_SUCCESS;
}
1

There are 1 best solutions below

2
On

My understanding of cudaGraphInstantiateFlagUseNodePriority is to prioritize kernel calls.

It should probably be thought of as an analog of CUDA stream priorities.

for each as 0, 1, 2.

That is evidently not what you want to choose for priorities (see below). Use the stream priority mechanism to find an appropriate range of priorities, rather than choosing your own numbers/range arbitrarily.

When graph is executed then it should honor priority i.e. third should be called followed by second and followed by first.

That is not how stream priority works. Kernels may still begin execution in the order in which they were launched. However stream priority suggests that the CUDA block scheduler will preferentially choose blocks from higher priority streams over lower priority streams, when choosing blocks to deposit on available SMs. This is more-or-less meaningless in your case because:

  1. Each kernel launch consists of only 1 block.
  2. All of your kernels can run concurrently anyway. The block scheduler is free to deposit the block of each kernel as soon as it is available.

I always get priority as 0. It should return same value set by Set call as mentioned previously … Right?

Not if you request an invalid stream priority level. 0 is evidently valid. The others evidently are not.