Call graphs for CUDA


I am trying to generate call graphs for some CUDA code I have using egypt, but the usual approach doesn't seem to work, since nvcc doesn't have any flag that does the same thing as GCC's -fdump-rtl-expand.

More details:

I have a really large codebase (of which I am not the author) that spans multiple .cu files, and it would be easier for me to understand what it's doing if I had a call graph.

I bet that an answer to this question would be of use to other people as well.

Any ideas on how this can be done with CUDA (.cu) files?

2 Answers

Accepted answer:

You can do this with the CUDA support of clang 3.8.

First, compile your CUDA code to emit LLVM IR (example on Windows with CUDA 7.5 installed):

clang++ -S -emit-llvm main.cu --cuda-gpu-arch=sm_35 -o main.ll -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include"
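
On Linux the equivalent should look something like this (a sketch, assuming CUDA is installed under /usr/local/cuda, which clang normally detects automatically; adjust the paths for your setup):

clang++ -S -emit-llvm main.cu --cuda-gpu-arch=sm_35 -o main.ll -I/usr/local/cuda/include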

Then, use the generated .ll file to build the call graph with opt:

opt.exe main.ll -analyze -dot-callgraph

Note that opt is not part of the default binary distribution, so you may need to build it yourself (I had a 3.7.1 build, and it was able to handle the .ll produced by 3.8).

Example main.cu file:

#include <cuda_runtime.h>

// Simple device-side call chain: kernel calls f, g, and h; g also calls f.
__device__ int f() { return 1; }
__device__ float g(float* a) { return a[f()]; }
__device__ float h() { return 42.0f; }

__global__ void kernel(int a, float* b)
{
        int c = a + f();
        g(b);
        b[c] = h();
}

Generated dot file:

digraph "Call graph" {
        label="Call graph";

        Node0x1e3d438 [shape=record,label="{external node}"];
        Node0x1e3d438 -> Node0x1e3cfb0;
        Node0x1e3d438 -> Node0x1e3ce48;
        Node0x1e3d438 -> Node0x1e3d0a0;
        Node0x1e3d438 -> Node0x1e3d258;
        Node0x1e3d438 -> Node0x1e3cfd8;
        Node0x1e3d438 -> Node0x1e3ce98;
        Node0x1e3d438 -> Node0x1e3d000;
        Node0x1e3d438 -> Node0x1e3cee8;
        Node0x1e3d438 -> Node0x1e3d078;
        Node0x1e3d000 [shape=record,label="{__cuda_module_ctor}"];
        Node0x1e3d000 -> Node0x1e3ce98;
        Node0x1e3d000 -> Node0x1e3d168;
        Node0x1e3d078 [shape=record,label="{__cuda_module_dtor}"];
        Node0x1e3d078 -> Node0x1e3cee8;
        Node0x1e3cfb0 [shape=record,label="{^A?f@@YAHXZ}"];
        Node0x1e3d0a0 [shape=record,label="{^A?h@@YAMXZ}"];
        Node0x1e3ce48 [shape=record,label="{^A?g@@YAMPEAM@Z}"];
        Node0x1e3ce48 -> Node0x1e3cfb0;
        Node0x1e3d258 [shape=record,label="{^A?kernel@@YAXHPEAM@Z}"];
        Node0x1e3d258 -> Node0x1e3cfb0;
        Node0x1e3d258 -> Node0x1e3ce48;
        Node0x1e3d258 -> Node0x1e3d0a0;
        Node0x1e3d168 [shape=record,label="{__cuda_register_kernels}"];
        Node0x1e3cee8 [shape=record,label="{__cudaUnregisterFatBinary}"];
        Node0x1e3cee8 -> Node0x1e3d528;
        Node0x1e3cfd8 [shape=record,label="{__cudaRegisterFunction}"];
        Node0x1e3cfd8 -> Node0x1e3d528;
        Node0x1e3ce98 [shape=record,label="{__cudaRegisterFatBinary}"];
        Node0x1e3ce98 -> Node0x1e3d528;
}
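
opt writes the graph to a .dot file in the working directory rather than to stdout (the exact file name varies across opt versions; look for a freshly created .dot file). Assuming Graphviz is installed, you can then render it to an image:

dot -Tpng callgraph.dot -o callgraph.png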
Second answer:

As of the CUDA 11.3 release, the device code linker nvlink can also generate assembler-level call graphs. Using the same example as the accepted answer, but with the device functions and the calling kernel split into two translation units and linked:

$ cat callgraph_f.cu
__device__ int f() { return 1; }
__device__ float g(float* a) { return a[f()] ; }
__device__ float h() { return 42.0f ; }

$ cat callgraph.cu
#include <cuda_runtime.h>
extern __device__ int f();
extern __device__ float g(float* a);
extern __device__ float h();

__global__ void kernel(int a, float* b)
{
        int c = a + f();
        g(b);
        b[c] = h();
}

$ nvcc -dc callgraph_f.cu 
$ nvcc -dc callgraph.cu
$ nvcc -Xnvlink -dump-callgraph callgraph.o callgraph_f.o
callgraph for sm_52:
# A: s -> B // s (number A) potentially calls B
# s [N]     // s uses N registers
# ^s        // s is entry point
# &s        // s has address taken
1: ^kernel(int, float *) [6] -> 5 4 3
2: 
3: f() [5] ->
4: g(float *) [8] ->
5: h() [5] ->
regcount 8 for g(float *) propagated to entry kernel(int, float *)

This is subtly different from the Clang-based call graph shown in the other answer: the Clang method uses the high-level compiler-generated IR prior to PTX assembly, whereas the nvlink call graph is produced after all optimization passes have run, and it can also show register usage for each function in the graph. Which is more useful depends on your use case.
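
If you want the Clang-based graph restricted to the device code, which makes it more directly comparable with the nvlink output, clang can emit device-only IR. A minimal sketch, assuming a clang build with CUDA support and the main.cu from the accepted answer:

clang++ --cuda-device-only -S -emit-llvm main.cu --cuda-gpu-arch=sm_35 -o main.device.ll
opt main.device.ll -analyze -dot-callgraph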