Experimentation with CUDA Graphs

This example sets up a basic CUDA graph that performs a vector addition followed by a vector multiplication, showing how a graph containing dependent kernel launches can be built by capturing the work submitted to a stream.

Example: Vector Addition and Multiplication using CUDA Graphs

First, let's define two simple CUDA kernels, one for vector addition and one for vector multiplication:

__global__ void VecAdd(float* A, float* B, float* C, int N) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

__global__ void VecMul(float* A, float* B, float* C, int N) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] * B[i];
}

Now, let's construct and execute the graph:

#include <cuda_runtime.h>
#include <iostream>

int main() {
    int N = 1024;
    size_t size = N * sizeof(float);

    // Allocate unified (managed) memory, accessible from both host and device
    float *h_A, *h_B, *h_C;
    cudaMallocManaged(&h_A, size);
    cudaMallocManaged(&h_B, size);
    cudaMallocManaged(&h_C, size);

    // Initialize data
    for (int i = 0; i < N; i++) {
        h_A[i] = i;
        h_B[i] = i;
    }

    // Create a stream to run kernels
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Start graph capture
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

    // Launch the vector addition kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(h_A, h_B, h_C, N);

    // Launch the vector multiplication kernel using the result of addition
    VecMul<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(h_A, h_C, h_C, N);

    // End graph capture
    cudaStreamEndCapture(stream, &graph);

    // Instantiate and launch the graph
    cudaGraphExec_t graphExec;
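    // Note: this five-argument overload is the pre-CUDA-12 signature; on CUDA 12
    // and later the call takes a flags argument instead: cudaGraphInstantiate(&graphExec, graph, 0).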
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);

    // Display the result
    for (int i = 0; i < 10; i++) {
        std::cout << "C[" << i << "] = " << h_C[i] << std::endl;
    }

    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaFree(h_A);
    cudaFree(h_B);
    cudaFree(h_C);
    cudaStreamDestroy(stream);

    return 0;
}

Explanation

  1. Memory Allocation and Initialization: We allocate unified memory for vectors A, B, and C, and initialize vectors A and B.

  2. Graph Capture: We start capturing the CUDA stream to automatically build a graph. The VecAdd kernel is recorded, followed by the VecMul kernel.

  3. Instantiate and Execute: After capturing, we instantiate the graph and then execute it. This separates setup from execution, so the instantiated graph can be relaunched repeatedly without paying the setup cost again (see the reuse sketch after this list).

  4. Results: The program prints the first ten elements of vector C to verify the computations.
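To make that reuse concrete, here is a minimal sketch (not part of the original example) in which the instantiated graph is relaunched in a loop. Only cudaGraphLaunch is paid per iteration, while the graph construction and instantiation cost is paid once; the iteration count here is an arbitrary illustrative choice.

// Hypothetical reuse of graphExec from the example above.
for (int iter = 0; iter < 100; iter++) {
    // Inputs could be refreshed here between launches if desired.
    cudaGraphLaunch(graphExec, stream);   // replays VecAdd followed by VecMul
}
cudaStreamSynchronize(stream);            // wait for the last launch to finish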

Compilation and Execution

Compile this program using nvcc:

nvcc -o graph_example graph_example.cu
./graph_example
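Since h_A[i] = h_B[i] = i, the addition writes 2i into C and the multiplication then computes i * 2i, so the printed values should follow C[i] = 2*i*i (0, 2, 8, 18, and so on for the first ten indices).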

This example should give you a practical look at how CUDA graphs can be used to optimise workflows that involve multiple dependent kernel executions.

It illustrates the efficiency of setting up the graph once and executing it as many times as needed, which is particularly useful for iterative algorithms and repeated computations in simulations and machine learning inference.
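For comparison, the same two-kernel dependency chain can also be built with the explicit graph API rather than stream capture. The sketch below is illustrative only: it reuses the VecAdd and VecMul kernels, pointers and launch configuration from the example above, and shows just how the two nodes and the edge between them would be declared by hand.

// Build the graph explicitly: one node per kernel, with VecMul depending on VecAdd.
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);

void* addArgs[] = { &h_A, &h_B, &h_C, &N };
cudaKernelNodeParams addParams = {};
addParams.func           = (void*)VecAdd;
addParams.gridDim        = dim3(blocksPerGrid);
addParams.blockDim       = dim3(threadsPerBlock);
addParams.sharedMemBytes = 0;
addParams.kernelParams   = addArgs;
addParams.extra          = nullptr;

cudaGraphNode_t addNode;
cudaGraphAddKernelNode(&addNode, graph, nullptr, 0, &addParams);

// VecMul reads the result in h_C, so it must run after VecAdd: pass addNode as its dependency.
void* mulArgs[] = { &h_A, &h_C, &h_C, &N };
cudaKernelNodeParams mulParams = addParams;
mulParams.func         = (void*)VecMul;
mulParams.kernelParams = mulArgs;

cudaGraphNode_t mulNode;
cudaGraphAddKernelNode(&mulNode, graph, &addNode, 1, &mulParams);

// Instantiation and launch then proceed exactly as in the capture-based version.

Stream capture tends to be the less intrusive option for existing stream-based code, whereas the explicit API makes the dependency structure visible in the source and easier to modify programmatically.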
