Experimentation with CUDA Graphs
Example: Vector Addition and Multiplication using CUDA Graphs
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, selected by its flattened global index, so the launch must supply
// at least N threads in total. Threads whose index is N or larger do nothing.
// No shared memory is used.
__global__ void VecAdd(float* A, float* B, float* C, int N) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the grid tail: the thread count is rounded up to whole blocks.
    if (idx >= N) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}
// Element-wise vector multiplication: C[i] = A[i] * B[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, selected by its flattened global index, so the launch must supply
// at least N threads in total. Threads whose index is N or larger do nothing.
// No shared memory is used.
//
// Aliasing: main() calls this kernel with the same buffer as both B and C
// (an in-place update). That is safe because each thread reads and writes
// only its own element; do not mark these pointers __restrict__.
__global__ void VecMul(float* A, float* B, float* C, int N) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] * B[i];
    }
}

#include <cuda_runtime.h>
#include <iostream>
// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the error string to stderr and makes the enclosing function return 1.
// Only usable inside a function that returns int (here: main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        const cudaError_t status_ = (call);                                \
        if (status_ != cudaSuccess) {                                      \
            std::cerr << "CUDA error at line " << __LINE__ << ": "         \
                      << cudaGetErrorString(status_) << std::endl;         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Captures VecAdd followed by VecMul into a CUDA graph via stream capture,
// instantiates and launches the graph once, then prints the first ten
// results. With A[i] = B[i] = i the final value is C[i] = A[i] * (A[i] + B[i]).
//
// Returns 0 on success, 1 if any CUDA call or kernel launch reports an error.
int main() {
    const int N = 1024;
    const size_t size = N * sizeof(float);

    // Allocate managed memory: the same pointers are written by the host
    // loop below and passed to the kernels.
    float* h_A = nullptr;
    float* h_B = nullptr;
    float* h_C = nullptr;
    CUDA_CHECK(cudaMallocManaged(&h_A, size));
    CUDA_CHECK(cudaMallocManaged(&h_B, size));
    CUDA_CHECK(cudaMallocManaged(&h_C, size));

    // Initialize data
    for (int i = 0; i < N; i++) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(i);
    }

    // Create a stream to run kernels
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Start graph capture: work submitted to `stream` from here until
    // cudaStreamEndCapture is recorded into `graph`.
    cudaGraph_t graph;
    CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));

    // Launch configuration: round the block count up so every element is
    // covered even when N is not a multiple of the block size.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Vector addition: C = A + B
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(h_A, h_B, h_C, N);
    // Kernel launches return nothing; launch errors surface here.
    CUDA_CHECK(cudaGetLastError());

    // Vector multiplication using the result of the addition, in place:
    // C = A * C
    VecMul<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(h_A, h_C, h_C, N);
    CUDA_CHECK(cudaGetLastError());

    // End graph capture
    CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

    // Instantiate and launch the graph
    cudaGraphExec_t graphExec;
    CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
    CUDA_CHECK(cudaGraphLaunch(graphExec, stream));

    // Wait for the graph to finish before the host reads h_C.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Display the result
    for (int i = 0; i < 10; i++) {
        std::cout << "C[" << i << "] = " << h_C[i] << std::endl;
    }

    // Cleanup
    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
    CUDA_CHECK(cudaGraphDestroy(graph));
    CUDA_CHECK(cudaFree(h_A));
    CUDA_CHECK(cudaFree(h_B));
    CUDA_CHECK(cudaFree(h_C));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}
Explanation
Compilation and Execution
Last updated
Was this helpful?

