跳到主要内容

流与事件

CUDA 流(Stream)和事件(Event)是实现异步执行和任务并发的关键机制。通过合理使用流和事件,可以显著提高 GPU 的利用率和程序性能。

CUDA 流基础

什么是流?

流是一系列按顺序执行的操作。不同流中的操作可以并发执行,从而实现任务重叠。

默认流(Stream 0):
┌─────────────────────────────────────────────────────────┐
│ Kernel1 → Copy1 → Kernel2 → Copy2 │
└─────────────────────────────────────────────────────────┘

多流并发:
Stream 1: │ Kernel1 │ → │ Copy1 │ → │ Kernel2 │
Stream 2: │ KernelA │ → │ CopyA │ → │ KernelB │
          (两个流中的操作序列可以在时间上相互重叠执行)

创建和使用流

// Create two independent streams; operations issued to different streams
// may execute concurrently on the device.
cudaStream_t stream1, stream2;

cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);

// Work within a single stream executes in issue order.
// NOTE(review): the kernel is launched before the H2D copy in the same
// stream — if the kernel reads d_data1, the copy should come first; confirm
// the intended ordering of this example.
kernel<<<grid, block, 0, stream1>>>(args);
cudaMemcpyAsync(d_data1, h_data1, size, cudaMemcpyHostToDevice, stream1);

// Independent work in stream2 can overlap with stream1's work.
kernel<<<grid, block, 0, stream2>>>(args);
cudaMemcpyAsync(d_data2, h_data2, size, cudaMemcpyHostToDevice, stream2);

// Block the host until each stream has drained all queued work.
cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);

cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);

默认流

默认流(Stream 0)具有隐式同步(legacy)语义:它会与其他阻塞流(即未使用 cudaStreamNonBlocking 标志创建的流)相互同步,彼此等待对方已排队的工作完成:

kernel<<<grid, block>>>(args);

使用 cudaStreamNonBlocking 标志创建非阻塞流:

cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

异步内存操作

cudaMemcpyAsync

异步内存拷贝:

cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count,
cudaMemcpyKind kind, cudaStream_t stream);

示例:

// End-to-end async pipeline in one stream: H2D copy → kernel → D2H copy.
float *h_data, *d_data;
size_t size = N * sizeof(float);

// NOTE(review): malloc() returns pageable host memory — cudaMemcpyAsync
// from pageable memory cannot fully overlap with computation; use
// cudaMallocHost (pinned) as recommended later in this chapter.
h_data = (float*)malloc(size);
cudaMalloc(&d_data, size);

cudaStream_t stream;
cudaStreamCreate(&stream);

// Queued in order on `stream`: copy in, compute, copy out.
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);

kernel<<<grid, block, 0, stream>>>(d_data, N);

cudaMemcpyAsync(h_data, d_data, size, cudaMemcpyDeviceToHost, stream);

// Wait for the whole pipeline before the host touches h_data again.
cudaStreamSynchronize(stream);

cudaStreamDestroy(stream);
// NOTE(review): h_data / d_data are not freed in this snippet — presumably
// omitted for brevity.

cudaMemsetAsync

异步内存初始化:

cudaMemsetAsync(d_data, 0, size, stream);

页锁定内存

使用页锁定(Pinned)内存可以提高异步传输的性能:

float *h_data;
cudaMallocHost(&h_data, size);

cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);

cudaFreeHost(h_data);

页锁定内存的特点:

  1. 不会被换出到磁盘
  2. DMA 直接传输,减少 CPU 参与
  3. 可以与计算重叠

流并发示例

多流处理多个数据集

// Split one large problem into nStreams chunks so each chunk's
// copy-in / compute / copy-out can overlap with the other chunks'.
const int nStreams = 4;
// NOTE(review): assumes N % nStreams == 0 — the tail elements are silently
// dropped otherwise; confirm for real input sizes.
const int dataSize = N / nStreams;

cudaStream_t streams[nStreams];
for (int i = 0; i < nStreams; i++) {
cudaStreamCreate(&streams[i]);
}

// Issue each chunk's full pipeline into its own stream.
for (int i = 0; i < nStreams; i++) {
int offset = i * dataSize;

// Stage 1: copy this chunk to the device.
cudaMemcpyAsync(d_data + offset, h_data + offset,
dataSize * sizeof(float),
cudaMemcpyHostToDevice, streams[i]);

// Stage 2: process the chunk (ordered after the copy within the stream).
kernel<<<gridSize, blockSize, 0, streams[i]>>>(
d_data + offset, dataSize);

// Stage 3: copy the result back.
cudaMemcpyAsync(h_result + offset, d_data + offset,
dataSize * sizeof(float),
cudaMemcpyDeviceToHost, streams[i]);
}

// Drain and release every stream.
for (int i = 0; i < nStreams; i++) {
cudaStreamSynchronize(streams[i]);
cudaStreamDestroy(streams[i]);
}

计算与传输重叠

// Overlap computation with data transfer by using two dedicated streams.
cudaStream_t computeStream, copyStream;
cudaStreamCreate(&computeStream);
cudaStreamCreate(&copyStream);

float *d_data1, *d_data2, *h_data;
cudaMalloc(&d_data1, size);
cudaMalloc(&d_data2, size);
// Pinned host memory so the async copy can proceed via DMA.
cudaMallocHost(&h_data, size);

// Compute on data already resident on the device (d_data1) ...
kernel<<<grid, block, 0, computeStream>>>(d_data1, N);

// ... while simultaneously transferring the next batch into d_data2.
cudaMemcpyAsync(d_data2, h_data, size, cudaMemcpyHostToDevice, copyStream);

cudaStreamSynchronize(computeStream);
cudaStreamSynchronize(copyStream);

cudaStreamDestroy(computeStream);
cudaStreamDestroy(copyStream);
// NOTE(review): device/host buffers are not freed here — presumably omitted
// for brevity.

CUDA 事件

创建和记录事件

// Measure GPU execution time by bracketing work with two events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Events are recorded into the (default) stream in issue order, so the
// elapsed time covers exactly the kernel between them.
cudaEventRecord(start);
kernel<<<grid, block>>>(args);
cudaEventRecord(stop);

// Block the host until `stop` has actually happened on the device.
cudaEventSynchronize(stop);

// Elapsed GPU time between the two events, in milliseconds.
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Time: %f ms\n", milliseconds);

cudaEventDestroy(start);
cudaEventDestroy(stop);

性能测量封装

// Simple RAII timer built on a pair of CUDA events.
// Usage: timer.startTimer(); ...launch work...; float ms = timer.stopTimer();
// Events are recorded on the default stream; stopTimer() blocks the host
// until the stop event has completed.
class CudaTimer {
private:
    cudaEvent_t start, stop;   // events bracketing the timed region

public:
    // Create both events once so startTimer/stopTimer stay cheap.
    CudaTimer() {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    // Release the events when the timer goes out of scope.
    ~CudaTimer() {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Enqueue the start event (default stream).
    void startTimer() {
        cudaEventRecord(start);
    }

    // Enqueue the stop event, wait for it, and return the elapsed time
    // in milliseconds.
    float stopTimer() {
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        // Fix: initialize the result so that if cudaEventElapsedTime fails
        // (e.g. stopTimer() called before startTimer()), we return 0
        // instead of an uninitialized value.
        float milliseconds = 0.0f;
        cudaEventElapsedTime(&milliseconds, start, stop);
        return milliseconds;
    }
};

// Demo: time a single kernel launch with the RAII timer above.
int main() {
CudaTimer timer;

timer.startTimer();
kernel<<<grid, block>>>(args);
// stopTimer() synchronizes on the stop event, so it blocks until the
// kernel has finished before computing the elapsed time.
float time = timer.stopTimer();

printf("Kernel time: %f ms\n", time);
}

事件同步

事件可以用于流之间的同步:

// Use an event to make stream2 wait for work in stream1 — a device-side
// dependency that does not block the host.
cudaStream_t stream1, stream2;
cudaEvent_t event;

cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaEventCreate(&event);

// kernel1 runs in stream1; the event is recorded after it in that stream.
kernel1<<<grid, block, 0, stream1>>>(args);
cudaEventRecord(event, stream1);

// All work issued to stream2 after this call waits until `event` fires.
cudaStreamWaitEvent(stream2, event);

// kernel2 therefore starts only after kernel1 has completed.
kernel2<<<grid, block, 0, stream2>>>(args);

cudaStreamSynchronize(stream2);

cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaEventDestroy(event);

事件属性

cudaEvent_t event;
cudaEventCreate(&event);

bool isComplete = cudaEventQuery(event) == cudaSuccess;

cudaEventSynchronize(event);

float time;
cudaEventElapsedTime(&time, start, stop);

流优先级

CUDA 支持设置流的优先级:

int leastPriority, greatestPriority;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

cudaStream_t highPriorityStream, lowPriorityStream;

cudaStreamCreateWithPriority(&highPriorityStream,
cudaStreamNonBlocking,
greatestPriority);

cudaStreamCreateWithPriority(&lowPriorityStream,
cudaStreamNonBlocking,
leastPriority);

流回调

CUDA 支持在流中插入回调函数:

// Host-side callback run by the CUDA runtime once all preceding work in the
// stream has completed.
// Fix: cudaLaunchHostFunc (used below) takes a cudaHostFn_t, i.e.
// void (CUDART_CB *)(void *userData). The old three-argument
// (stream, status, userData) signature belongs to the deprecated
// cudaStreamAddCallback API and does not compile with cudaLaunchHostFunc.
// (Consequently there is no `status` parameter to print here.)
void CUDART_CB myCallback(void *userData) {
    printf("Callback executed\n");
    printf("User data: %s\n", (char*)userData);
}

// Demo: attach a host callback to a stream so it runs after the kernel.
int main() {
cudaStream_t stream;
cudaStreamCreate(&stream);

const char* message = "Hello from callback!";

kernel<<<grid, block, 0, stream>>>(args);

// Enqueue myCallback to run on the host after the kernel above finishes.
// NOTE(review): cudaLaunchHostFunc requires a cudaHostFn_t, i.e.
// void (CUDART_CB *)(void*) — confirm myCallback's signature matches;
// the three-argument form is only valid for cudaStreamAddCallback.
cudaLaunchHostFunc(stream, myCallback, (void*)message);

cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
}

流图

CUDA 10.0 引入了 CUDA 图(CUDA Graphs,下文也称流图),可以将一组操作及其依赖关系定义为图,从而更高效地管理复杂的工作流。

创建流图

// Build a graph explicitly: one kernel node, one memcpy node, and a
// dependency edge so the copy completes before the kernel runs.
cudaGraph_t graph;
cudaStream_t stream;

cudaStreamCreate(&stream);

cudaGraphCreate(&graph, 0);

cudaGraphNode_t kernelNode, memcpyNode;

// Describe the kernel launch (function, grid/block dims, arguments).
cudaKernelNodeParams kernelParams = {0};
kernelParams.func = (void*)kernel;
kernelParams.gridDim = grid;
kernelParams.blockDim = block;
// NOTE(review): kernelParams.kernelParams must be an array of pointers,
// one per kernel argument; (void**)&args is only correct if the kernel
// takes a single argument — confirm.
kernelParams.kernelParams = (void**)&args;

cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelParams);

// Describe the H2D copy as a 3D copy of extent (size, 1, 1).
cudaMemcpy3DParms memcpyParams = {0};
memcpyParams.srcPtr = make_cudaPitchedPtr(h_data, size, N, 1);
memcpyParams.dstPtr = make_cudaPitchedPtr(d_data, size, N, 1);
memcpyParams.extent = make_cudaExtent(size, 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;

cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams);

// Edge: memcpyNode → kernelNode (copy must finish before the kernel).
cudaGraphAddDependencies(graph, &memcpyNode, &kernelNode, 1);

实例化和执行

cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

cudaGraphLaunch(graphExec, stream);

cudaStreamSynchronize(stream);

cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(graph);

从流捕获创建图

// Build a graph by capturing work issued to a stream instead of
// constructing nodes by hand.
cudaGraph_t graph;
cudaStream_t stream;

cudaStreamCreate(&stream);

// Between Begin/EndCapture, operations issued to `stream` are recorded
// into the graph rather than executed immediately.
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

kernel<<<grid, block, 0, stream>>>(args);
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);

cudaStreamEndCapture(stream, &graph);

// Instantiate once, then relaunch the whole captured workload cheaply.
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

for (int i = 0; i < 100; i++) {
cudaGraphLaunch(graphExec, stream);
}

cudaStreamSynchronize(stream);

// Fix: release the executable graph, the graph template, and the stream —
// the original snippet leaked all three (the explicit-build example above
// destroys its resources; keep the examples consistent).
cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(graph);
cudaStreamDestroy(stream);

最佳实践

1. 合理分配流数量

int deviceCount;
cudaGetDeviceCount(&deviceCount);

int maxStreams = 32;
int optimalStreams = min(deviceCount * 8, maxStreams);

2. 使用页锁定内存

float *h_data;
cudaMallocHost(&h_data, size);

cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);

cudaFreeHost(h_data);

3. 避免过度同步

for (int i = 0; i < nStreams; i++) {
kernel<<<grid, block, 0, streams[i]>>>(args);
}

cudaDeviceSynchronize();

4. 使用事件进行细粒度同步

cudaEvent_t events[nStreams];
for (int i = 0; i < nStreams; i++) {
cudaEventCreate(&events[i]);
}

for (int i = 0; i < nStreams; i++) {
kernel<<<grid, block, 0, streams[i]>>>(args);
cudaEventRecord(events[i], streams[i]);
}

for (int i = 0; i < nStreams; i++) {
cudaEventSynchronize(events[i]);
}

小结

本章介绍了 CUDA 流与事件:

  1. 流基础:创建、使用、销毁流
  2. 异步操作:cudaMemcpyAsync、cudaMemsetAsync
  3. 流并发:多流并发执行、计算与传输重叠
  4. 事件:性能测量、流间同步
  5. 流优先级:设置不同优先级的流
  6. 流图:高效管理工作流

合理使用流和事件可以显著提高 GPU 利用率。下一章将介绍 性能优化,学习更多 CUDA 性能优化技巧。