流与事件
CUDA 流(Stream)和事件(Event)是实现异步执行和任务并发的关键机制。通过合理使用流和事件,可以显著提高 GPU 的利用率和程序性能。
CUDA 流基础
什么是流?
流是一系列按顺序执行的操作。不同流中的操作可以并发执行,从而实现任务重叠。
默认流(Stream 0):
┌─────────────────────────────────────────────────────────┐
│ Kernel1 → Copy1 → Kernel2 → Copy2 │
└─────────────────────────────────────────────────────────┘
多流并发:
Stream 1: ┌─────────┐┌─────────┐┌─────────┐
          │ Kernel1 ││  Copy1  ││ Kernel2 │
          └─────────┘└─────────┘└─────────┘
Stream 2: ┌─────────┐┌─────────┐┌─────────┐
          │ KernelA ││  CopyA  ││ KernelB │
          └─────────┘└─────────┘└─────────┘
创建和使用流
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
kernel<<<grid, block, 0, stream1>>>(args);
cudaMemcpyAsync(d_data1, h_data1, size, cudaMemcpyHostToDevice, stream1);
kernel<<<grid, block, 0, stream2>>>(args);
cudaMemcpyAsync(d_data2, h_data2, size, cudaMemcpyHostToDevice, stream2);
cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
默认流
默认流(Stream 0)采用传统(legacy)同步语义,与其他阻塞流相互隐式同步:提交到默认流的操作会等待此前提交到其他阻塞流的操作完成,而之后提交到其他阻塞流的操作也会等待默认流:
kernel<<<grid, block>>>(args);
使用 cudaStreamNonBlocking 标志创建非阻塞流:
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
异步内存操作
cudaMemcpyAsync
异步内存拷贝:
cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count,
cudaMemcpyKind kind, cudaStream_t stream);
示例:
// Asynchronous H2D copy -> kernel -> D2H copy, all ordered on one stream.
// NOTE: cudaMemcpyAsync is only truly asynchronous when the host buffer is
// page-locked (pinned); with a plain malloc'd buffer the copy silently
// degrades to a synchronous transfer.
float *h_data, *d_data;
size_t size = N * sizeof(float);
cudaMallocHost(&h_data, size);   // pinned host buffer (was pageable malloc)
cudaMalloc(&d_data, size);
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);
kernel<<<grid, block, 0, stream>>>(d_data, N);
cudaMemcpyAsync(h_data, d_data, size, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);   // wait for all work queued on the stream
cudaStreamDestroy(stream);
cudaFreeHost(h_data);            // release buffers (original example leaked them)
cudaFree(d_data);
cudaMemsetAsync
异步内存初始化:
cudaMemsetAsync(d_data, 0, size, stream);
页锁定内存
使用页锁定(Pinned)内存可以提高异步传输的性能:
float *h_data;
cudaMallocHost(&h_data, size);
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);
cudaFreeHost(h_data);
页锁定内存的特点:
- 不会被换出到磁盘
- DMA 直接传输,减少 CPU 参与
- 可以与计算重叠
流并发示例
多流处理多个数据集
// Split the input across nStreams streams so that the H2D copy, kernel,
// and D2H copy of different chunks can overlap with each other.
const int nStreams = 4;
// Ceil-divide so no elements are lost when N % nStreams != 0
// (the original N / nStreams silently dropped the tail elements).
const int chunkSize = (N + nStreams - 1) / nStreams;
cudaStream_t streams[nStreams];
for (int i = 0; i < nStreams; i++) {
    cudaStreamCreate(&streams[i]);
}
for (int i = 0; i < nStreams; i++) {
    int offset = i * chunkSize;
    int count = min(chunkSize, N - offset);   // clamp the final chunk
    if (count <= 0) break;                    // N < nStreams leaves idle streams
    cudaMemcpyAsync(d_data + offset, h_data + offset,
                    count * sizeof(float),
                    cudaMemcpyHostToDevice, streams[i]);
    kernel<<<gridSize, blockSize, 0, streams[i]>>>(
        d_data + offset, count);
    cudaMemcpyAsync(h_result + offset, d_data + offset,
                    count * sizeof(float),
                    cudaMemcpyDeviceToHost, streams[i]);
}
for (int i = 0; i < nStreams; i++) {
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
}
计算与传输重叠
// Overlap computation with an independent host-to-device transfer by
// placing them on two different streams.
cudaStream_t computeStream, copyStream;
cudaStreamCreate(&computeStream);
cudaStreamCreate(&copyStream);   // was garbled as "©Stream" ("&copy" mis-encoded as the © entity)
float *d_data1, *d_data2, *h_data;
cudaMalloc(&d_data1, size);
cudaMalloc(&d_data2, size);
cudaMallocHost(&h_data, size);   // pinned, so the async copy can truly overlap
// The kernel (computeStream) and the copy (copyStream) may run concurrently.
kernel<<<grid, block, 0, computeStream>>>(d_data1, N);
cudaMemcpyAsync(d_data2, h_data, size, cudaMemcpyHostToDevice, copyStream);
cudaStreamSynchronize(computeStream);
cudaStreamSynchronize(copyStream);
cudaStreamDestroy(computeStream);
cudaStreamDestroy(copyStream);
CUDA 事件
创建和记录事件
// Measure kernel execution time with a pair of CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record "start" into the (default) stream before the kernel launch.
cudaEventRecord(start);
kernel<<<grid, block>>>(args);
// Record "stop" after the launch; both records return immediately.
cudaEventRecord(stop);
// Block the host until the "stop" event has actually been reached on the GPU.
cudaEventSynchronize(stop);
float milliseconds = 0;
// Elapsed GPU time between the two events, in milliseconds.
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Time: %f ms\n", milliseconds);
cudaEventDestroy(start);
cudaEventDestroy(stop);
性能测量封装
// RAII wrapper around a pair of CUDA events for measuring elapsed GPU time.
// Usage: startTimer(); <enqueue work>; float ms = stopTimer();
// Events are recorded into the default stream, so this times work submitted
// to the default stream between the two calls.
class CudaTimer {
private:
    cudaEvent_t start, stop;
public:
    CudaTimer() {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~CudaTimer() {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    // Non-copyable: a copy would hold the same event handles and the two
    // destructors would call cudaEventDestroy twice on each event.
    CudaTimer(const CudaTimer&) = delete;
    CudaTimer& operator=(const CudaTimer&) = delete;
    // Record the start event (asynchronous; returns immediately).
    void startTimer() {
        cudaEventRecord(start);
    }
    // Record the stop event, wait for it, and return the elapsed time in ms.
    float stopTimer() {
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float milliseconds = 0.0f;  // initialized in case ElapsedTime fails
        cudaEventElapsedTime(&milliseconds, start, stop);
        return milliseconds;
    }
};
// Example: time a single kernel launch with the CudaTimer helper.
int main() {
    CudaTimer t;
    t.startTimer();
    kernel<<<grid, block>>>(args);
    float elapsedMs = t.stopTimer();
    printf("Kernel time: %f ms\n", elapsedMs);
}
事件同步
事件可以用于流之间的同步:
cudaStream_t stream1, stream2;
cudaEvent_t event;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaEventCreate(&event);
kernel1<<<grid, block, 0, stream1>>>(args);
cudaEventRecord(event, stream1);
cudaStreamWaitEvent(stream2, event);
kernel2<<<grid, block, 0, stream2>>>(args);
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaEventDestroy(event);
事件属性
cudaEvent_t event;
cudaEventCreate(&event);
bool isComplete = cudaEventQuery(event) == cudaSuccess;
cudaEventSynchronize(event);
float time;
cudaEventElapsedTime(&time, start, stop);
流优先级
CUDA 支持设置流的优先级:
int leastPriority, greatestPriority;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
cudaStream_t highPriorityStream, lowPriorityStream;
cudaStreamCreateWithPriority(&highPriorityStream,
cudaStreamNonBlocking,
greatestPriority);
cudaStreamCreateWithPriority(&lowPriorityStream,
cudaStreamNonBlocking,
leastPriority);
流回调
CUDA 支持在流中插入回调函数:
// Stream callback: invoked on a host thread once all previously queued work
// in the stream has completed. Must not call CUDA API functions from inside.
// Signature matches cudaStreamCallback_t: (stream, status, userData).
void CUDART_CB myCallback(cudaStream_t stream, cudaError_t status, void *userData) {
    printf("Callback executed with status: %d\n", status);
    printf("User data: %s\n", (char*)userData);
}

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    const char* message = "Hello from callback!";
    kernel<<<grid, block, 0, stream>>>(args);
    // BUG FIX: cudaLaunchHostFunc expects a cudaHostFn_t (void(*)(void*)),
    // which does not match the three-argument callback above.
    // cudaStreamAddCallback takes exactly this (stream, status, userData)
    // callback type; the final flags argument must be 0.
    cudaStreamAddCallback(stream, myCallback, (void*)message, 0);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
CUDA 图(CUDA Graph)
CUDA 10.0 引入了 CUDA 图(CUDA Graph),可以将一系列操作及其依赖关系描述为一张图,从而更高效地管理复杂的工作流。
创建图
cudaGraph_t graph;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaGraphCreate(&graph, 0);
cudaGraphNode_t kernelNode, memcpyNode;
cudaKernelNodeParams kernelParams = {0};
kernelParams.func = (void*)kernel;
kernelParams.gridDim = grid;
kernelParams.blockDim = block;
kernelParams.kernelParams = (void**)&args;
cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelParams);
cudaMemcpy3DParms memcpyParams = {0};
memcpyParams.srcPtr = make_cudaPitchedPtr(h_data, size, N, 1);
memcpyParams.dstPtr = make_cudaPitchedPtr(d_data, size, N, 1);
memcpyParams.extent = make_cudaExtent(size, 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;
cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams);
cudaGraphAddDependencies(graph, &memcpyNode, &kernelNode, 1);
实例化和执行
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
cudaGraphLaunch(graphExec, stream);
cudaStreamSynchronize(stream);
cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(graph);
从流捕获创建图
cudaGraph_t graph;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
kernel<<<grid, block, 0, stream>>>(args);
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);
cudaStreamEndCapture(stream, &graph);
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
for (int i = 0; i < 100; i++) {
cudaGraphLaunch(graphExec, stream);
}
cudaStreamSynchronize(stream);
最佳实践
1. 合理分配流数量
int deviceCount;
cudaGetDeviceCount(&deviceCount);
int maxStreams = 32;
int optimalStreams = min(deviceCount * 8, maxStreams);
2. 使用页锁定内存
float *h_data;
cudaMallocHost(&h_data, size);
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream);
cudaFreeHost(h_data);
3. 避免过度同步
for (int i = 0; i < nStreams; i++) {
kernel<<<grid, block, 0, streams[i]>>>(args);
}
cudaDeviceSynchronize();
4. 使用事件进行细粒度同步
cudaEvent_t events[nStreams];
for (int i = 0; i < nStreams; i++) {
cudaEventCreate(&events[i]);
}
for (int i = 0; i < nStreams; i++) {
kernel<<<grid, block, 0, streams[i]>>>(args);
cudaEventRecord(events[i], streams[i]);
}
for (int i = 0; i < nStreams; i++) {
cudaEventSynchronize(events[i]);
}
小结
本章介绍了 CUDA 流与事件:
- 流基础:创建、使用、销毁流
- 异步操作:cudaMemcpyAsync、cudaMemsetAsync
- 流并发:多流并发执行、计算与传输重叠
- 事件:性能测量、流间同步
- 流优先级:设置不同优先级的流
- 流图:高效管理工作流
合理使用流和事件可以显著提高 GPU 利用率。下一章将介绍 性能优化,学习更多 CUDA 性能优化技巧。