高级特性
本章介绍 CUDA 的高级特性,包括动态并行、统一内存、多 GPU 编程等。
动态并行
动态并行允许核函数在 GPU 上启动其他核函数,无需返回 CPU。
基本用法
// Child kernel launched from the device: doubles each element of data.
__global__ void childKernel(int *data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;   // guard the grid tail
    data[i] = data[i] * 2;
}
// Parent kernel: a single thread launches the child grid on the device.
__global__ void parentKernel(int *data, int n) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        childKernel<<<1, n>>>(data, n);
        // NOTE: device-side cudaDeviceSynchronize() is deprecated since
        // CUDA 11.6 and removed in CUDA 12. It is also unnecessary here:
        // the parent grid is not considered complete until all of its
        // child grids have completed, so the host-side synchronize in
        // main() already observes the child's results.
    }
}
// Host driver for the dynamic-parallelism example.
// Assumes N is defined at file scope (as in the surrounding chapter).
int main() {
    int *d_data;
    cudaMalloc(&d_data, N * sizeof(int));
    parentKernel<<<1, 1>>>(d_data, N);
    // Launch-configuration errors are only reported via cudaGetLastError.
    cudaError_t launchErr = cudaGetLastError();
    // Execution errors (including child-grid failures) surface here.
    cudaError_t execErr = cudaDeviceSynchronize();
    cudaFree(d_data);
    return (launchErr == cudaSuccess && execErr == cudaSuccess) ? 0 : 1;
}
编译选项
动态并行要求计算能力 3.5 及以上,并且编译时需启用可重定位设备代码(-rdc=true)以链接设备运行时库 cudadevrt:
nvcc -rdc=true dynamic_parallel.cu -o dynamic_parallel
嵌套归约示例
// Recursive tree reduction via dynamic parallelism: each level halves the
// active range, then one thread launches the next level as a child grid.
// NOTE(review): with depth == 0 the first pass reads data[tid + blockDim.x],
// so data must hold at least 2*blockDim.x accessible elements (tid + offset
// is never checked against n) — confirm against the caller.
__global__ void nestedReduce(int *data, int n, int depth) {
    int tid = threadIdx.x;
    // Active half-width at this level: blockDim.x / 2^depth.
    int offset = blockDim.x >> depth;
    if (offset > 0) {
        if (tid < offset) {
            data[tid] += data[tid + offset];
        }
        __syncthreads();  // all partial sums of this level must land first
        // Recursion capped at depth 5; hardware also limits nesting depth.
        if (tid == 0 && depth < 5) {
            // Child relaunches the full block; deeper levels deactivate the
            // surplus threads through the offset check above.
            nestedReduce<<<1, blockDim.x>>>(data, n, depth + 1);
            // NOTE(review): no device-side sync after the child launch — the
            // child's results are only guaranteed visible once the root
            // parent grid completes (host-side synchronize).
        }
    }
}
注意事项
- 子核函数启动有额外开销
- 需要足够的资源支持嵌套执行
- 最大嵌套深度有限制
统一内存
统一内存提供了一个单一的内存地址空间,CPU 和 GPU 都可以访问。
基本用法
// Unified-memory round trip: host initializes, device computes, host reads —
// all through a single cudaMallocManaged pointer.
int main() {
    const int count = 1024;
    float *buf;
    cudaMallocManaged(&buf, count * sizeof(float));
    // The host writes managed memory directly; no cudaMemcpy needed.
    for (int i = 0; i < count; ++i) {
        buf[i] = 1.0f;
    }
    kernel<<<grid, block>>>(buf, count);
    // The host must not touch managed memory while the GPU may be using it.
    cudaDeviceSynchronize();
    for (int i = 0; i < count; ++i) {
        printf("%f ", buf[i]);
    }
    cudaFree(buf);
}
数据预取
使用 cudaMemPrefetchAsync 预取数据到指定设备:
// Unified memory with explicit prefetching: migrate pages ahead of use to
// avoid on-demand page faults on first touch.
int main() {
    const size_t bytes = N * sizeof(float);
    float *data;
    cudaMallocManaged(&data, bytes);
    // Move the pages to GPU 0 before the kernel runs.
    cudaMemPrefetchAsync(data, bytes, 0);
    kernel<<<grid, block>>>(data, N);
    // Prefetch back to host memory; stream-ordered after the kernel.
    cudaMemPrefetchAsync(data, bytes, cudaCpuDeviceId);
    cudaDeviceSynchronize();  // wait for kernel + prefetch before reading
    for (int i = 0; i < N; ++i) {
        printf("%f ", data[i]);
    }
    cudaFree(data);
}
内存建议
使用 cudaMemAdvise 提供内存访问建议:
cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, 0);
cudaMemAdvise(data, size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
统一内存优势
- 简化编程模型
- 自动数据迁移
- 支持超额订阅(数据量超过 GPU 内存)
- 更容易实现 CPU/GPU 协作
多 GPU 编程
设备管理
// Enumerate every visible CUDA device and print its name.
int deviceCount;
cudaGetDeviceCount(&deviceCount);
for (int dev = 0; dev < deviceCount; ++dev) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("Device %d: %s\n", dev, prop.name);
}
设备选择
// cudaSetDevice changes the calling thread's current device; subsequent
// allocations and launches target that device.
cudaSetDevice(0);
kernel<<<grid, block>>>(data0);  // runs on GPU 0 (data0 must live there)
cudaSetDevice(1);
kernel<<<grid, block>>>(data1);  // runs on GPU 1
// NOTE(review): both launches are asynchronous and may run concurrently;
// synchronize each device before reading results.
点对点通信
检查点对点访问:
// Query whether device 0 can directly access device 1's memory
// (requires a compatible PCIe/NVLink topology).
int canAccess;
cudaDeviceCanAccessPeer(&canAccess, 0, 1);
if (canAccess) {
    cudaSetDevice(0);
    // Enables one direction only: device 0 -> device 1. Enable from
    // device 1 as well if the reverse direction is needed. Flags must be 0.
    cudaDeviceEnablePeerAccess(1, 0);
    // Copy size bytes from data0 (device 0) to data1 (device 1); takes the
    // direct P2P path when peer access is enabled.
    cudaMemcpyPeer(data1, 1, data0, 0, size);
}
多 GPU 示例
// Multi-GPU pipeline: per-device allocation, stream, H2D copy, kernel, and
// D2H copy; then per-device synchronize and cleanup.
// NOTE(review): h_data, h_result, size, grid, block and N are assumed to be
// defined elsewhere in the chapter — confirm against the full example.
int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    // NOTE(review): variable-length arrays are a compiler extension, not
    // standard C++; nvcc/gcc accept them, but std::vector is more portable.
    float *d_data[deviceCount];
    cudaStream_t streams[deviceCount];
    // Per-device setup: switch device before allocating or creating streams.
    for (int i = 0; i < deviceCount; i++) {
        cudaSetDevice(i);
        cudaMalloc(&d_data[i], size);
        cudaStreamCreate(&streams[i]);
    }
    // Issue all work asynchronously so the devices run concurrently.
    for (int i = 0; i < deviceCount; i++) {
        cudaSetDevice(i);
        // NOTE(review): cudaMemcpyAsync only overlaps when h_data is pinned
        // (cudaMallocHost); with pageable memory the copy is synchronous.
        cudaMemcpyAsync(d_data[i], h_data, size, cudaMemcpyHostToDevice, streams[i]);
        kernel<<<grid, block, 0, streams[i]>>>(d_data[i], N);
        cudaMemcpyAsync(h_result[i], d_data[i], size, cudaMemcpyDeviceToHost, streams[i]);
    }
    // Wait for every device, then release its resources.
    for (int i = 0; i < deviceCount; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_data[i]);
    }
}
协作组扩展
Grid 同步
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Grid-wide rotation demo: every thread writes its rank, then the array is
// rotated left by one. Must be launched with cudaLaunchCooperativeKernel.
__global__ void gridSyncKernel(float *data, int n) {
    cg::grid_group grid = cg::this_grid();
    int idx = grid.thread_rank();
    if (idx < n) {
        data[idx] = idx;
    }
    cg::sync(grid);  // make phase-1 writes visible grid-wide
    // BUG FIX: reading data[(idx+1)%n] and writing data[idx] in the same
    // phase races (thread idx-1 reads data[idx] while thread idx overwrites
    // it). Stage the read in a register and sync again before writing.
    float next = 0.0f;
    if (idx < n) {
        next = data[(idx + 1) % n];
    }
    cg::sync(grid);  // all reads complete before any write
    if (idx < n) {
        data[idx] = next;
    }
}
// Launches gridSyncKernel cooperatively so cg::sync(grid) is valid.
// Assumes d_data and N are defined at file scope (as in the chapter).
int main() {
    int blockSize = 256;
    int gridSize = (N + blockSize - 1) / blockSize;
    // Kernel arguments are passed as an array of pointers to each parameter.
    void *args[] = {&d_data, &N};
    // The untyped (void*) overload takes six arguments — shared-memory size
    // and stream have no defaults here, so they must be passed explicitly.
    // NOTE: all blocks must be co-resident; cap gridSize using
    // cudaOccupancyMaxActiveBlocksPerMultiprocessor on real hardware.
    cudaLaunchCooperativeKernel((void*)gridSyncKernel, gridSize, blockSize,
                                args, 0, 0);
    cudaDeviceSynchronize();
}
多 Grid 同步
// Kernel synchronized across grids running on multiple devices.
// NOTE: multi-grid cooperative groups and the matching launch API
// (cudaLaunchCooperativeKernelMultiDevice) are deprecated since CUDA 11.3.
__global__ void multiGridKernel(float *data, int n) {
    cg::multi_grid_group mg = cg::this_multi_grid();
    int idx = mg.thread_rank();  // rank across every participating grid
    if (idx < n) {
        data[idx] = idx;
    }
    cg::sync(mg);  // barrier spanning all grids on all devices
}
// Launches multiGridKernel on two devices with one synchronized multi-grid.
// NOTE: cudaLaunchCooperativeKernelMultiDevice is deprecated since CUDA 11.3.
// NOTE(review): gridSize, blockSize, args and streams[] are assumed to be set
// up elsewhere; streams[i] must belong to device i — confirm against the
// full example.
int main() {
    cudaLaunchParams params[2];
    // One cudaLaunchParams entry per participating device.
    for (int i = 0; i < 2; i++) {
        params[i].func = (void*)multiGridKernel;
        params[i].gridDim = gridSize;
        params[i].blockDim = blockSize;
        params[i].args = args;
        params[i].sharedMem = 0;
        params[i].stream = streams[i];  // per-device stream (not the default stream)
    }
    cudaLaunchCooperativeKernelMultiDevice(params, 2);
}
张量核心
张量核心是 NVIDIA GPU 上专门用于矩阵乘法的硬件单元。
WMMA API
#include <mma.h>
using namespace nvcuda;
// Minimal WMMA demo: one warp computes a single 16x16 tile C = A * B.
// Requires tensor cores (SM70+); every wmma call is warp-collective — all
// 32 lanes of the warp must execute it together.
// NOTE(review): M, N and K are unused here — a full GEMM would tile over
// them; this snippet assumes A and B are single 16x16 matrices with leading
// dimension 16.
__global__ void tensorCoreKernel(half *A, half *B, float *C, int M, int N, int K) {
    // Fragments are opaque register-resident tiles distributed across the warp.
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
    wmma::load_matrix_sync(a_frag, A, 16);  // ldm = 16
    wmma::load_matrix_sync(b_frag, B, 16);
    wmma::fill_fragment(c_frag, 0.0f);      // zero-initialize the accumulator
    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);  // c_frag += A * B
    wmma::store_matrix_sync(C, c_frag, 16, wmma::mem_row_major);
}
注意事项
- 张量核心需要特定的数据类型(FP16、BF16、INT8 等)
- 矩阵分块维度必须匹配受支持的 WMMA 形状(如 16×16×16、32×8×16、8×32×16),整体矩阵需按这些分块对齐
- 不同架构的张量核心支持不同
异步计算
异步拷贝
// Asynchronous copy demo (SM80+, <cuda_pipeline.h>).
// BUG FIX: __pipeline_memcpy_async only copies global -> SHARED memory;
// the original passed a global-memory destination, which is invalid.
// Stage through a shared buffer, then write out to global memory.
__global__ void asyncCopyKernel(float *dst, float *src, int n) {
    __shared__ float stage[256];  // assumes blockDim.x <= 256
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // Issue the async global->shared copy (bypasses registers).
        __pipeline_memcpy_async(&stage[threadIdx.x], &src[idx], sizeof(float));
    }
    __pipeline_commit();       // close the current batch of async copies
    __pipeline_wait_prior(0);  // wait until this thread's copies have landed
    __syncthreads();           // shared data visible to the whole block
    if (idx < n) {
        dst[idx] = stage[threadIdx.x];
    }
}
异步屏障
// Two-phase update separated by a libcu++ block-scope barrier
// (requires <cuda/barrier>, SM70+).
__global__ void asyncBarrierKernel(float *data, int n) {
    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);  // expect every thread in the block
    }
    __syncthreads();  // make the initialized barrier visible to all threads
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx] *= 2;
    }
    // BUG FIX: libcu++ has no free function cuda::barrier_arrive_and_wait;
    // arrive_and_wait() is a member function of cuda::barrier.
    barrier.arrive_and_wait();
    if (idx < n) {
        data[idx] += 1;
    }
}
CUDA 图高级用法
条件节点
// Build a graph with a kernel node followed by a conditional node.
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);
// Kernel node: params must include the argument array (kernelParams).
cudaGraphNode_t kernelNode, conditionalNode;
cudaKernelNodeParams kernelParams = {0};
kernelParams.func = (void*)kernel;
kernelParams.gridDim = grid;
kernelParams.blockDim = block;
kernelParams.kernelParams = kernelArgs;  // void** of kernel arguments
cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelParams);
// Conditional node (CUDA 12.3+). BUG FIX: there is no
// cudaGraphAddConditionalNode/cudaGraphConditionalParams API — the handle is
// created against the graph and the node is added through the generic
// cudaGraphAddNode with type cudaGraphNodeTypeConditional.
cudaGraphConditionalHandle conditionHandle;
cudaGraphConditionalHandleCreate(&conditionHandle, graph, 1,
                                 cudaGraphCondAssignDefault);
cudaGraphNodeParams conditionalParams = { cudaGraphNodeTypeConditional };
conditionalParams.conditional.handle = conditionHandle;
conditionalParams.conditional.type = cudaGraphCondTypeIf;
conditionalParams.conditional.size = 1;  // one body graph for the IF branch
cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &conditionalParams);
图更新
// Instantiate the graph, then patch a kernel node's parameters in the
// executable graph without re-instantiating.
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
cudaKernelNodeParams newParams = {0};
newParams.func = (void*)newKernel;
newParams.gridDim = newGrid;
newParams.blockDim = newBlock;
// BUG FIX: kernelParams must point to the argument array — the params
// struct must be complete; leaving it NULL makes the update fail.
newParams.kernelParams = newArgs;  // void** of newKernel's arguments
cudaGraphExecKernelNodeSetParams(graphExec, kernelNode, &newParams);
cudaGraphLaunch(graphExec, stream);
小结
本章介绍了 CUDA 的高级特性:
- 动态并行:核函数启动核函数
- 统一内存:CPU 和 GPU 共享地址空间
- 多 GPU 编程:设备管理、点对点通信
- 协作组扩展:Grid 同步、多 Grid 同步
- 张量核心:高性能矩阵运算
- 异步计算:异步拷贝、异步屏障
- CUDA 图高级用法:条件节点、图更新
这些高级特性可以帮助实现更复杂、更高效的 CUDA 程序。下一章将通过 实战案例 巩固所学知识。