跳到主要内容

高级特性

本章介绍 CUDA 的高级特性,包括动态并行、统一内存、多 GPU 编程等。

动态并行

动态并行允许核函数在 GPU 上启动其他核函数,无需返回 CPU。

基本用法

// Child kernel: doubles every element of `data` in place.
// Launched from device code by parentKernel (dynamic parallelism).
__global__ void childKernel(int *data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) {
        return;  // guard the grid tail
    }
    data[tid] *= 2;
}

// Parent kernel: launches childKernel from the device (dynamic parallelism).
// Fixes vs. the original snippet:
//  - <<<1, n>>> fails whenever n exceeds the per-block thread limit (1024);
//    use a multi-block launch with a fixed block size instead.
//  - Device-side cudaDeviceSynchronize() is deprecated since CUDA 11.6 and
//    removed in CUDA 12. It is also unnecessary here: a child grid always
//    completes before its parent grid is considered finished.
__global__ void parentKernel(int *data, int n) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        int threads = 256;
        int blocks = (n + threads - 1) / threads;  // ceil-div covers all n
        childKernel<<<blocks, threads>>>(data, n);
    }
}

// Host driver for the dynamic-parallelism example.
// Fixes vs. the original snippet: N was undefined, and no CUDA call was
// checked for errors (kernel launches fail silently without a check).
int main() {
    const int N = 1024;  // element count (was undefined in the original)
    int *d_data;

    // Every CUDA runtime call returns a cudaError_t that must be checked.
    cudaError_t err = cudaMalloc(&d_data, N * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    parentKernel<<<1, 1>>>(d_data, N);
    err = cudaGetLastError();            // catches launch-configuration errors
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();   // catches asynchronous execution errors
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_data);
    return 0;
}

编译选项

动态并行需要启用可重定位设备代码:

nvcc -rdc=true dynamic_parallel.cu -o dynamic_parallel

嵌套归约示例

// Recursive tree reduction using dynamic parallelism; the running sum ends
// up in data[0]. Presumably invoked with depth = 1 so the first level adds
// the upper half onto the lower half (offset = blockDim.x / 2) -- TODO
// confirm against the (unshown) host-side call.
//
// Fix vs. the original: recursion stopped at a hard-coded `depth < 5`,
// which leaves the reduction unfinished whenever blockDim.x > 32 (with 256
// threads it stops at offset = 8). Recurse until offset reaches 1 instead;
// CUDA's nesting-depth limit (24) is never hit because the depth is bounded
// by log2(blockDim.x) <= 10.
__global__ void nestedReduce(int *data, int n, int depth) {
    int tid = threadIdx.x;
    int offset = blockDim.x >> depth;  // pair distance at this level

    if (offset > 0) {
        if (tid < offset) {
            data[tid] += data[tid + offset];
        }

        // Safe barrier: the enclosing condition is uniform across the block
        // (offset does not depend on tid), so every thread reaches it.
        __syncthreads();

        // One thread launches the next level; the child grid completes
        // before this parent grid is considered finished.
        if (tid == 0 && offset > 1) {
            nestedReduce<<<1, blockDim.x>>>(data, n, depth + 1);
        }
    }
}

注意事项

  1. 子核函数启动有额外开销
  2. 需要足够的资源支持嵌套执行
  3. 最大嵌套深度有限制

统一内存

统一内存提供了一个单一的内存地址空间,CPU 和 GPU 都可以访问。

基本用法

// Unified-memory example: one allocation is visible from both host and
// device, so no explicit cudaMemcpy is needed in either direction.
// (`kernel`, `grid` and `block` are defined outside this snippet.)
int main() {
    int n = 1024;
    float *data;

    // Single managed allocation shared by CPU and GPU.
    cudaMallocManaged(&data, n * sizeof(float));

    // Host-side initialization, directly through the managed pointer.
    for (int i = 0; i != n; ++i) {
        data[i] = 1.0f;
    }

    kernel<<<grid, block>>>(data, n);
    cudaDeviceSynchronize();  // wait before the host touches the data again

    // Host reads the results from the very same pointer.
    for (int i = 0; i != n; ++i) {
        printf("%f ", data[i]);
    }

    cudaFree(data);
}

数据预取

使用 cudaMemPrefetchAsync 预取数据到指定设备:

// Prefetching managed memory: move pages to the GPU before the kernel runs
// (avoiding demand page faults) and back to the CPU before the host reads.
// (`N`, `kernel`, `grid` and `block` are defined outside this snippet.)
int main() {
    float *data;
    cudaMallocManaged(&data, N * sizeof(float));

    // Migrate the pages to device 0 ahead of the launch.
    cudaMemPrefetchAsync(data, N * sizeof(float), 0);

    kernel<<<grid, block>>>(data, N);

    // Stream-ordered after the kernel: bring the pages back to host memory.
    cudaMemPrefetchAsync(data, N * sizeof(float), cudaCpuDeviceId);
    cudaDeviceSynchronize();

    for (int idx = 0; idx < N; ++idx) {
        printf("%f ", data[idx]);
    }

    cudaFree(data);
}

内存建议

使用 cudaMemAdvise 提供内存访问建议:

cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, 0);
cudaMemAdvise(data, size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);

统一内存优势

  1. 简化编程模型
  2. 自动数据迁移
  3. 支持超额订阅(数据量超过 GPU 内存)
  4. 更容易实现 CPU/GPU 协作

多 GPU 编程

设备管理

// Enumerate every CUDA device visible to this process and print its name.
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);

for (int dev = 0; dev < deviceCount; ++dev) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("Device %d: %s\n", dev, prop.name);
}

设备选择

// Make device 0 current for this host thread; the launch below targets it.
cudaSetDevice(0);
kernel<<<grid, block>>>(data0);

// Subsequent CUDA calls from this thread now target device 1.
cudaSetDevice(1);
kernel<<<grid, block>>>(data1);

点对点通信

检查点对点访问:

int canAccess;
// Query whether device 0 can directly read/write device 1's memory.
cudaDeviceCanAccessPeer(&canAccess, 0, 1);

if (canAccess) {
cudaSetDevice(0);
// Enables access in ONE direction only (device 0 -> device 1); the second
// argument is a flags field that must currently be 0.
cudaDeviceEnablePeerAccess(1, 0);

// Copy data0 (on device 0) into data1 (on device 1).
// NOTE(review): cudaMemcpyPeer works even without peer access enabled (the
// runtime stages through host memory); enabling P2P lets it use the direct
// interconnect path instead.
cudaMemcpyPeer(data1, 1, data0, 0, size);
}

多 GPU 示例

// One-stream-per-GPU pipeline: copy in, compute, copy out on every device
// concurrently, then synchronize and clean up.
// (`size`, `h_data`, `h_result`, `kernel`, `grid`, `block` and `N` are
// defined outside this snippet.)
//
// Fix vs. the original: `float *d_data[deviceCount]` is a variable-length
// array, which is not standard C++ (a compiler extension); allocate the
// per-device tables on the heap instead.
int main() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    float **d_data = new float*[deviceCount];
    cudaStream_t *streams = new cudaStream_t[deviceCount];

    // Per-device setup: a buffer plus one stream per GPU.
    for (int i = 0; i < deviceCount; i++) {
        cudaSetDevice(i);
        cudaMalloc(&d_data[i], size);
        cudaStreamCreate(&streams[i]);
    }

    // Issue copy/compute/copy on each device's stream; the loop does not
    // block, so all GPUs work at the same time.
    // NOTE(review): the copies are only truly asynchronous if h_data and
    // h_result are pinned host memory (cudaMallocHost) -- confirm at their
    // allocation site, which is outside this snippet.
    for (int i = 0; i < deviceCount; i++) {
        cudaSetDevice(i);
        cudaMemcpyAsync(d_data[i], h_data, size, cudaMemcpyHostToDevice, streams[i]);
        kernel<<<grid, block, 0, streams[i]>>>(d_data[i], N);
        cudaMemcpyAsync(h_result[i], d_data[i], size, cudaMemcpyDeviceToHost, streams[i]);
    }

    // Wait for each device to finish, then release its resources.
    for (int i = 0; i < deviceCount; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_data[i]);
    }

    delete[] streams;
    delete[] d_data;
}

协作组扩展

Grid 同步

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Cooperative kernel: all blocks meet at grid-wide barriers. Must be
// launched with cudaLaunchCooperativeKernel (a regular <<<>>> launch cannot
// use cg::sync(grid)).
//
// Fix vs. the original: after a single grid sync,
// `data[idx] = data[(idx + 1) % n]` races -- thread i reads data[idx + 1]
// while thread idx + 1 is overwriting that same slot. Split the rotation
// into a read phase and a write phase separated by a second barrier.
__global__ void gridSyncKernel(float *data, int n) {
    cg::grid_group grid = cg::this_grid();

    int idx = grid.thread_rank();
    if (idx < n) {
        data[idx] = idx;
    }

    // Every thread in the whole grid reaches this point before any proceeds.
    cg::sync(grid);

    // Rotate left by one: read the neighbor into a register, barrier, write.
    float next = 0.0f;
    if (idx < n) {
        next = data[(idx + 1) % n];
    }
    cg::sync(grid);
    if (idx < n) {
        data[idx] = next;
    }
}

// Host driver for the cooperative (grid-sync) kernel.
// (`N` and `d_data` are defined outside this snippet.)
int main() {
// Launch configuration sized over the data (ceil-div).
int blockSize = 256;
int gridSize = (N + blockSize - 1) / blockSize;

// Kernel arguments are passed as an array of pointers to each argument.
void *args[] = {&d_data, &N};
// Cooperative launch: required for cg::this_grid() / grid-wide sync.
// NOTE(review): gridSize must not exceed the number of blocks that can be
// co-resident on the device (cudaOccupancyMaxActiveBlocksPerMultiprocessor
// times the SM count), otherwise the launch fails with
// cudaErrorCooperativeLaunchTooLarge -- this snippet does not check that.
cudaLaunchCooperativeKernel((void*)gridSyncKernel, gridSize, blockSize, args);

cudaDeviceSynchronize();
}

多 Grid 同步

// Kernel whose threads synchronize across grids running on several devices.
// NOTE(review): the multi-grid cooperative-groups API (cg::multi_grid_group
// together with cudaLaunchCooperativeKernelMultiDevice) was deprecated in
// CUDA 11.3 -- confirm the toolkit this tutorial targets still supports it.
__global__ void multiGridKernel(float *data, int n) {
// Handle covering ALL grids participating in the multi-device launch.
cg::multi_grid_group mg = cg::this_multi_grid();

// Rank of this thread across every participating grid/device.
int idx = mg.thread_rank();
if (idx < n) {
data[idx] = idx;
}

// Barrier spanning every grid in the multi-device launch.
cg::sync(mg);
}

// Host driver for the multi-device cooperative launch.
// (`gridSize`, `blockSize`, `args` and `streams` are defined outside this
// snippet.)
int main() {
// One launch-parameter struct per participating device.
cudaLaunchParams params[2];

for (int i = 0; i < 2; i++) {
params[i].func = (void*)multiGridKernel;
params[i].gridDim = gridSize;
params[i].blockDim = blockSize;
params[i].args = args;
params[i].sharedMem = 0;
// Each grid must be launched on a stream belonging to its own device.
params[i].stream = streams[i];
}

// Launch one cooperative multi-grid across 2 devices.
// NOTE(review): deprecated since CUDA 11.3 -- verify against the CUDA
// version this tutorial targets.
cudaLaunchCooperativeKernelMultiDevice(params, 2);
}

张量核心

张量核心是 NVIDIA GPU 上专门用于矩阵乘法的硬件单元。

WMMA API

#include <mma.h>

using namespace nvcuda;

// Single-tile 16x16x16 matrix multiply-accumulate on tensor cores:
// C = A * B with FP16 inputs and FP32 accumulation. Requires SM70+.
// WMMA operations are warp-cooperative: all 32 threads of a warp execute
// each wmma call together and jointly hold the fragment contents.
__global__ void tensorCoreKernel(half *A, half *B, float *C, int M, int N, int K) {
// Fragment declarations fix the tile shape (16x16x16) and memory layouts.
wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;

// The trailing 16 is the leading dimension (element stride between rows /
// columns) of the source matrices.
wmma::load_matrix_sync(a_frag, A, 16);
wmma::load_matrix_sync(b_frag, B, 16);
wmma::fill_fragment(c_frag, 0.0f);  // zero the accumulator

wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);  // c_frag += a_frag * b_frag

wmma::store_matrix_sync(C, c_frag, 16, wmma::mem_row_major);
// NOTE(review): M, N, K are unused -- this kernel handles exactly one
// 16x16x16 tile; a full GEMM would loop/tile over them.
}

注意事项

  1. 张量核心需要特定的数据类型(FP16、BF16、INT8 等)
  2. 矩阵维度需要是 16 的倍数
  3. 不同架构的张量核心支持不同

异步计算

异步拷贝

// Demonstrates the asynchronous-copy pipeline (hardware-accelerated on
// SM80+; earlier architectures fall back to a synchronous path).
// Needs #include <cuda_pipeline.h>. Assumes blockDim.x <= 256.
//
// Fix vs. the original: __pipeline_memcpy_async requires the DESTINATION to
// be in shared memory -- a global-to-global copy is invalid. Stage each
// block's elements through a shared buffer instead.
__global__ void asyncCopyKernel(float *dst, float *src, int n) {
    __shared__ float staging[256];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Global -> shared copy that may proceed asynchronously.
    if (idx < n) {
        __pipeline_memcpy_async(&staging[threadIdx.x], &src[idx], sizeof(float));
    }

    // Commit the batch and wait until it has landed in shared memory.
    // (Commit/wait are per-thread, so issuing them outside the guard is fine.)
    __pipeline_commit();
    __pipeline_wait_prior(0);

    // Shared -> global: make the copied value visible to the caller.
    if (idx < n) {
        dst[idx] = staging[threadIdx.x];
    }
}

异步屏障

// Doubles, then increments, each element -- the two phases separated by a
// libcu++ block-scope barrier. Needs #include <cuda/barrier>.
//
// Fix vs. the original: there is no free function
// cuda::barrier_arrive_and_wait; the libcu++ API is the member function
// barrier.arrive_and_wait().
__global__ void asyncBarrierKernel(float *data, int n) {
    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;

    // One thread initializes the barrier with the participant count;
    // __syncthreads() publishes the initialized barrier to the whole block.
    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }
    __syncthreads();

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx] *= 2;
    }

    // Every thread in the block arrives; all wait until the count is met.
    // This call is outside any divergent branch, as the barrier requires.
    barrier.arrive_and_wait();

    if (idx < n) {
        data[idx] += 1;
    }
}

CUDA 图高级用法

条件节点

// Build a graph by hand: a kernel node followed by a conditional node.
cudaGraph_t graph;
cudaGraphCreate(&graph, 0);

cudaGraphNode_t kernelNode, conditionalNode;

// Zero-initialize, then fill in the launch configuration for the node.
cudaKernelNodeParams kernelParams = {0};
kernelParams.func = (void*)kernel;
kernelParams.gridDim = grid;
kernelParams.blockDim = block;
// NOTE(review): kernelParams.kernelParams (the argument array) is left NULL
// by the zero-init -- valid only if `kernel` takes no arguments; confirm.

cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelParams);

cudaGraphConditionalParams conditionalParams = {0};
// `conditionHandle` (created elsewhere with cudaGraphConditionalHandleCreate)
// is what a kernel sets to decide whether the conditional body runs.
conditionalParams.handle = conditionHandle;

// Conditional node depending on kernelNode (conditional nodes need CUDA 12.3+).
// NOTE(review): current toolkits add conditional nodes via cudaGraphAddNode
// with cudaGraphNodeParams of type cudaGraphNodeTypeConditional -- verify
// that a cudaGraphAddConditionalNode entry point exists in the targeted
// CUDA version.
cudaGraphAddConditionalNode(&conditionalNode, graph, &kernelNode, 1, &conditionalParams);

图更新

// Instantiate the graph into an executable form.
cudaGraphExec_t graphExec;
cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);

// New launch parameters for an existing kernel node.
cudaKernelNodeParams newParams = {0};
newParams.func = (void*)newKernel;
newParams.gridDim = newGrid;
newParams.blockDim = newBlock;
// NOTE(review): newParams.kernelParams is left NULL by the zero-init --
// valid only if `newKernel` takes no arguments; confirm.

// Patch the instantiated graph in place -- much cheaper than destroying and
// re-instantiating it. The new configuration must be update-compatible with
// the original node.
cudaGraphExecKernelNodeSetParams(graphExec, kernelNode, &newParams);

cudaGraphLaunch(graphExec, stream);

小结

本章介绍了 CUDA 的高级特性:

  1. 动态并行:核函数启动核函数
  2. 统一内存:CPU 和 GPU 共享地址空间
  3. 多 GPU 编程:设备管理、点对点通信
  4. 协作组扩展:Grid 同步、多 Grid 同步
  5. 张量核心:高性能矩阵运算
  6. 异步计算:异步拷贝、异步屏障
  7. CUDA 图高级用法:条件节点、图更新

这些高级特性可以帮助实现更复杂、更高效的 CUDA 程序。下一章将通过实战案例巩固所学知识。