核函数详解

核函数（Kernel）是 CUDA 编程的核心，是在 GPU 上并行执行的函数。本章将详细介绍核函数的定义、执行配置和优化技巧。

核函数基础

函数修饰符

CUDA 扩展了 C++ 的函数修饰符：

修饰符	执行位置	调用位置	返回类型
`__global__`	Device	Host 或 Device	void
`__device__`	Device	Device	任意
`__host__`	Host	Host	任意

核函数定义

__global__ void kernelName(parameters) {
}

核函数的限制：

返回类型必须是 void
不能是类的成员函数（除非使用静态成员）
不能使用可变参数
不能使用递归（CUDA 9.0 之前）
不能使用静态变量

函数组合修饰符

可以同时使用多个修饰符：

__host__ __device__ int add(int a, int b) {
    return a + b;
}

这样定义的函数可以同时在 CPU 和 GPU 上调用。

执行配置

基本语法

kernel<<<gridDim, blockDim>>>(args);

dim3 类型

dim3 是 CUDA 定义的三维向量类型：

dim3 v1;
dim3 v2(x);
dim3 v3(x, y);
dim3 v4(x, y, z);

未指定的维度默认为 1。

一维配置

int n = 10000;
int blockSize = 256;
int gridSize = (n + blockSize - 1) / blockSize;

kernel<<<gridSize, blockSize>>>(data, n);

二维配置

int width = 1024, height = 1024;
dim3 blockSize(16, 16);
dim3 gridSize((width + 15) / 16, (height + 15) / 16);

kernel<<<gridSize, blockSize>>>(data, width, height);

三维配置

int nx = 64, ny = 64, nz = 64;
dim3 blockSize(4, 4, 4);
dim3 gridSize((nx + 3) / 4, (ny + 3) / 4, (nz + 3) / 4);

kernel<<<gridSize, blockSize>>>(data, nx, ny, nz);

完整执行配置

kernel<<<gridDim, blockDim, sharedMemSize, stream>>>(args);

参数	类型	说明
gridDim	dim3/int	Grid 维度
blockDim	dim3/int	Block 维度
sharedMemSize	size_t	动态共享内存大小
stream	cudaStream_t	执行流

内置变量

核函数中可以访问以下内置变量：

线程和块索引

变量	类型	说明
`threadIdx`	uint3	线程在 Block 中的索引
`blockIdx`	uint3	Block 在 Grid 中的索引
`blockDim`	dim3	Block 的维度
`gridDim`	dim3	Grid 的维度

其他内置变量

变量	类型	说明
`warpSize`	int	Warp 大小（通常为 32）

计算全局索引

__device__ int getGlobalIdx_1D() {
    return blockIdx.x * blockDim.x + threadIdx.x;
}

__device__ int getGlobalIdx_2D() {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    return y * gridDim.x * blockDim.x + x;
}

__device__ int getGlobalIdx_3D() {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    return z * gridDim.x * blockDim.x * gridDim.y * blockDim.y 
         + y * gridDim.x * blockDim.x + x;
}

核函数限制

寄存器限制

每个线程可用的寄存器数量有限。过多的寄存器使用会导致：

占用率下降
寄存器溢出到局部内存

查看寄存器使用情况：

nvcc -Xptxas -v kernel.cu

参数传递限制

核函数参数通过常量内存传递，总大小限制为 4KB：

__global__ void kernel(float *data, int n) {
}

struct LargeStruct {
    float data[10000];
};

__global__ void badKernel(LargeStruct s) {
}

解决方案：使用指针传递大型数据。

递归限制

CUDA 9.0 之前不支持递归，CUDA 9.0+ 支持有限递归：

__global__ void recursiveKernel(int depth) {
    if (depth > 0) {
        recursiveKernel<<<1, 1>>>(depth - 1);
        cudaDeviceSynchronize();
    }
}

动态并行

动态并行允许核函数启动其他核函数（CUDA 5.0+）。

基本用法

__global__ void childKernel(int *data, int n) {
    int idx = threadIdx.x;
    if (idx < n) {
        data[idx] *= 2;
    }
}

__global__ void parentKernel(int *data, int n) {
    if (threadIdx.x == 0) {
        childKernel<<<1, n>>>(data, n);
        cudaDeviceSynchronize();
    }
}

注意事项

需要链接 CUDA 运行时：nvcc -rdc=true
子核函数在新的 Grid 中执行
父核函数必须等待子核函数完成

核函数优化

1. 减少分支分化

避免 Warp 内的分支分化：

__global__ void badBranch(int *data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        if (idx % 2 == 0) {
            data[idx] *= 2;
        } else {
            data[idx] += 1;
        }
    }
}

__global__ void goodBranch(int *data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        int warpIdx = idx % 32;
        if (warpIdx < 16) {
            data[idx] *= 2;
        } else {
            data[idx] += 1;
        }
    }
}

2. 循环展开

使用 #pragma unroll 展开循环：

__global__ void kernel(float *data) {
    int idx = threadIdx.x;
    float sum = 0.0f;
    
    #pragma unroll
    for (int i = 0; i < 32; i++) {
        sum += data[idx + i * blockDim.x];
    }
    
    data[idx] = sum;
}

3. 使用 restrict

使用 __restrict__ 告诉编译器指针不重叠：

__global__ void kernel(float * __restrict__ a, 
                       float * __restrict__ b,
                       float * __restrict__ c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

4. 使用 const 和 restrict

__global__ void kernel(const float * __restrict__ input,
                       float * __restrict__ output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        output[idx] = input[idx] * 2.0f;
    }
}

核函数模板

CUDA 支持模板核函数：

template<typename T>
__global__ void addKernel(T *a, T *b, T *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    addKernel<float><<<grid, block>>>(d_a, d_b, d_c, n);
    addKernel<int><<<grid, block>>>(d_a, d_b, d_c, n);
}

核函数类

CUDA 支持使用函数对象（Functor）：

struct AddOp {
    __device__ float operator()(float a, float b) {
        return a + b;
    }
};

struct MulOp {
    __device__ float operator()(float a, float b) {
        return a * b;
    }
};

template<typename Op>
__global__ void binaryOp(float *a, float *b, float *c, int n, Op op) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = op(a[idx], b[idx]);
    }
}

int main() {
    binaryOp<<<grid, block>>>(d_a, d_b, d_c, n, AddOp());
    binaryOp<<<grid, block>>>(d_a, d_b, d_c, n, MulOp());
}

错误处理

启动错误

kernel<<<grid, block>>>(args);

cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    printf("Launch error: %s\n", cudaGetErrorString(err));
}

执行错误

kernel<<<grid, block>>>(args);

cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
    printf("Execution error: %s\n", cudaGetErrorString(err));
}

完整错误检查

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

#define KERNEL_CHECK(call) \
    do { \
        call; \
        cudaError_t err = cudaGetLastError(); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "Kernel launch error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
        err = cudaDeviceSynchronize(); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "Kernel execution error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

KERNEL_CHECK(kernel<<<grid, block>>>(args));

小结

本章详细介绍了 CUDA 核函数：

函数修饰符：__global__、__device__、__host__
执行配置：Grid 和 Block 维度的设置
内置变量：threadIdx、blockIdx、blockDim、gridDim
核函数限制：返回类型、参数大小、递归等
动态并行：核函数启动核函数
优化技巧：减少分支分化、循环展开、使用 __restrict__

下一章将介绍同步机制，学习如何在 CUDA 程序中进行线程同步和原子操作。

核函数基础​

函数修饰符​

核函数定义​

函数组合修饰符​

执行配置​

基本语法​

dim3 类型​

一维配置​

二维配置​

三维配置​

完整执行配置​

内置变量​

线程和块索引​

其他内置变量​

计算全局索引​

核函数限制​

寄存器限制​

参数传递限制​

递归限制​

动态并行​

基本用法​

注意事项​

核函数优化​

1. 减少分支分化​

2. 循环展开​

3. 使用 restrict​

4. 使用 const 和 restrict​

核函数模板​

核函数类​

错误处理​

启动错误​

执行错误​

完整错误检查​

小结​