跳到主要内容

实战案例

本章通过几个经典案例,将前面学习的 CUDA 知识应用到实际场景中。

向量加法

向量加法是最简单的 CUDA 示例,适合入门学习。

完整实现

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Wrap a CUDA runtime call: on failure, print the failing file/line and the
// human-readable error string, then exit. The do { ... } while(0) shape makes
// the macro behave as a single statement (safe inside unbraced if/else).
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while(0)

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Expects a 1D launch with at least n total threads; surplus tail
// threads fall through the guard and do nothing.
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) {
        return;  // past the end of the arrays
    }
    c[i] = a[i] + b[i];
}

// Host driver: allocate two 1M-element vectors, add them on the GPU,
// time the kernel with CUDA events, and verify every element equals 3.0f.
int main() {
    const int N = 1 << 20;                 // 1M elements
    const size_t size = N * sizeof(float);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    // malloc can fail for large buffers; don't dereference NULL below.
    if (!h_a || !h_b || !h_c) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }

    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize = (N + blockSize - 1) / blockSize;  // ceil-div so the tail is covered

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);
    // Kernel launches return no status; this catches bad launch configs.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel time: %.3f ms\n", milliseconds);

    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // 1.0f + 2.0f is exact in float, but keep a tolerance for robustness.
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (fabs(h_c[i] - 3.0f) > 1e-5) {
            success = false;
            break;
        }
    }
    printf("Result: %s\n", success ? "PASS" : "FAIL");

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);

    return success ? 0 : EXIT_FAILURE;
}

矩阵乘法

矩阵乘法是展示 CUDA 性能优化的经典案例。

朴素实现

// Naive GEMM: C (M x N) = A (M x K) * B (K x N), all row-major.
// One thread computes one element of C; launch with a 2D grid where
// x covers columns and y covers rows.
__global__ void matrixMulNaive(float *A, float *B, float *C, int M, int N, int K) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= M || c >= N) return;  // outside the output matrix

    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        acc += A[r * K + k] * B[k * N + c];
    }
    C[r * N + c] = acc;
}

共享内存优化

#define TILE_SIZE 16

// Tiled GEMM: C (M x N) = A (M x K) * B (K x N), row-major.
// Each block computes one TILE_SIZE x TILE_SIZE tile of C.
// Launch with blockDim = (TILE_SIZE, TILE_SIZE) and a grid of
// ceil(N/TILE_SIZE) x ceil(M/TILE_SIZE) blocks.
__global__ void matrixMulShared(float *A, float *B, float *C, int M, int N, int K) {
// Shared-memory staging tiles for the current A and B sub-blocks.
__shared__ float sA[TILE_SIZE][TILE_SIZE];
__shared__ float sB[TILE_SIZE][TILE_SIZE];

int row = blockIdx.y * TILE_SIZE + threadIdx.y;
int col = blockIdx.x * TILE_SIZE + threadIdx.x;

float sum = 0.0f;

// March tiles along the K dimension. The trip count is the same for every
// thread in the block, so the __syncthreads() calls below are always
// reached by all threads (no divergent barrier).
for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
// Load one element of the A tile, zero-padding past the matrix edge so
// out-of-range elements contribute nothing to the dot product.
if (row < M && t * TILE_SIZE + threadIdx.x < K) {
sA[threadIdx.y][threadIdx.x] = A[row * K + t * TILE_SIZE + threadIdx.x];
} else {
sA[threadIdx.y][threadIdx.x] = 0.0f;
}

// Same for the B tile.
if (t * TILE_SIZE + threadIdx.y < K && col < N) {
sB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
} else {
sB[threadIdx.y][threadIdx.x] = 0.0f;
}

// All loads must complete before any thread reads the tiles.
__syncthreads();

for (int k = 0; k < TILE_SIZE; k++) {
sum += sA[threadIdx.y][k] * sB[k][threadIdx.x];
}

// All reads must complete before the next iteration overwrites the tiles.
__syncthreads();
}

if (row < M && col < N) {
C[row * N + col] = sum;
}
}

完整测试代码

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define TILE_SIZE 16

__global__ void matrixMulShared(float *A, float *B, float *C, int M, int N, int K);

// CPU reference implementation of C (M x N) = A (M x K) * B (K x N),
// row-major. Used to validate GPU results; O(M*N*K), no blocking.
void cpuMatrixMul(float *A, float *B, float *C, int M, int N, int K) {
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += A[row * K + k] * B[k * N + col];
            }
            C[row * N + col] = acc;
        }
    }
}

// Abort with a descriptive message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: run the tiled GEMM on 1024x1024 matrices, time it with
// CUDA events, report GFLOPS, and verify against the CPU reference
// (h_C_ref was previously allocated but never used — the GPU result was
// never actually checked).
int main() {
    int M = 1024, N = 1024, K = 1024;
    size_t sizeA = M * K * sizeof(float);
    size_t sizeB = K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);

    float *h_A = (float*)malloc(sizeA);
    float *h_B = (float*)malloc(sizeB);
    float *h_C = (float*)malloc(sizeC);
    float *h_C_ref = (float*)malloc(sizeC);
    if (!h_A || !h_B || !h_C || !h_C_ref) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    srand(time(NULL));
    for (int i = 0; i < M * K; i++) h_A[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = (float)rand() / RAND_MAX;

    float *d_A, *d_B, *d_C;
    checkCuda(cudaMalloc(&d_A, sizeA), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, sizeB), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, sizeC), "cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice), "copy A");
    checkCuda(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice), "copy B");

    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "event create");
    checkCuda(cudaEventCreate(&stop), "event create");

    checkCuda(cudaEventRecord(start), "event record");
    matrixMulShared<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
    // Launches don't return a status; fetch launch-configuration errors here.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaEventRecord(stop), "event record");
    checkCuda(cudaEventSynchronize(stop), "event sync");

    float gpuTime;
    checkCuda(cudaEventElapsedTime(&gpuTime, start, stop), "elapsed time");

    checkCuda(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost), "copy C");

    printf("GPU time: %.3f ms\n", gpuTime);
    printf("GPU performance: %.2f GFLOPS\n",
           2.0 * M * N * K / (gpuTime * 1e6));

    // Verify against the CPU reference. K=1024 float accumulations in a
    // different order will not match bit-for-bit, so compare with a tolerance.
    cpuMatrixMul(h_A, h_B, h_C_ref, M, N, K);
    bool ok = true;
    for (int i = 0; i < M * N; i++) {
        float diff = h_C[i] - h_C_ref[i];
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-2f) {
            ok = false;
            break;
        }
    }
    printf("Result: %s\n", ok ? "PASS" : "FAIL");

    checkCuda(cudaEventDestroy(start), "event destroy");
    checkCuda(cudaEventDestroy(stop), "event destroy");
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    return ok ? 0 : EXIT_FAILURE;
}

并行归约

并行归约是展示 CUDA 并行算法的经典案例。

朴素归约

// Naive block-level sum reduction (interleaved addressing), shown here as the
// baseline before optimization. Each block reduces its slice into shared
// memory, then thread 0 adds the block's partial sum into *result.
// Requires blockDim.x <= 256 (sData capacity); the stride doubling also
// assumes a power-of-two blockDim.x — TODO confirm launch configs.
// Note: accumulates into *result via atomicAdd, so the caller should
// zero-initialize it before launch.
__global__ void reduceNaive(float *data, float *result, int n) {
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;

__shared__ float sData[256];
// Zero-pad out-of-range elements so they don't affect the sum.
sData[tid] = (idx < n) ? data[idx] : 0.0f;

__syncthreads();

// Interleaved addressing: the tid % (2*stride) test leaves active threads
// scattered across warps, causing heavy warp divergence — this is exactly
// what the optimized version below removes.
for (int stride = 1; stride < blockDim.x; stride *= 2) {
if (tid % (2 * stride) == 0) {
sData[tid] += sData[tid + stride];
}
__syncthreads();
}

if (tid == 0) {
atomicAdd(result, sData[0]);
}
}

优化归约(顺序寻址,消除线程束分化并避免 Bank 冲突)

// Optimized block-level sum reduction using sequential addressing: active
// threads stay contiguous (tid < stride), so warps retire early instead of
// diverging, and consecutive threads touch consecutive shared-memory words.
// Requires blockDim.x <= 256 (sData capacity) and a power-of-two blockDim.x
// (the halving loop otherwise skips elements) — TODO confirm launch configs.
// Accumulates into *result via atomicAdd; caller should zero it first.
__global__ void reduceOptimized(float *data, float *result, int n) {
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;

__shared__ float sData[256];
sData[tid] = (idx < n) ? data[idx] : 0.0f;

__syncthreads();

// Halve the active range each pass: threads [0, stride) add in the
// element stride positions away.
for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
if (tid < stride) {
sData[tid] += sData[tid + stride];
}
__syncthreads();
}

if (tid == 0) {
atomicAdd(result, sData[0]);
}
}

Warp 级归约

// Sum-reduce a value across a full warp with shuffle-down; assumes all
// 32 lanes participate (mask 0xffffffff). After return, lane 0 holds the
// warp-wide sum; other lanes hold partial sums.
__device__ float warpReduce(float val) {
    val += __shfl_down_sync(0xffffffff, val, 16);
    val += __shfl_down_sync(0xffffffff, val, 8);
    val += __shfl_down_sync(0xffffffff, val, 4);
    val += __shfl_down_sync(0xffffffff, val, 2);
    val += __shfl_down_sync(0xffffffff, val, 1);
    return val;
}

// Two-stage block reduction: shuffle-reduce within each warp, stash one
// partial per warp in shared memory, then have the first warp reduce the
// partials and atomically add the block sum into *result.
// Assumes blockDim.x is a multiple of 32 and <= 1024 (sData holds at most
// 32 warp partials). Caller should zero *result before launch.
__global__ void reduceWarp(float *data, float *result, int n) {
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    float val = (idx < n) ? data[idx] : 0.0f;

    // One slot per warp; shared memory is uninitialized, so only slots
    // [0, numWarps) ever hold valid data.
    __shared__ float sData[32];

    val = warpReduce(val);

    // Lane 0 of each warp publishes its warp's partial sum.
    if (tid % 32 == 0) {
        sData[tid / 32] = val;
    }

    __syncthreads();

    if (tid < 32) {
        // BUG FIX: the original read sData[tid] for all tid < 32, but only
        // blockDim.x/32 slots were written — any block smaller than 1024
        // threads folded uninitialized shared memory into the sum.
        int numWarps = (blockDim.x + 31) / 32;
        val = (tid < numWarps) ? sData[tid] : 0.0f;
        val = warpReduce(val);

        if (tid == 0) {
            atomicAdd(result, val);
        }
    }
}

图像处理

灰度转换

// Convert interleaved RGB (3 bytes per pixel, R first) to 8-bit grayscale
// using the BT.601 luma weights. One thread per pixel; launch with a 2D
// grid where x covers columns and y covers rows.
__global__ void rgbToGray(unsigned char *rgb, unsigned char *gray, int width, int height) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= width || row >= height) return;

    const int pixel = row * width + col;
    const unsigned char *p = &rgb[pixel * 3];  // p[0]=R, p[1]=G, p[2]=B

    gray[pixel] = (unsigned char)(0.299f * p[0] + 0.587f * p[1] + 0.114f * p[2]);
}

高斯模糊

#define KERNEL_SIZE 5
#define KERNEL_RADIUS 2

__constant__ float gaussianKernel[KERNEL_SIZE * KERNEL_SIZE];

// 5x5 convolution over a single-channel 8-bit image with weights read from
// the constant-memory array gaussianKernel. Borders use clamp-to-edge
// sampling. One thread per output pixel (2D launch).
// NOTE(review): the result is truncated straight to unsigned char, so the
// kernel weights are presumably normalized to sum to 1 — confirm where
// gaussianKernel is filled in.
__global__ void gaussianBlur(unsigned char *input, unsigned char *output,
                             int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float acc = 0.0f;
    for (int dy = -KERNEL_RADIUS; dy <= KERNEL_RADIUS; dy++) {
        // Clamp the sample row once per outer iteration.
        const int sy = min(max(y + dy, 0), height - 1);
        for (int dx = -KERNEL_RADIUS; dx <= KERNEL_RADIUS; dx++) {
            const int sx = min(max(x + dx, 0), width - 1);
            const int w = (dy + KERNEL_RADIUS) * KERNEL_SIZE + (dx + KERNEL_RADIUS);
            acc += input[sy * width + sx] * gaussianKernel[w];
        }
    }

    output[y * width + x] = (unsigned char)acc;
}

直方图计算

// Per-block shared-memory histogram: each block accumulates a private
// histogram in shared memory, then merges it into the global one. Supports
// bins <= 256 (unsigned char input also caps usable bins at 256).
__global__ void histogramKernel(unsigned char *data, int *histogram, int n, int bins) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ int sHistogram[256];

    // BUG FIX: the original used `if (threadIdx.x < bins)` for both the
    // zero-init and the final merge, which leaves bins beyond blockDim.x
    // uninitialized/unflushed whenever bins > blockDim.x. A block-stride
    // loop covers every bin regardless of block size.
    for (int b = threadIdx.x; b < bins; b += blockDim.x) {
        sHistogram[b] = 0;
    }
    __syncthreads();

    if (idx < n) {
        // NOTE(review): data values are assumed to lie in [0, bins) — with
        // bins < 256 an out-of-range byte would write past the intended
        // range (still inside the 256-slot array); confirm callers.
        atomicAdd(&sHistogram[data[idx]], 1);
    }
    __syncthreads();

    // Merge this block's private histogram into the global result.
    for (int b = threadIdx.x; b < bins; b += blockDim.x) {
        atomicAdd(&histogram[b], sHistogram[b]);
    }
}

前缀和(Scan)

// Block-local inclusive prefix sum (Hillis-Steele). Each block scans only
// its own blockDim.x-element slice — there is no carry between blocks, so a
// multi-block scan needs a follow-up pass to add block offsets.
// Requires blockDim.x <= 1024 (sData capacity).
__global__ void scanKernel(float *input, float *output, int n) {
__shared__ float sData[1024];

int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;

// Zero-pad past the end of the input so the scan math stays valid.
sData[tid] = (idx < n) ? input[idx] : 0.0f;

__syncthreads();

// Doubling-offset passes. Each pass uses two barriers: the first ensures
// every thread has READ its neighbor's old value before the second lets
// anyone WRITE the updated value — removing either barrier is a race.
for (int offset = 1; offset < blockDim.x; offset *= 2) {
float val = 0.0f;
if (tid >= offset) {
val = sData[tid - offset];
}
__syncthreads();
if (tid >= offset) {
sData[tid] += val;
}
__syncthreads();
}

if (idx < n) {
output[idx] = sData[tid];
}
}

小结

本章通过几个经典案例展示了 CUDA 编程的实际应用:

  1. 向量加法:最基础的 CUDA 示例
  2. 矩阵乘法:展示内存优化的重要性
  3. 并行归约:展示并行算法设计
  4. 图像处理:展示二维数据处理
  5. 直方图:展示原子操作应用
  6. 前缀和:展示复杂并行算法

这些案例涵盖了 CUDA 编程的主要技术点,可以作为实际项目开发的参考。