AI 基础设施速查表
本文档提供 AI 基础设施开发运维中常用的命令、配置和最佳实践速查。
GPU 管理
GPU 规格速查
| GPU | 显存 | 带宽 | FP16 算力 | NVLink |
|---|---|---|---|---|
| A100 80GB | 80GB HBM2e | 2.0 TB/s | 312 TFLOPS | 600 GB/s |
| H100 SXM | 80GB HBM3 | 3.35 TB/s | 1,979 TFLOPS | 900 GB/s |
| H200 SXM | 141GB HBM3e | 4.8 TB/s | 1,979 TFLOPS | 900 GB/s |
| B200 | 180GB HBM3e | 8.0 TB/s | 4,500 TFLOPS | 1.8 TB/s |
NVIDIA 驱动和工具
# 查看 GPU 信息
nvidia-smi
# 持续监控
watch -n 1 nvidia-smi
# 查看 GPU 详细信息
nvidia-smi -q
# 查看特定 GPU
nvidia-smi -i 0
# 设置 GPU 性能模式
sudo nvidia-smi -pm 1 # 启用持久模式
sudo nvidia-smi -pl 250 # 设置功耗限制
sudo nvidia-smi -ac 1215,1410 # 设置时钟频率
# 重置 GPU
sudo nvidia-smi --gpu-reset -i 0
CUDA 环境
# 查看 CUDA 版本
nvcc --version
cat /usr/local/cuda/version.json  # CUDA 11.1+（更早版本为 version.txt）
# 查看 cuDNN 版本
cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -A 2
# 设置环境变量
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
# 编译 CUDA 程序
nvcc -o myprogram myprogram.cu
nvcc -arch=sm_80 -o myprogram myprogram.cu # 指定架构
GPU 显存管理
import torch
# 查看显存使用
print(torch.cuda.memory_allocated()) # 已分配显存
print(torch.cuda.memory_reserved()) # 预留显存
print(torch.cuda.max_memory_allocated()) # 峰值显存
# 清空缓存
torch.cuda.empty_cache()
# 重置峰值统计
torch.cuda.reset_peak_memory_stats()
# 详细显存信息
print(torch.cuda.memory_summary())
# 设置显存增长模式
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
# 限制显存使用
torch.cuda.set_per_process_memory_fraction(0.8, 0)
显存诊断脚本:
def print_gpu_memory():
allocated = torch.cuda.memory_allocated() / 1024**3
reserved = torch.cuda.memory_reserved() / 1024**3
max_allocated = torch.cuda.max_memory_allocated() / 1024**3
print(f"已分配: {allocated:.2f} GB | 预留: {reserved:.2f} GB | 峰值: {max_allocated:.2f} GB")
分布式训练
PyTorch DDP
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
# 初始化
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
# 包装模型
model = DDP(model, device_ids=[local_rank])
# 清理
dist.destroy_process_group()
# 启动命令
torchrun --nproc_per_node=8 train.py
torchrun --nnodes=2 --nproc_per_node=8 --node_rank=0 \
--master_addr="10.0.0.1" --master_port=29500 train.py
DeepSpeed
# 安装
pip install deepspeed
# 启动训练
deepspeed --num_gpus=8 train.py --deepspeed_config ds_config.json
# 多机训练
deepspeed --num_nodes=2 --num_gpus=8 \
--hostfile hostfile train.py --deepspeed_config ds_config.json
ds_config.json 模板:
{
"train_batch_size": 256,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "AdamW",
"params": {
"lr": 1e-5,
"betas": [0.9, 0.999],
"eps": 1e-8
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {"device": "cpu"},
"allgather_partitions": true,
"reduce_scatter": true
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"initial_scale_power": 16
}
}
环境变量
# 分布式训练常用环境变量
export MASTER_ADDR=10.0.0.1
export MASTER_PORT=29500
export WORLD_SIZE=16
export RANK=0
export LOCAL_RANK=0
# NCCL 调试
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
# NCCL 性能调优
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=eth0
模型推理
vLLM
# 安装
pip install vllm
# 启动 API 服务
vllm serve meta-llama/Llama-2-7b-hf \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 2 \
--gpu-memory-utilization 0.9
# 离线推理
python -c "
from vllm import LLM, SamplingParams
llm = LLM(model='meta-llama/Llama-2-7b-hf')
outputs = llm.generate(['Hello'], SamplingParams(max_tokens=100))
print(outputs[0].outputs[0].text)
"
TensorRT-LLM
# 构建引擎
python convert_checkpoint.py --model_dir /model --output_dir /checkpoint
trtllm-build --checkpoint_dir /checkpoint --output_dir /engine
# 运行推理
python run.py --engine_dir /engine --max_output_len 100
量化配置
# bitsandbytes 量化
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_4bit=True, # 4bit 量化
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
device_map="auto"
)
# GPTQ 量化
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
"model-gptq",
device_map="auto"
)
Kubernetes
常用命令
# 节点管理
kubectl get nodes
kubectl describe node <node-name>
kubectl cordon <node-name> # 标记不可调度
kubectl uncordon <node-name> # 恢复调度
kubectl drain <node-name> # 驱逐 Pod
# Pod 管理
kubectl get pods -n <namespace>
kubectl describe pod <pod-name>
kubectl logs -f <pod-name>
kubectl exec -it <pod-name> -- /bin/bash
# Job 管理
kubectl get jobs
kubectl delete job <job-name>
kubectl logs job/<job-name>
# 资源配额
kubectl describe resourcequota -n <namespace>
kubectl top nodes
kubectl top pods
GPU Pod 配置
apiVersion: v1
kind: Pod
metadata:
name: gpu-training
spec:
containers:
- name: training
image: pytorch/pytorch:latest
resources:
limits:
nvidia.com/gpu: 8
memory: "256Gi"
cpu: "32"
volumeMounts:
- name: data
mountPath: /data
- name: output
mountPath: /output
volumes:
- name: data
nfs:
server: nfs-server
path: /datasets
- name: output
persistentVolumeClaim:
claimName: output-pvc
PyTorchJob 配置
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
name: distributed-training
spec:
pytorchReplicaSpecs:
Master:
replicas: 1
template:
spec:
containers:
- name: pytorch
image: training:latest
resources:
limits:
nvidia.com/gpu: 8
Worker:
replicas: 3
template:
spec:
containers:
- name: pytorch
image: training:latest
resources:
limits:
nvidia.com/gpu: 8
存储
MinIO 操作
# 启动 MinIO
docker run -d -p 9000:9000 -p 9001:9001 \
-v /data:/data \
-e MINIO_ROOT_USER=minioadmin \
-e MINIO_ROOT_PASSWORD=minioadmin \
minio/minio server /data --console-address ":9001"
# mc 客户端操作
mc alias set myminio http://localhost:9000 minioadmin minioadmin
mc ls myminio
mc cp data.parquet myminio/bucket/
mc mirror ./local-dir myminio/bucket/
数据加载优化
from torch.utils.data import DataLoader
dataloader = DataLoader(
dataset,
batch_size=32,
shuffle=True,
num_workers=8, # CPU 核心数
pin_memory=True, # 锁页内存
prefetch_factor=2, # 预取批次
persistent_workers=True, # 保持 worker
drop_last=True # 丢弃不完整批次
)
监控
Prometheus 指标
# GPU 指标示例
dcgm_gpu_utilization # GPU 利用率
dcgm_mem_copy_utilization # 显存带宽利用率
dcgm_gpu_temp # GPU 温度
dcgm_power_usage # 功耗
dcgm_fb_free # 空闲显存
dcgm_fb_used # 已用显存
# 训练指标示例
training_loss # 训练损失
training_learning_rate # 学习率
training_epoch_progress # Epoch 进度
training_samples_per_second # 吞吐量
Grafana 查询
# GPU 平均利用率
avg(dcgm_gpu_utilization)
# 显存使用率
dcgm_fb_used / (dcgm_fb_used + dcgm_fb_free) * 100
# 训练吞吐量趋势
rate(training_samples_total[5m])
性能调优
常用优化技巧
# 混合精度训练
from torch.cuda.amp import autocast, GradScaler  # 注：PyTorch 2.4+ 建议改用 torch.amp
scaler = GradScaler()
with autocast():
output = model(input)
loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# 梯度累积
accumulation_steps = 4
for i, batch in enumerate(dataloader):
loss = model(batch) / accumulation_steps
loss.backward()
if (i + 1) % accumulation_steps == 0:
optimizer.step()
optimizer.zero_grad()
# 梯度检查点
from torch.utils.checkpoint import checkpoint
x = checkpoint(layer, x)
# 数据预取
torch.cuda.synchronize() # 同步 GPU
性能分析
import torch.profiler as profiler
with profiler.profile(
activities=[
profiler.ProfilerActivity.CPU,
profiler.ProfilerActivity.CUDA,
],
on_trace_ready=profiler.tensorboard_trace_handler('./logs'),
record_shapes=True,
profile_memory=True
) as p:
model(input)
# 查看 TensorBoard
# tensorboard --logdir=./logs
故障排查
常见问题
| 问题 | 可能原因 | 解决方案 |
|---|---|---|
| CUDA OOM | 显存不足 | 减小批次、启用量化、使用 FSDP |
| NCCL 超时 | 网络问题 | 检查网络、增加超时时间 |
| GPU 利用率低 | I/O 瓶颈 | 优化数据加载、增加缓存 |
| 训练不收敛 | 学习率问题 | 调整学习率、检查梯度 |
| 死锁 | 同步问题 | 检查 barrier 调用 |
调试命令
# 检查 GPU 状态
nvidia-smi -q | grep -i error
# 检查 NCCL
python -c "import torch; print(torch.cuda.nccl.version())"
# 内存检查
cuda-memcheck ./program  # 注：CUDA 12+ 已移除，请改用 compute-sanitizer
# 网络检查
ibstat
ibv_devinfo
监控与可观测性
DCGM Exporter
# Docker 部署
docker run -d --name dcgm-exporter --gpus all \
-p 9400:9400 \
nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.4.0-ubuntu22.04
# 验证指标
curl http://localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL
Prometheus 查询
# GPU 平均利用率
avg(DCGM_FI_DEV_GPU_UTIL)
# GPU 显存使用率
DCGM_FI_DEV_FB_USED / (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE) * 100
# 训练吞吐量
rate(training_samples_total[1m])
# GPU 温度告警
DCGM_FI_DEV_GPU_TEMP > 85
TensorBoard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir="runs/experiment")
writer.add_scalar('Loss/train', loss, global_step)
writer.add_scalar('Learning_Rate', lr, global_step)
writer.add_histogram('Gradients/layer1', param.grad, epoch)
writer.close()
tensorboard --logdir=runs --port=6006
Weights & Biases
import wandb
wandb.init(project="my-project", config={"lr": 1e-4, "batch_size": 32})
wandb.log({"loss": loss, "accuracy": acc})
wandb.finish()
故障排查
常见错误速查
| 错误 | 原因 | 解决方案 |
|---|---|---|
| CUDA out of memory | 显存不足 | 减批次、启用量化、FSDP |
| NCCL error: unhandled cuda error | CUDA 操作失败 | 先解决 CUDA 错误 |
| NCCL WARN CANNOT RECV FROM PEER | 网络问题 | 检查 IB/网络连接 |
| CUDA error: illegal memory access | 内存越界 | cuda-memcheck 检查 |
| 训练卡住 | 可能死锁 | 检查 barrier/sync 调用 |
NCCL 调试
# 启用调试日志
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
# 网络配置
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_HCA=mlx5_0
export NCCL_IB_DISABLE=0
# 超时设置
export NCCL_TIMEOUT=1800
export NCCL_BLOCKING_WAIT=1
CUDA 内存检查
# 内存错误检测（cuda-memcheck 在 CUDA 12+ 已被 compute-sanitizer 取代）
cuda-memcheck ./program
# 详细检查（新版工具链等价命令）
compute-sanitizer --tool memcheck --leak-check full ./program
显存诊断
import torch
# 显存统计
print(f"已分配: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print(f"已预留: {torch.cuda.memory_reserved()/1e9:.2f} GB")
print(f"峰值: {torch.cuda.max_memory_allocated()/1e9:.2f} GB")
# 清空缓存
torch.cuda.empty_cache()
# 详细信息
print(torch.cuda.memory_summary())
网络检查
# TCP 连通性
ping <node-ip>
nc -zv <node-ip> 29500
# InfiniBand 状态
ibstat
ibping <node-guid>
# NVLink 状态
nvidia-smi nvlink --status
nvidia-smi topo -m
检查点管理
保存检查点
checkpoint = {
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'scheduler_state_dict': scheduler.state_dict(),
'loss': loss,
'random_state': torch.get_rng_state(),
'cuda_random_state': torch.cuda.get_rng_state(),
}
torch.save(checkpoint, 'checkpoint.pt')
加载检查点
checkpoint = torch.load('checkpoint.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
torch.set_rng_state(checkpoint['random_state'])
torch.cuda.set_rng_state(checkpoint['cuda_random_state'])
epoch = checkpoint['epoch']
FSDP 检查点
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import StateDictType, FullStateDictConfig
FSDP.set_state_dict_type(
model,
StateDictType.FULL_STATE_DICT,
FullStateDictConfig(rank0_only=True)
)
# 保存(仅 rank 0)
if dist.get_rank() == 0:
torch.save(model.state_dict(), "checkpoint.pt")