AI 基础设施速查表
本文档提供 AI 基础设施开发运维中常用的命令、配置和最佳实践速查。
GPU 管理
NVIDIA 驱动和工具
# 查看 GPU 信息
nvidia-smi
# 持续监控
watch -n 1 nvidia-smi
# 查看 GPU 详细信息
nvidia-smi -q
# 查看特定 GPU
nvidia-smi -i 0
# 设置 GPU 性能模式
sudo nvidia-smi -pm 1 # 启用持久模式
sudo nvidia-smi -pl 250 # 设置功耗限制
sudo nvidia-smi -ac 1215,1410 # 设置时钟频率
# 重置 GPU
sudo nvidia-smi --gpu-reset -i 0
CUDA 环境
# 查看 CUDA 版本
nvcc --version
cat /usr/local/cuda/version.txt   # 适用于 CUDA 10.x 及更早版本;CUDA 11+ 请查看 version.json 或直接用 nvcc --version
# 查看 cuDNN 版本
cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -A 2
# 设置环境变量
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
# 编译 CUDA 程序
nvcc -o myprogram myprogram.cu
nvcc -arch=sm_80 -o myprogram myprogram.cu # 指定架构
GPU 显存管理
import torch
# 查看显存使用
print(torch.cuda.memory_allocated()) # 已分配显存
print(torch.cuda.memory_reserved()) # 预留显存
print(torch.cuda.max_memory_allocated()) # 峰值显存
# 清空缓存
torch.cuda.empty_cache()
# 设置显存增长模式
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# 限制显存使用
torch.cuda.set_per_process_memory_fraction(0.8, 0)
分布式训练
PyTorch DDP
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
# 初始化
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
# 包装模型
model = DDP(model, device_ids=[local_rank])
# 清理
dist.destroy_process_group()
# 启动命令
torchrun --nproc_per_node=8 train.py
torchrun --nnodes=2 --nproc_per_node=8 --node_rank=0 \
--master_addr="10.0.0.1" --master_port=29500 train.py
DeepSpeed
# 安装
pip install deepspeed
# 启动训练
deepspeed --num_gpus=8 train.py --deepspeed_config ds_config.json
# 多机训练
deepspeed --num_nodes=2 --num_gpus=8 \
--hostfile hostfile train.py --deepspeed_config ds_config.json
ds_config.json 模板:
{
  "train_batch_size": 256,
  "gradient_accumulation_steps": 1,
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 1e-5,
      "betas": [0.9, 0.999],
      "eps": 1e-8
    }
  },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {"device": "cpu"},
    "allgather_partitions": true,
    "reduce_scatter": true
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "initial_scale_power": 16
  }
}
环境变量
# 分布式训练常用环境变量
export MASTER_ADDR=10.0.0.1
export MASTER_PORT=29500
export WORLD_SIZE=16
export RANK=0
export LOCAL_RANK=0
# NCCL 调试
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
# NCCL 性能调优
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=eth0
模型推理
vLLM
# 安装
pip install vllm
# 启动 API 服务
vllm serve meta-llama/Llama-2-7b-hf \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 2 \
--gpu-memory-utilization 0.9
# 离线推理
python -c "
from vllm import LLM, SamplingParams
llm = LLM(model='meta-llama/Llama-2-7b-hf')
outputs = llm.generate(['Hello'], SamplingParams(max_tokens=100))
print(outputs[0].outputs[0].text)
"
TensorRT-LLM
# 构建引擎
python convert_checkpoint.py --model_dir /model --output_dir /checkpoint
trtllm-build --checkpoint_dir /checkpoint --output_dir /engine
# 运行推理
python run.py --engine_dir /engine --max_output_len 100
量化配置
# bitsandbytes 量化
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_4bit=True,                      # 4bit 量化
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    device_map="auto"
)
# GPTQ 量化
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
    "model-gptq",
    device_map="auto"
)
Kubernetes
常用命令
# 节点管理
kubectl get nodes
kubectl describe node <node-name>
kubectl cordon <node-name> # 标记不可调度
kubectl uncordon <node-name> # 恢复调度
kubectl drain <node-name> # 驱逐 Pod
# Pod 管理
kubectl get pods -n <namespace>
kubectl describe pod <pod-name>
kubectl logs -f <pod-name>
kubectl exec -it <pod-name> -- /bin/bash
# Job 管理
kubectl get jobs
kubectl delete job <job-name>
kubectl logs job/<job-name>
# 资源配额
kubectl describe resourcequota -n <namespace>
kubectl top nodes
kubectl top pods
GPU Pod 配置
apiVersion: v1
kind: Pod
metadata:
  name: gpu-training
spec:
  containers:
  - name: training
    image: pytorch/pytorch:latest
    resources:
      limits:
        nvidia.com/gpu: 8
        memory: "256Gi"
        cpu: "32"
    volumeMounts:
    - name: data
      mountPath: /data
    - name: output
      mountPath: /output
  volumes:
  - name: data
    nfs:
      server: nfs-server
      path: /datasets
  - name: output
    persistentVolumeClaim:
      claimName: output-pvc
PyTorchJob 配置
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: distributed-training
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      template:
        spec:
          containers:
          - name: pytorch
            image: training:latest
            resources:
              limits:
                nvidia.com/gpu: 8
    Worker:
      replicas: 3
      template:
        spec:
          containers:
          - name: pytorch
            image: training:latest
            resources:
              limits:
                nvidia.com/gpu: 8
存储
MinIO 操作
# 启动 MinIO
docker run -d -p 9000:9000 -p 9001:9001 \
-v /data:/data \
-e MINIO_ROOT_USER=minioadmin \
-e MINIO_ROOT_PASSWORD=minioadmin \
minio/minio server /data --console-address ":9001"
# mc 客户端操作
mc alias set myminio http://localhost:9000 minioadmin minioadmin
mc ls myminio
mc cp data.parquet myminio/bucket/
mc mirror ./local-dir myminio/bucket/
数据加载优化
from torch.utils.data import DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,            # CPU 核心数
    pin_memory=True,          # 锁页内存
    prefetch_factor=2,        # 预取批次
    persistent_workers=True,  # 保持 worker
    drop_last=True            # 丢弃不完整批次
)
监控
Prometheus 指标
# GPU 指标示例(旧版 dcgm-exporter 命名;新版指标名形如 DCGM_FI_DEV_GPU_UTIL,请以实际部署版本为准)
dcgm_gpu_utilization # GPU 利用率
dcgm_mem_copy_utilization # 显存带宽利用率
dcgm_gpu_temp # GPU 温度
dcgm_power_usage # 功耗
dcgm_fb_free # 空闲显存
dcgm_fb_used # 已用显存
# 训练指标示例
training_loss # 训练损失
training_learning_rate # 学习率
training_epoch_progress # Epoch 进度
training_samples_per_second # 吞吐量
Grafana 查询
# GPU 平均利用率
avg(dcgm_gpu_utilization)
# 显存使用率
dcgm_fb_used / (dcgm_fb_used + dcgm_fb_free) * 100
# 训练吞吐量趋势
rate(training_samples_total[5m])
性能调优
常用优化技巧
# 混合精度训练
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
with autocast():
    output = model(input)
    loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# 梯度累积
accumulation_steps = 4
for i, batch in enumerate(dataloader):
    loss = model(batch) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# 梯度检查点
from torch.utils.checkpoint import checkpoint
x = checkpoint(layer, x)
# GPU 同步(计时/调试时确保之前的 CUDA 操作已完成)
torch.cuda.synchronize()
性能分析
import torch.profiler as profiler
with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=profiler.tensorboard_trace_handler('./logs'),
    record_shapes=True,
    profile_memory=True
) as p:
    model(input)
# 查看 TensorBoard
# tensorboard --logdir=./logs
故障排查
常见问题
| 问题 | 可能原因 | 解决方案 |
|---|---|---|
| CUDA OOM | 显存不足 | 减小批次、启用量化、使用 FSDP |
| NCCL 超时 | 网络问题 | 检查网络、增加超时时间 |
| GPU 利用率低 | I/O 瓶颈 | 优化数据加载、增加缓存 |
| 训练不收敛 | 学习率问题 | 调整学习率、检查梯度 |
| 死锁 | 同步问题 | 检查 barrier 调用 |
调试命令
# 检查 GPU 状态
nvidia-smi -q | grep -i error
# 检查 NCCL
python -c "import torch; print(torch.cuda.nccl.version())"
# 内存检查
cuda-memcheck ./program   # CUDA 11.6+ 已弃用,新版本请使用: compute-sanitizer ./program
# 网络检查
ibstat
ibv_devinfo