模型训练
本章将详细介绍 PyTorch 中的模型训练流程,包括损失函数、优化器、学习率调度、训练循环和模型评估。
训练流程概述
PyTorch 的标准训练流程如下:
┌─────────────────────────────────────────────────────────┐
│ 训练流程 │
├─────────────────────────────────────────────────────────┤
│ 1. 数据准备 → DataLoader │
│ ↓ │
│ 2. 模型创建 → nn.Module │
│ ↓ │
│ 3. 损失函数 → nn.CrossEntropyLoss 等 │
│ ↓ │
│ 4. 优化器 → optim.Optimizer │
│ ↓ │
│ 5. 训练循环: │
│ for epoch in range(epochs): │
│ for batch in dataloader: │
│ ① 前向传播 (forward) │
│ ② 计算损失 (compute loss) │
│ ③ 清零梯度 (zero grad) │
│ ④ 反向传播 (backward) │
│ ⑤ 更新参数 (optimizer step) │
│ ↓ │
│ 6. 模型评估 │
└─────────────────────────────────────────────────────────┘
损失函数
损失函数衡量模型预测值与真实值之间的差异。
常用损失函数
import torch
import torch.nn as nn

# Cross-entropy loss (multi-class classification).
# Expects raw logits: softmax is applied internally.
criterion = nn.CrossEntropyLoss()
output = torch.randn(32, 10)          # batch_size=32, 10 classes
target = torch.randint(0, 10, (32,))  # ground-truth class indices
loss = criterion(output, target)

# Binary cross-entropy loss.
# BCELoss expects probabilities, so apply sigmoid first.
# NOTE: prefer nn.BCEWithLogitsLoss, which fuses the sigmoid and is
# numerically more stable.
criterion = nn.BCELoss()
output = torch.sigmoid(torch.randn(32, 1))
target = torch.randint(0, 2, (32, 1)).float()
loss = criterion(output, target)

# Weighted cross-entropy (useful when classes are imbalanced).
weights = torch.tensor([1.0, 2.0, 3.0])  # per-class weights
criterion = nn.CrossEntropyLoss(weight=weights)

# Mean squared error (regression).
criterion = nn.MSELoss()
output = torch.randn(32, 1)
target = torch.randn(32, 1)
loss = criterion(output, target)

# L1 loss (regression).
criterion = nn.L1Loss()

# Smooth L1 / Huber loss (regression).
criterion = nn.SmoothL1Loss()

# Negative log-likelihood loss: the input must be LOG-probabilities
# (e.g. the output of log_softmax), not raw logits.
criterion = nn.NLLLoss()
log_probs = torch.log_softmax(torch.randn(32, 10), dim=1)
target = torch.randint(0, 10, (32,))
loss = criterion(log_probs, target)
损失函数选择指南
| 任务类型 | 推荐损失函数 | 说明 |
|---|---|---|
| 多分类 | CrossEntropyLoss | 最常用 |
| 二分类 | BCEWithLogitsLoss | 内置sigmoid |
| 回归 | MSELoss / SmoothL1Loss | 连续值预测 |
| 序列到序列 | CTCLoss | 语音识别 |
优化器
优化器负责根据梯度更新模型参数。
常用优化器
import torch.optim as optim

# SGD (stochastic gradient descent)
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,            # learning rate
    momentum=0.9,       # momentum factor
    weight_decay=1e-4,  # L2 regularization
)

# Adam — adaptive learning rates, a solid general-purpose default
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),  # exponential decay rates for the moment estimates
    eps=1e-8,            # small constant for numerical stability
    weight_decay=1e-4,
)

# AdamW — Adam with decoupled weight decay, recommended for transformers
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

# RMSprop
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, momentum=0.9)

# Adamax
optimizer = optim.Adamax(model.parameters(), lr=0.002, weight_decay=1e-4)
优化器对比
| 优化器 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| SGD | 稳定、理论基础好 | 收敛慢、需要调参 | 学术研究 |
| Adam | 收敛快、自适应 | 可能泛化差 | 通用 |
| AdamW | 收敛快、泛化好 | - | Transformer |
| RMSprop | 适合循环神经网络 | - | RNN |
学习率调度
学习率调度可以动态调整学习率,提高训练效果。
常用调度器
import torch.optim.lr_scheduler as lr_scheduler

# Step decay: multiply the LR by `gamma` every `step_size` epochs.
scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=10,  # adjust every 10 epochs
    gamma=0.1      # multiply the LR by 0.1
)

# Multi-step decay: decay at the listed epochs.
scheduler = lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[30, 60, 90],  # decay at epochs 30, 60 and 90
    gamma=0.1
)

# Exponential decay.
scheduler = lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.95  # multiply the LR by 0.95 every epoch
)

# Cosine annealing.
scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=50,     # number of steps to anneal from the initial LR to eta_min
    eta_min=1e-6  # minimum learning rate
)

# Cosine annealing with warm restarts.
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,   # length of the first cycle
    T_mult=2  # cycle-length multiplier applied after each restart
)

# Warmup + cosine decay: OneCycleLR ships with PyTorch as part of
# torch.optim.lr_scheduler — no separate installation is required.
# scheduler = lr_scheduler.OneCycleLR(...)
完整训练示例
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler  # needed for StepLR below
from torch.utils.data import DataLoader, TensorDataset

# 1. Prepare the data.
x_train = torch.randn(1000, 20)
y_train = torch.randn(1000, 1)
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 2. Build the model.
class SimpleNet(nn.Module):
    """A small 3-layer MLP for regression: 20 -> 64 -> 32 -> 1."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(20, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)  # no activation on the output (regression)
        return x

model = SimpleNet()

# 3. Loss function, optimizer and LR scheduler.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# 4. Training loop.
num_epochs = 50
for epoch in range(num_epochs):
    # Training phase.
    model.train()
    train_loss = 0.0
    for batch_x, batch_y in train_loader:
        # Forward pass.
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        # Backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping (guards against exploding gradients).
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
    # Advance the LR schedule once per epoch.
    scheduler.step()
    # Report progress every 5 epochs.
    avg_loss = train_loss / len(train_loader)
    current_lr = optimizer.param_groups[0]['lr']
    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, LR: {current_lr:.6f}")
print("训练完成!")
模型验证
# Build the validation set.
x_val = torch.randn(200, 20)
y_val = torch.randn(200, 1)
val_dataset = TensorDataset(x_val, y_val)
# shuffle=False: evaluation order does not affect the averaged metric.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
def evaluate(model, loader, criterion):
    """Return the average per-batch loss of `model` over `loader`.

    Runs in eval mode with gradient tracking disabled.
    """
    model.eval()
    total_loss = 0.0
    correct = 0  # kept for the classification variant sketched below
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            outputs = model(batch_x)
            total_loss += criterion(outputs, batch_y).item()
            # Classification variant (accuracy tracking):
            # _, predicted = torch.max(outputs, 1)
            # total += batch_y.size(0)
            # correct += (predicted == batch_y).sum().item()
    return total_loss / len(loader)
# Full train + validate loop with best-model checkpointing.
best_val_loss = float('inf')
for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    train_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # --- validation pass ---
    val_loss = evaluate(model, val_loader, criterion)
    # Persist the weights whenever validation improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"保存最佳模型,验证损失: {val_loss:.4f}")
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Train Loss={train_loss/len(train_loader):.4f}, Val Loss={val_loss:.4f}")
断点续训
训练过程中可能中断,需要支持断点续训:
import os

# Path where training checkpoints are written and read back.
checkpoint_path = 'checkpoint.pth'
start_epoch = 0

# Resume from a previous checkpoint if one exists.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoints from a trusted source (newer PyTorch supports weights_only=True).
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # 'epoch' was saved as epoch + 1, so training resumes at the next epoch.
    start_epoch = checkpoint['epoch']
    print(f"从第 {start_epoch} 个epoch继续训练")

for epoch in range(start_epoch, num_epochs):
    # ... training step goes here ...
    # Save a checkpoint every 10 epochs.
    if (epoch + 1) % 10 == 0:
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # NOTE(review): relies on `loss` being bound by the training step above.
            'loss': loss,
        }
        torch.save(checkpoint, checkpoint_path)
        print(f"保存检查点: epoch {epoch+1}")
训练技巧
学习率选择
# Approach 1: use a learning-rate finder.
# Approach 2: start with a larger LR for fast convergence, then anneal it.
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,
    epochs=50,
    steps_per_epoch=len(train_loader),
)
# OneCycleLR is stepped after every batch, not once per epoch.
for inputs, targets in train_loader:
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.step()
    scheduler.step()  # advance the LR schedule
梯度累积
当 GPU 显存不足时,可以使用梯度累积:
# Gradient accumulation: simulate a batch `accumulation_steps` times larger
# than what fits in GPU memory.
accumulation_steps = 4  # accumulate gradients over 4 batches

optimizer.zero_grad()
for i, (batch_x, batch_y) in enumerate(train_loader):
    outputs = model(batch_x)
    loss = criterion(outputs, batch_y)
    # Scale the loss so the accumulated gradient matches a large-batch average.
    loss = loss / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Flush leftover gradients when the number of batches is not a multiple of
# accumulation_steps — otherwise the trailing batches are silently dropped.
if len(train_loader) % accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad()
混合精度训练
使用 FP16 加速训练:
from torch.cuda.amp import autocast, GradScaler
# NOTE(review): on PyTorch >= 2.4 prefer torch.amp.autocast('cuda') and
# torch.amp.GradScaler('cuda'); the torch.cuda.amp aliases are deprecated.

# Scales the loss to avoid FP16 gradient underflow.
scaler = GradScaler()

for batch_x, batch_y in train_loader:
    # Move the batch to the GPU (assumes `model` is already on the GPU — confirm).
    batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
    optimizer.zero_grad()
    # Run forward pass + loss under automatic mixed precision.
    with autocast():
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
    # Scale the loss before backward; scaler.step() unscales and skips the
    # update if inf/NaN gradients are detected.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
早停法
防止过拟合:
patience = 10  # max number of epochs to wait without improvement
best_loss = float('inf')
patience_counter = 0
for epoch in range(num_epochs):
    # ... training step goes here ...
    val_loss = evaluate(model, val_loader, criterion)
    if val_loss < best_loss:
        # Improvement: reset the counter and snapshot the weights.
        best_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"早停!连续{patience}个epoch没有改善")
            break
下一步
现在我们已经掌握了模型训练的基本流程。下一章我们将学习卷积神经网络(CNN),这是计算机视觉领域的核心技术。