
Model Training

This chapter walks through the model-training workflow in PyTorch: loss functions, optimizers, learning-rate scheduling, the training loop, and model evaluation.

Training Workflow Overview

The standard PyTorch training workflow looks like this:

┌──────────────────────────────────────────┐
│            Training Workflow             │
├──────────────────────────────────────────┤
│ 1. Prepare data   → DataLoader           │
│        ↓                                 │
│ 2. Create model   → nn.Module            │
│        ↓                                 │
│ 3. Loss function  → nn.*Loss             │
│        ↓                                 │
│ 4. Optimizer      → optim.Optimizer      │
│        ↓                                 │
│ 5. Training loop:                        │
│    for epoch in range(epochs):           │
│        for batch in dataloader:          │
│            ① forward pass                │
│            ② compute loss                │
│            ③ zero gradients              │
│            ④ backward pass               │
│            ⑤ optimizer step              │
│        ↓                                 │
│ 6. Evaluate model                        │
└──────────────────────────────────────────┘

Loss Functions

A loss function measures the discrepancy between the model's predictions and the ground-truth values.

Common Loss Functions

import torch
import torch.nn as nn

# Cross-entropy loss (multi-class classification)
criterion = nn.CrossEntropyLoss()
output = torch.randn(32, 10)                 # batch_size=32, 10 classes (raw logits)
target = torch.randint(0, 10, (32,))         # ground-truth labels
loss = criterion(output, target)

# Binary cross-entropy loss
criterion = nn.BCELoss()
output = torch.sigmoid(torch.randn(32, 1))   # inputs must go through sigmoid first
target = torch.randint(0, 2, (32, 1)).float()
loss = criterion(output, target)

# Weighted cross-entropy loss (useful for class imbalance)
weights = torch.tensor([1.0, 2.0, 3.0])      # one weight per class
criterion = nn.CrossEntropyLoss(weight=weights)

# Mean squared error loss (regression)
criterion = nn.MSELoss()
output = torch.randn(32, 1)
target = torch.randn(32, 1)
loss = criterion(output, target)

# L1 loss (regression)
criterion = nn.L1Loss()

# Smooth L1 loss (Huber-style loss, regression)
criterion = nn.SmoothL1Loss()

# Negative log-likelihood loss (expects log-probabilities, e.g. from LogSoftmax)
criterion = nn.NLLLoss()
log_probs = torch.log_softmax(torch.randn(32, 10), dim=1)  # raw randn is not valid log-probs
target = torch.randint(0, 10, (32,))
loss = criterion(log_probs, target)

Loss Function Selection Guide

Task type              Recommended loss          Notes
Multi-class            CrossEntropyLoss          Most common choice
Binary                 BCEWithLogitsLoss         Sigmoid built in
Regression             MSELoss / SmoothL1Loss    Continuous-value prediction
Sequence-to-sequence   CTCLoss                   e.g. speech recognition
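
The guide recommends BCEWithLogitsLoss over the plain BCELoss used above: it fuses the sigmoid into the loss, which is more numerically stable. A minimal sketch:

# BCEWithLogitsLoss takes raw logits -- no explicit sigmoid needed
criterion = nn.BCEWithLogitsLoss()
logits = torch.randn(32, 1)                    # raw model outputs
target = torch.randint(0, 2, (32, 1)).float()
loss = criterion(logits, target)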

Optimizers

The optimizer updates the model's parameters based on their gradients.

Common Optimizers

import torch.optim as optim

# SGD (stochastic gradient descent)
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,             # learning rate
    momentum=0.9,        # momentum
    weight_decay=1e-4    # L2 regularization
)

# Adam (adaptive learning rates, a good default)
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),  # exponential decay rates for the moment estimates
    eps=1e-8,            # numerical stability term
    weight_decay=1e-4
)

# AdamW (Adam with decoupled weight decay, recommended for transformers)
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01
)

# RMSprop
optimizer = optim.RMSprop(
    model.parameters(),
    lr=0.01,
    alpha=0.99,
    momentum=0.9
)

# Adamax
optimizer = optim.Adamax(
    model.parameters(),
    lr=0.002,
    weight_decay=1e-4
)

Optimizer Comparison

Optimizer  Strengths                                 Weaknesses                      Typical use
SGD        Stable; solid theoretical grounding       Slow convergence; needs tuning  Research baselines
Adam       Fast convergence; adaptive                May generalize worse            General purpose
AdamW      Fast convergence; better generalization   -                               Transformers
RMSprop    Well suited to recurrent networks         -                               RNNs
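
All of these optimizers also accept parameter groups, which assign different hyperparameters to different parts of the model (a common trick when fine-tuning). A minimal sketch, with layer names borrowed from the SimpleNet defined later in this chapter:

# Per-layer learning rates via parameter groups
optimizer = optim.SGD([
    {'params': model.fc1.parameters(), 'lr': 0.001},  # smaller LR for early layers
    {'params': model.fc2.parameters(), 'lr': 0.001},
    {'params': model.fc3.parameters()},               # falls back to the default lr below
], lr=0.01, momentum=0.9)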

Learning-Rate Scheduling

Learning-rate schedulers adjust the learning rate over the course of training, which often improves final results.

Common Schedulers

import torch.optim.lr_scheduler as lr_scheduler

# Step decay (multiply the LR by gamma every N epochs)
scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=10,   # adjust every 10 epochs
    gamma=0.1       # multiply the learning rate by 0.1
)

# Multi-step decay (decay at the given epochs)
scheduler = lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[30, 60, 90],  # decay at epochs 30, 60, and 90
    gamma=0.1
)

# Exponential decay
scheduler = lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.95      # multiply by 0.95 every epoch
)

# Cosine annealing
scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=50,       # number of iterations per half-cycle
    eta_min=1e-6    # minimum learning rate
)

# Cosine annealing with warm restarts
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,         # length of the first cycle
    T_mult=2        # cycle-length growth factor
)

# Warmup + cosine-style decay: OneCycleLR, also built into
# torch.optim.lr_scheduler (see the training-tips section below)
# scheduler = lr_scheduler.OneCycleLR(...)
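
Another widely used scheduler is ReduceLROnPlateau, which cuts the learning rate when a monitored metric stops improving. Unlike the schedulers above, its step() takes the metric value:

# Reduce the LR when the validation loss plateaus
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',     # the monitored metric should be decreasing
    factor=0.1,     # multiply the LR by 0.1 on a plateau
    patience=5      # wait 5 epochs without improvement first
)

# Call once per epoch, passing the metric:
# scheduler.step(val_loss)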

Complete Training Example

import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

# 1. Prepare the data
x_train = torch.randn(1000, 20)
y_train = torch.randn(1000, 1)
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 2. Create the model
class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(20, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = SimpleNet()

# 3. Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# 4. Training loop
num_epochs = 50

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0

    for batch_x, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping (guards against exploding gradients)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        train_loss += loss.item()

    # Update the learning rate
    scheduler.step()

    # Logging
    avg_loss = train_loss / len(train_loader)
    current_lr = optimizer.param_groups[0]['lr']

    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, LR: {current_lr:.6f}")

print("Training complete!")

Model Validation

# Create a validation set
x_val = torch.randn(200, 20)
y_val = torch.randn(200, 1)
val_dataset = TensorDataset(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for batch_x, batch_y in loader:
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()

    # For classification tasks, also track accuracy (see the variant below)

    return total_loss / len(loader)
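
For a classification task, the same loop also tracks accuracy. A sketch, assuming the model outputs one logit per class and batch_y holds integer class labels:

def evaluate_classification(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_x, batch_y in loader:
            outputs = model(batch_x)              # shape: (batch, num_classes)
            total_loss += criterion(outputs, batch_y).item()

            _, predicted = torch.max(outputs, 1)  # index of the highest logit
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    return total_loss / len(loader), correct / total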

# Full train + validation loop
best_val_loss = float('inf')

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    val_loss = evaluate(model, val_loader, criterion)

    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Saved best model, validation loss: {val_loss:.4f}")

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Train Loss={train_loss/len(train_loader):.4f}, Val Loss={val_loss:.4f}")

Resuming from Checkpoints

Training can be interrupted, so it is worth supporting resumption from a saved checkpoint:

import os

checkpoint_path = 'checkpoint.pth'
start_epoch = 0

# Check for an existing checkpoint
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f"Resuming training from epoch {start_epoch}")

for epoch in range(start_epoch, num_epochs):
    # Training...

    # Save a checkpoint every 10 epochs
    if (epoch + 1) % 10 == 0:
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.item(),   # store a plain float rather than a tensor
        }
        torch.save(checkpoint, checkpoint_path)
        print(f"Saved checkpoint: epoch {epoch+1}")

Training Tips

Choosing a Learning Rate

# Method 1: use a learning-rate finder (sketched after this block)
# Method 2: start with a larger learning rate for fast convergence, then reduce it
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,
    epochs=50,
    steps_per_epoch=len(train_loader)
)

# OneCycleLR is stepped after every batch, not every epoch
for batch_x, batch_y in train_loader:
    optimizer.zero_grad()
    outputs = model(batch_x)
    loss = criterion(outputs, batch_y)
    loss.backward()
    optimizer.step()
    scheduler.step()   # update the learning rate
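
Method 1, the learning-rate finder, is easy to sketch by hand: sweep the learning rate exponentially over one pass of the data, record the loss at each step, and pick a rate just below where the loss starts to blow up. The helper below is illustrative, not a library API, and it perturbs the model's weights, so run it on a throwaway copy of the model:

def find_lr(model, loader, criterion, lr_start=1e-7, lr_end=1.0):
    """Exponential LR sweep; returns (lrs, losses) for plotting."""
    optimizer = optim.SGD(model.parameters(), lr=lr_start)
    gamma = (lr_end / lr_start) ** (1 / max(len(loader) - 1, 1))

    lrs, losses = [], []
    lr = lr_start
    for batch_x, batch_y in loader:
        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()

        lrs.append(lr)
        losses.append(loss.item())
        if losses[-1] > 4 * losses[0]:   # stop once the loss diverges
            break

        lr *= gamma                      # grow the LR exponentially
        for group in optimizer.param_groups:
            group['lr'] = lr

    return lrs, losses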

Gradient Accumulation

When GPU memory is tight, gradient accumulation can simulate a larger batch by stepping the optimizer only every few batches (the effective batch size becomes batch_size × accumulation_steps):

accumulation_steps = 4   # accumulate gradients over 4 batches
optimizer.zero_grad()

for i, (batch_x, batch_y) in enumerate(train_loader):
    outputs = model(batch_x)
    loss = criterion(outputs, batch_y)
    loss = loss / accumulation_steps   # scale the loss so gradients average out
    loss.backward()

    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
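
One subtlety: if the number of batches is not a multiple of accumulation_steps, the loop above silently drops the gradients from the final partial group. A small guard after the loop flushes them:

# Flush leftover gradients from a partial final accumulation group
if len(train_loader) % accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad()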

Mixed-Precision Training

Use FP16 to speed up training:

from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

# (assumes the model is already on the GPU, e.g. model = model.cuda())
for batch_x, batch_y in train_loader:
    batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

    optimizer.zero_grad()

    # Automatic mixed precision for the forward pass
    with autocast():
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

    # Gradient scaling keeps small FP16 gradients from underflowing
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
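
On recent PyTorch releases the torch.cuda.amp entry points are deprecated in favor of the device-agnostic torch.amp namespace; the pattern is otherwise identical (a sketch, assuming PyTorch 2.3 or newer):

# torch.amp is the device-agnostic replacement for torch.cuda.amp
from torch.amp import autocast, GradScaler

scaler = GradScaler('cuda')

with autocast('cuda'):
    outputs = model(batch_x)
    loss = criterion(outputs, batch_y)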

Early Stopping

To guard against overfitting:

patience = 10   # maximum number of epochs without improvement
best_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    # Training...
    val_loss = evaluate(model, val_loader, criterion)

    if val_loss < best_loss:
        best_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping: no improvement for {patience} epochs")
        break
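
Once early stopping fires, the weights in memory are from the last, non-improving epoch, so reload the best checkpoint before evaluating or deploying:

# Restore the best weights saved during training
model.load_state_dict(torch.load('best_model.pth'))
model.eval()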

Next Steps

We have now covered the basic model-training workflow. In the next chapter we turn to convolutional neural networks (CNNs), the core technique of computer vision.