神经网络 (Neural Network)
本章将介绍如何使用 PyTorch 的 nn.Module 构建神经网络,这是定义深度学习模型的标准方式。
nn.Module 基础
torch.nn.Module 是 PyTorch 中所有神经网络的基类。它提供了一种组织模型结构、管理参数和处理输入输出的标准方式。
创建一个简单的神经网络
import torch
import torch.nn as nn
# 定义神经网络
class SimpleNet(nn.Module):
    """A minimal fully connected network: 784 -> 256 -> ReLU -> 10 logits."""

    def __init__(self):
        # Initialize the nn.Module machinery before registering layers.
        super().__init__()
        self.fc1 = nn.Linear(784, 256)   # fully connected: 784 in, 256 out
        self.relu = nn.ReLU()            # non-linearity between the layers
        self.fc2 = nn.Linear(256, 10)    # output layer: one logit per class

    def forward(self, x):
        """Forward pass: linear -> ReLU -> linear (returns raw logits)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
# 创建模型实例
model = SimpleNet()
print(model)
# 测试模型
x = torch.randn(1, 784) # 模拟一个样本(28x28图像展平)
output = model(x)
print(f"输出形状: {output.shape}") # torch.Size([1, 10])
print(f"输出值: {output}")
nn.Module 的核心特性
- 参数管理:自动追踪所有可训练参数
- 前向传播:通过 forward() 方法定义
- 设备管理:自动处理 CPU/GPU 迁移
- 状态保存:支持模型保存和加载
# List every registered parameter tensor together with its dotted name.
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Tally parameter counts: all parameters vs. gradient-receiving ones only.
param_counts = [p.numel() for p in model.parameters()]
total_params = sum(param_counts)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"总参数: {total_params:,}")
print(f"可训练参数: {trainable_params:,}")
常用网络层
全连接层 (Linear)
全连接层(也称为线性层或密集层)是神经网络中最基本的层:
import torch.nn as nn

# nn.Linear(in_features, out_features, bias=True) computes y = x @ W^T + b.
linear = nn.Linear(100, 50)

# Shapes: input (batch_size, in_features) -> output (batch_size, out_features).
batch = torch.randn(32, 100)  # batch_size=32, each sample has 100 features
y = linear(batch)
print(f"输出形状: {y.shape}")  # torch.Size([32, 50])
参数解释:
- in_features:输入特征维度
- out_features:输出特征维度
- bias:是否添加偏置(默认为 True)
激活函数层
激活函数为神经网络引入非线性,使其能够学习复杂的模式。
import torch
import torch.nn as nn

# ReLU (Rectified Linear Unit) — the most widely used activation.
relu = nn.ReLU()
print(relu(torch.tensor([-1, 0, 1])))  # negatives clamped to 0

# LeakyReLU keeps a small negative slope, mitigating "dead" ReLU units.
leaky_relu = nn.LeakyReLU(negative_slope=0.01)

# Sigmoid squashes values into the open interval (0, 1).
sigmoid = nn.Sigmoid()

# Tanh squashes values into the open interval (-1, 1).
tanh = nn.Tanh()

# Softmax normalizes along the given dimension (multi-class outputs).
softmax = nn.Softmax(dim=1)

# LogSoftmax is log(Softmax(x)); pair it with NLLLoss.
log_softmax = nn.LogSoftmax(dim=1)
Dropout 层
Dropout 是一种正则化技术,随机将部分神经元输出置零,防止过拟合:
import torch.nn as nn

# Dropout zeroes each element independently with probability p and
# rescales survivors by 1/(1-p); it is a regularizer against overfitting.
dropout = nn.Dropout(p=0.5)  # p is the drop probability
x = torch.randn(1, 10)
y = dropout(x)
print(f"Dropout 前: {x}")
print(f"Dropout 后: {y}")

# Dropout is active only in training mode; in eval mode it is a no-op.
model = nn.Sequential(
    nn.Linear(10, 20),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(20, 5),
)
model.train()  # training mode: dropout enabled
model.eval()   # evaluation mode: dropout disabled
BatchNorm 层
BatchNorm(批归一化)通过对每批数据进行归一化,加速训练并提高稳定性:
import torch.nn as nn

# BatchNorm variants, chosen by input dimensionality:
bn1d = nn.BatchNorm1d(num_features=100)  # fully connected nets: (N, C)
bn2d = nn.BatchNorm2d(num_features=64)   # conv nets: (N, C, H, W)
bn3d = nn.BatchNorm3d(num_features=64)   # 3D conv nets: (N, C, D, H, W)

# After normalization every feature has ~zero mean and ~unit std per batch.
x = torch.randn(32, 100)  # batch=32, features=100
y = bn1d(x)
print(f"BatchNorm 后均值: {y.mean(dim=0).abs().max().item():.6f}")  # ~0
print(f"BatchNorm 后标准差: {y.std(dim=0).mean().item():.6f}")  # ~1
使用 nn.Sequential 快速构建
对于简单的顺序网络,可以使用 nn.Sequential:
import torch.nn as nn

# Option 1: pass layers positionally (they are auto-numbered 0, 1, 2, ...).
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 10),
)

# Option 2: an OrderedDict gives every layer a readable name.
from collections import OrderedDict
model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(784, 256)),
    ('relu1', nn.ReLU()),
    ('dropout', nn.Dropout(0.2)),
    ('fc2', nn.Linear(256, 10)),
]))

# Named layers become attributes of the Sequential container.
print(model.fc1)
print(model.relu1)
构建复杂网络
多层感知机 (MLP)
import torch
import torch.nn as nn
class MLP(nn.Module):
    """Multi-layer perceptron.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout; the final
    projection has no activation (the loss function handles that).
    """

    def __init__(self, input_size, hidden_sizes, output_size, dropout=0.2):
        super().__init__()
        blocks = []
        in_dim = input_size
        for out_dim in hidden_sizes:
            blocks += [
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_dim = out_dim
        # Output projection: raw logits, no activation.
        blocks.append(nn.Linear(in_dim, output_size))
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        return self.network(x)
# Build an MLP with three hidden layers and print its structure.
model = MLP(input_size=784, hidden_sizes=[512, 256, 128], output_size=10)
print(model)

# Smoke test with a batch of 64 fake samples.
batch = torch.randn(64, 784)
output = model(batch)
print(f"输出形状: {output.shape}")  # torch.Size([64, 10])
带残差连接的深度网络
残差连接(Residual Connection)可以帮助训练更深的网络:
import torch.nn as nn
class ResidualBlock(nn.Module):
    """ResNet basic block: two 3x3 convs with a skip connection.

    When stride != 1 or channel counts differ, the skip path uses a 1x1
    convolution + BatchNorm to project the input to the output shape.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Identity skip by default; 1x1 projection when the shape changes.
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        skip = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + skip  # residual addition
        return self.relu(out)
class ResNetLike(nn.Module):
    """Small ResNet-style classifier: conv stem, two residual stages, avgpool head."""

    def __init__(self, num_classes=10):
        super().__init__()
        # Stem: 3-channel input -> 64 feature maps, spatial size preserved.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        # Two residual stages; the second doubles channels and halves H/W.
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        # Global average pooling followed by a linear classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128, num_classes)

    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        # Only the first block may change channels/stride; the rest keep shape.
        blocks = [ResidualBlock(in_channels, out_channels, stride)]
        blocks.extend(
            ResidualBlock(out_channels, out_channels) for _ in range(num_blocks - 1)
        )
        return nn.Sequential(*blocks)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.layer2(self.layer1(x))
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # (N, 128, 1, 1) -> (N, 128)
        return self.fc(x)
前向传播与推理
基本推理流程
import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

# Switch to eval mode — important: changes Dropout/BatchNorm behavior.
model.eval()

x = torch.randn(1, 784)

# Option 1: call the module directly (preferred).
with torch.no_grad():  # skip autograd bookkeeping to save memory
    output = model(x)

# Option 2: call forward() explicitly (equivalent).
with torch.no_grad():
    output = model.forward(x)

# The predicted class is the index of the largest logit.
pred = output.argmax(dim=1)
print(f"预测类别: {pred.item()}")
GPU 推理
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# Model and inputs must live on the same device before the forward pass.
model = model.to(device)
x = x.to(device)

with torch.no_grad():
    output = model(x)
模型参数管理
访问参数
# Option 1: iterate over the raw parameter tensors.
for param in model.parameters():
    print(f"参数形状: {param.shape}")

# Option 2: named_parameters() pairs each tensor with its dotted path.
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Custom initialization, applied recursively to every submodule.
def initialize_weights(m):
    """Xavier-init Linear weights and zero their biases (if present)."""
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

model.apply(initialize_weights)
常见初始化方法
import torch.nn as nn
import torch.nn.init as init

# A concrete layer to initialize. (The original snippet applied init
# functions to an undefined `layer`, raising NameError when copied verbatim.)
layer = nn.Linear(128, 64)

# Xavier/Glorot initialization (keeps activation variance stable;
# classically derived for tanh/sigmoid activations).
init.xavier_uniform_(layer.weight)
init.xavier_normal_(layer.weight)

# Kaiming/He initialization (designed for ReLU/LeakyReLU).
init.kaiming_uniform_(layer.weight, nonlinearity='relu')
init.kaiming_normal_(layer.weight, nonlinearity='relu')

# Constant initialization.
init.zeros_(layer.weight)
init.ones_(layer.weight)
init.constant_(layer.weight, 0.5)

# Normal-distribution initialization.
init.normal_(layer.weight, mean=0, std=0.01)
完整示例:MNIST 分类器
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# 定义模型
class MNISTClassifier(nn.Module):
    """MLP classifier for MNIST: 784 -> 512 -> 256 -> 10 logits, with dropout."""

    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(784, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        # Accept (N, 1, 28, 28) images by flattening them to (N, 784).
        flat = x.view(-1, 784)
        return self.network(flat)
# Build the model.
model = MNISTClassifier()

# Loss and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fake dataset: 1000 random "images" with random labels in [0, 10).
x_train = torch.randn(1000, 1, 28, 28)
y_train = torch.randint(0, 10, (1000,))
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop.
model.train()
for epoch in range(5):
    total_loss = 0
    for batch_x, batch_y in train_loader:
        # Forward pass.
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# Evaluation on a handful of fresh samples.
model.eval()
with torch.no_grad():
    x_test = torch.randn(10, 1, 28, 28)
    outputs = model(x_test)
    _, predicted = torch.max(outputs, 1)
    print(f"预测结果: {predicted}")
下一步
神经网络模型需要数据来训练。下一章我们将学习如何使用 PyTorch 的 Dataset 和 DataLoader 高效加载和预处理数据。