卷积神经网络 (CNN)
卷积神经网络(Convolutional Neural Network,CNN)是深度学习中最成功的模型架构之一,特别适用于图像和视频等视觉数据的处理。本章将详细介绍 CNN 的核心概念和实现。
CNN 基础
什么是卷积?
卷积是一种数学运算,在图像处理中用于提取特征。简单来说,卷积核(kernel)在图像上滑动,计算局部区域的加权和。
卷积操作示意:
输入 (5x5)        卷积核 (3x3)      输出左上角元素的计算:
1 1 1 0 0         1 0 1             1*1 + 1*0 + 1*1
0 1 1 1 0    *    0 1 0           + 0*0 + 1*1 + 1*0
0 0 1 1 1         1 0 1           + 0*1 + 0*0 + 1*1 = 4
0 0 1 0 0
0 1 0 1 0         (输出尺寸为 3x3)
CNN 的核心思想
CNN 通过局部连接和权重共享来提取图像的空间层次特征:
- 局部连接:每个神经元只连接上一层的局部区域
- 权重共享:同一层的卷积核参数共享
- 平移不变性:特征的位置不影响检测
卷积层 (Conv2d)
基本使用
import torch
import torch.nn as nn
# nn.Conv2d parameters explained
conv = nn.Conv2d(
    in_channels=3,        # input channels (3 for an RGB image)
    out_channels=64,      # output channels (= number of kernels)
    kernel_size=3,        # kernel size (3x3)
    stride=1,             # step of the sliding window
    padding=0,            # pixels of padding added to each border
    padding_mode='zeros', # how padding values are filled
    bias=True             # whether to add a learnable bias
)
# Input shape: [batch_size, channels, height, width]
x = torch.randn(1, 3, 32, 32)
# Output shape: (32 - 3) / 1 + 1 = 30 per spatial dim
output = conv(x)
print(f"输出形状: {output.shape}") # torch.Size([1, 64, 30, 30])
填充 (Padding)
填充在输入边缘添加像素,控制输出大小:
# No padding: output shrinks by (kernel_size - 1)
conv_no_pad = nn.Conv2d(3, 64, kernel_size=3)
x = torch.randn(1, 3, 32, 32)
output = conv_no_pad(x)
print(f"无填充输出: {output.shape}") # [1, 64, 30, 30]
# padding=1 makes a 3x3 conv size-preserving
conv_pad = nn.Conv2d(3, 64, kernel_size=3, padding=1)
output = conv_pad(x)
print(f"填充1输出: {output.shape}") # [1, 64, 32, 32]
# Output size formula: out = (in - kernel + 2*padding) / stride + 1
步长 (Stride)
步长控制卷积核滑动的距离:
# stride=2 roughly halves the spatial size: floor((32-3)/2) + 1 = 15
conv_stride = nn.Conv2d(3, 64, kernel_size=3, stride=2)
output = conv_stride(x)
print(f"步长2输出: {output.shape}") # [1, 64, 15, 15]
# stride=3: floor((32-3)/3) + 1 = 10
conv_stride3 = nn.Conv2d(3, 64, kernel_size=3, stride=3)
output = conv_stride3(x)
print(f"步长3输出: {output.shape}") # [1, 64, 10, 10]
空洞卷积 (Dilated Conv)
空洞卷积增大感受野:
# Dilated convolution (dilation=2): kernel taps are spaced 2 pixels apart
conv_dilated = nn.Conv2d(3, 64, kernel_size=3, dilation=2)
output = conv_dilated(x)
print(f"空洞卷积输出: {output.shape}") # [1, 64, 28, 28]
# Receptive field = 1 + (kernel_size - 1) * dilation
# With kernel=3, dilation=2: receptive field = 1 + (3-1)*2 = 5
池化层 (Pooling)
池化层用于降维和特征聚合。
最大池化
# MaxPool2d: slides a window over each channel and keeps the maximum value.
maxpool = nn.MaxPool2d(
    kernel_size=2, # pooling window size
    stride=2,      # step size (defaults to kernel_size when omitted)
    padding=0      # padding added before pooling
)
x = torch.randn(1, 64, 28, 28)
output = maxpool(x)
print(f"最大池化输出: {output.shape}") # [1, 64, 14, 14]
# Adaptive pooling: you specify the OUTPUT size and PyTorch derives the window.
# Fix: the original bound both layers to the same name `adaptive_pool`, so the
# second assignment silently discarded the first layer; use distinct names.
adaptive_avg_pool = nn.AdaptiveAvgPool2d((1, 1)) # any HxW -> 1x1
adaptive_max_pool = nn.AdaptiveMaxPool2d((7, 7)) # any HxW -> 7x7
平均池化
# Average pooling: each 2x2 window is replaced by its mean value.
avgpool = nn.AvgPool2d(2, 2)
output = avgpool(x)
print(f"平均池化输出: {output.shape}") # [1, 64, 14, 14]
# Global average pooling: collapses each channel's spatial map to one scalar.
global_avgpool = nn.AdaptiveAvgPool2d((1, 1))
output = global_avgpool(x)
print(f"全局平均池化输出: {output.shape}") # [1, 64, 1, 1]
构建 CNN 模型
LeNet-5(经典网络)
import torch.nn as nn
class LeNet5(nn.Module):
    """LeNet-5: one of the earliest convolutional neural networks.

    Expects 28x28 input images (e.g. MNIST) and returns class logits.

    Args:
        num_classes: number of output classes.
        in_channels: number of input image channels. New parameter with
            default 1, so existing single-channel callers are unaffected.
    """

    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        # First conv block: 28x28 -> 24x24 (5x5 conv) -> 12x12 (2x2 pool)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, 6, kernel_size=5, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Second conv block: 12x12 -> 8x8 (5x5 conv) -> 4x4 (2x2 pool)
        self.conv2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Classifier head over the flattened 16x4x4 feature map.
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(16 * 4 * 4, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        """Map (N, in_channels, 28, 28) images to (N, num_classes) logits."""
        x = self.conv1(x)
        x = self.conv2(x)
        return self.fc(x)
# Smoke test: one MNIST-sized image through the network
model = LeNet5(num_classes=10)
x = torch.randn(1, 1, 28, 28) # (batch, channel, H, W)
output = model(x)
print(f"输出形状: {output.shape}") # torch.Size([1, 10])
VGGNet
class VGGNet(nn.Module):
    """Simplified VGG-16-style network: three conv stages + MLP classifier."""

    # Stage layout: integers are conv output channels, 'M' is a 2x2 max-pool.
    # Spatial size after the three pools: 224 -> 112 -> 56 -> 28.
    _CFG = (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M')

    def __init__(self, num_classes=1000):
        super(VGGNet, self).__init__()
        # Feature extractor, generated from _CFG (same layer order as VGG).
        self.features = self._make_features()
        # Pool any spatial size down to 7x7 before the classifier.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        # Three-layer MLP head with dropout regularisation.
        self.classifier = nn.Sequential(
            nn.Linear(256 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes),
        )

    @classmethod
    def _make_features(cls):
        """Build the conv stack from _CFG (conv3x3+ReLU per int, pool per 'M')."""
        layers = []
        channels = 3
        for item in cls._CFG:
            if item == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.append(nn.Conv2d(channels, item, kernel_size=3, padding=1))
                layers.append(nn.ReLU(inplace=True))
                channels = item
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map (N, 3, H, W) images to (N, num_classes) logits."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.classifier(x)
CNN 训练实战:MNIST 分类
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# 定义 CNN 模型
class CNN(nn.Module):
    """Two-stage convolutional network for 28x28 grayscale digits (MNIST)."""

    def __init__(self):
        super(CNN, self).__init__()
        # Stage 1: 1 -> 32 channels, 28x28 -> 14x14
        self.conv1 = self._stage(1, 32)
        # Stage 2: 32 -> 64 channels, 14x14 -> 7x7
        self.conv2 = self._stage(32, 64)
        # Head: flatten the 64x7x7 map, one hidden layer, 10 class logits.
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 10),
        )

    @staticmethod
    def _stage(c_in, c_out):
        """Two conv3x3+BN+ReLU layers, then 2x2 max-pool and 25% dropout."""
        return nn.Sequential(
            nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
            nn.Conv2d(c_out, c_out, kernel_size=3, padding=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),
        )

    def forward(self, x):
        """Map (N, 1, 28, 28) images to (N, 10) logits."""
        x = self.conv1(x)
        x = self.conv2(x)
        return self.fc(x)
# Data transforms: light augmentation for training, none for evaluation.
train_transform = transforms.Compose([
    transforms.RandomRotation(10),                            # rotate up to +/-10 degrees
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)), # shift up to 10% each way
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))                # standard MNIST mean/std
])
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
# Download/load MNIST (60k train / 10k test)
train_dataset = datasets.MNIST('./data', train=True, download=True,
                               transform=train_transform)
test_dataset = datasets.MNIST('./data', train=False, download=True,
                              transform=test_transform)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
# Create the model on GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN().to(device)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 训练
def train_epoch(model, loader, criterion, optimizer, device):
model.train()
total_loss = 0
correct = 0
total = 0
for images, labels in loader:
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
_, predicted = outputs.max(1)
total += labels.size(0)
correct += predicted.eq(labels).sum().item()
return total_loss / len(loader), 100. * correct / total
# 评估
def evaluate(model, loader, device):
    """Compute top-1 accuracy (in percent) of `model` over `loader`."""
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            preds = model(images).argmax(dim=1)
            n_seen += labels.size(0)
            n_correct += (preds == labels).sum().item()
    return 100. * n_correct / n_seen
# Training loop: one train pass + one test evaluation per epoch.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader,
                                        criterion, optimizer, device)
    test_acc = evaluate(model, test_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f" Test Acc: {test_acc:.2f}%")
经典 CNN 架构
ResNet(残差网络)
ResNet 通过残差连接解决了深层网络梯度消失的问题:
class ResidualBlock(nn.Module):
    """Basic two-conv residual block: out = relu(F(x) + shortcut(x))."""

    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        # Residual branch: conv3x3 (may downsample) -> BN -> conv3x3 -> BN.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Shortcut: identity when shapes already match, otherwise a 1x1
        # conv (+BN) projects the input to the residual branch's shape.
        if stride == 1 and in_channels == out_channels:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        residual = torch.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))
        identity = self.shortcut(x)  # residual connection
        return torch.relu(residual + identity)
使用预训练模型
import torchvision.models as models
# Load an ImageNet-pretrained ResNet-18.
# NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13 in
# favour of `weights=models.ResNet18_Weights.DEFAULT` — confirm the
# installed torchvision version before updating this call.
model = models.resnet18(pretrained=True)
# Replace the final fully connected layer for a custom 10-class dataset.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10) # 10-class classification
# Freeze the whole backbone, then re-enable only the new classifier head.
# Order matters: the first loop also freezes the fresh fc parameters,
# and the second loop switches them back on.
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True
下一步
现在你已经掌握了 CNN 的核心概念和实现。模型训练完成后,需要保存和加载模型。下一章(速查表)将包含常用的 PyTorch 操作回顾。