Model Training
This chapter covers how to train models with the Transformers Trainer API, including training configuration, data preparation, and callbacks.
Trainer Overview
Trainer is the high-level training API provided by Transformers. It encapsulates the training loop, evaluation, logging, checkpointing, and other boilerplate.
┌─────────────────────────────────────────────────────────────┐
│                    Trainer Architecture                     │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌─────────────┐    ┌─────────────┐    ┌─────────────┐      │
│  │   Dataset   │───▶│ DataLoader  │───▶│    Model    │      │
│  └─────────────┘    └─────────────┘    └──────┬──────┘      │
│                                               │             │
│  ┌─────────────┐    ┌──────────────┐          ▼             │
│  │   Metrics   │◀───│   Trainer    │◀─── training loop      │
│  └─────────────┘    │              │     (forward /         │
│                     │ • training   │      backward /        │
│  ┌─────────────┐    │   config     │      optimizer.step)   │
│  │  Callbacks  │◀──▶│ • optimizer  │                        │
│  └─────────────┘    │ • LR schedule│                        │
│                     │ • logging    │                        │
│                     │ • checkpoints│                        │
│                     └──────────────┘                        │
│                                                             │
└─────────────────────────────────────────────────────────────┘
Basic Training Workflow
Preparing the Data
from datasets import load_dataset
from transformers import AutoTokenizer
# Load the dataset
dataset = load_dataset("imdb")
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Preprocessing function
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # fixed-length padding so the default collator can stack tensors; dynamic padding via DataCollatorWithPadding is shown later
        max_length=512
    )
# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Set the format
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
# Create train and eval subsets
train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(1000))
eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(500))
Defining Training Arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",              # output directory
    evaluation_strategy="epoch",         # evaluation strategy (renamed to eval_strategy in recent transformers releases)
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=8,       # training batch size per device
    per_device_eval_batch_size=8,        # eval batch size per device
    num_train_epochs=3,                  # number of training epochs
    weight_decay=0.01,                   # weight decay
    save_strategy="epoch",               # save strategy
    load_best_model_at_end=True,         # load the best model when training ends
    logging_dir="./logs",                # log directory
    logging_steps=10,                    # log every N steps
    fp16=True,                           # mixed-precision training
)
Creating the Trainer and Training
from transformers import AutoModelForSequenceClassification, Trainer
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
"bert-base-uncased",
num_labels=2
)
# Define the evaluation metric
import numpy as np
import evaluate  # load_metric was removed from datasets; the evaluate library replaces it

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
# Create the Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
)
# Start training
trainer.train()
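Once training finishes, the same Trainer instance can score the eval set and run batch inference; evaluate() and predict() are standard Trainer methods:

# Evaluate on the eval set; returns a dict of metrics
metrics = trainer.evaluate()
print(metrics)

# Batch inference; returns predictions, label_ids, and metrics
output = trainer.predict(eval_dataset)
print(output.predictions.shape)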
TrainingArguments in Detail
Basic Configuration
from transformers import TrainingArguments
training_args = TrainingArguments(
    # Output and saving
    output_dir="./results",              # model output directory
    overwrite_output_dir=True,           # overwrite the output directory
    save_strategy="steps",               # save strategy: "no" / "steps" / "epoch"
    save_steps=500,                      # save every N steps
    save_total_limit=3,                  # maximum number of checkpoints to keep

    # Training
    num_train_epochs=3,                  # number of training epochs
    max_steps=-1,                        # maximum training steps (-1 = not limited)
    per_device_train_batch_size=8,       # training batch size per device
    gradient_accumulation_steps=4,       # gradient accumulation steps

    # Optimizer
    learning_rate=5e-5,                  # learning rate
    weight_decay=0.01,                   # weight decay
    adam_beta1=0.9,                      # Adam beta1
    adam_beta2=0.999,                    # Adam beta2
    adam_epsilon=1e-8,                   # Adam epsilon
    max_grad_norm=1.0,                   # gradient clipping

    # Learning-rate schedule
    lr_scheduler_type="linear",          # scheduler type
    warmup_ratio=0.1,                    # warmup ratio
    warmup_steps=0,                      # warmup steps (takes precedence over warmup_ratio when > 0)
)
Evaluation Configuration
training_args = TrainingArguments(
    # Evaluation
    evaluation_strategy="steps",       # evaluation strategy
    eval_steps=100,                    # evaluate every N steps
    per_device_eval_batch_size=8,      # eval batch size per device

    # Best-model loading
    load_best_model_at_end=True,       # load the best model when training ends
    metric_for_best_model="accuracy",  # metric used to pick the best model
    greater_is_better=True,            # larger metric values are better
)
Logging and Monitoring
training_args = TrainingArguments(
    # Logging
    logging_dir="./logs",         # log directory
    logging_strategy="steps",     # logging strategy
    logging_steps=10,             # log every N steps
    report_to=["tensorboard"],    # reporting targets: "tensorboard" / "wandb" / "mlflow"

    # Progress bar
    disable_tqdm=False,           # whether to disable the progress bar
)
Performance Optimization
training_args = TrainingArguments(
    # Mixed precision
    fp16=True,                        # FP16 training
    bf16=False,                       # BF16 training

    # Data loading
    dataloader_num_workers=4,         # number of dataloader worker processes
    dataloader_pin_memory=True,       # pin memory

    # Optimizations
    group_by_length=True,             # group samples of similar length (speeds up training)
    length_column_name="length",      # name of the length column

    # Distributed training
    local_rank=-1,                    # local rank
    deepspeed=None,                   # DeepSpeed config
)
Customizing the Trainer
Subclassing Trainer
import torch
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Custom loss computation (class-weighted cross-entropy)."""
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Custom loss function: weight the positive class 2x
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=torch.tensor([1.0, 2.0]).to(logits.device)
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    def training_step(self, model, inputs, num_items_in_batch=None):
        """Custom training step (simplified: no AMP or accelerator handling)."""
        model.train()
        inputs = self._prepare_inputs(inputs)
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)
        if self.args.n_gpu > 1:
            loss = loss.mean()  # average across GPUs under DataParallel
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps
        loss.backward()
        return loss.detach()
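The subclass is a drop-in replacement for the stock Trainer; a minimal usage sketch, reusing training_args and the datasets from earlier:

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()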
Custom Data Collators
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling
# Dynamic padding (recommended)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,               # masked language modeling
    mlm_probability=0.15    # masking probability
)
# Use it in the Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
data_collator=data_collator,
)
Callbacks
Built-in Callbacks
from transformers import (
EarlyStoppingCallback,
PrinterCallback,
ProgressCallback,
TensorBoardCallback,
WandbCallback,
)
# Early-stopping callback. Requires load_best_model_at_end=True,
# metric_for_best_model, and a periodic evaluation strategy in TrainingArguments.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,       # evaluations to wait without improvement
    early_stopping_threshold=0.001   # minimum improvement to count as progress
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
callbacks=[early_stopping],
)
Custom Callbacks
from transformers import TrainerCallback
class CustomCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.best_metric = 0

    def on_init_end(self, args, state, control, **kwargs):
        """Called when Trainer initialization ends."""
        print("Trainer initialization finished")

    def on_train_begin(self, args, state, control, **kwargs):
        """Called at the start of training."""
        print(f"Training started, total steps: {state.max_steps}")

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Called at the start of each epoch."""
        print(f"Starting epoch {state.epoch}")

    def on_step_end(self, args, state, control, **kwargs):
        """Called at the end of each step."""
        if state.global_step % 100 == 0:
            print(f"Step {state.global_step}/{state.max_steps}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Called after each evaluation."""
        current_metric = metrics.get("eval_accuracy", 0)
        if current_metric > self.best_metric:
            self.best_metric = current_metric
            print(f"New best model! Accuracy: {current_metric:.4f}")

    def on_train_end(self, args, state, control, **kwargs):
        """Called when training ends."""
        print("Training finished!")
# Use the custom callback
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
callbacks=[CustomCallback()],
)
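Callbacks can also steer the run through the control object (a TrainerControl); a minimal sketch that stops training after an arbitrary step budget of 1,000:

from transformers import TrainerCallback

class StopAfterStepsCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        # Ask the Trainer to stop once the step budget is exhausted
        if state.global_step >= 1000:
            control.should_training_stop = True
        return control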
Training Tips
Gradient Accumulation
training_args = TrainingArguments(
    per_device_train_batch_size=2,   # small per-device batch
    gradient_accumulation_steps=8,   # accumulate gradients over 8 steps
    # effective batch size = 2 * 8 = 16
)
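A quick back-of-the-envelope sketch of how accumulation changes the optimizer-step count (the dataset size and device count below are made-up numbers):

# Hypothetical numbers for illustration only
num_examples = 25_000
per_device_batch = 2
accumulation_steps = 8
num_devices = 1

effective_batch = per_device_batch * accumulation_steps * num_devices  # 16
optimizer_steps_per_epoch = num_examples // effective_batch            # 1562
print(effective_batch, optimizer_steps_per_epoch)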
Learning-Rate Warmup
training_args = TrainingArguments(
    warmup_ratio=0.1,    # use 10% of total steps for warmup
    # or
    warmup_steps=500,    # warm up for 500 steps
)
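Internally, the linear schedule with warmup corresponds to transformers' get_linear_schedule_with_warmup; a standalone sketch of the schedule's shape (the tiny model and step counts are placeholders):

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,     # LR ramps linearly from 0 up to 5e-5
    num_training_steps=5000,  # then decays linearly back to 0
)
for _ in range(3):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())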
Mixed-Precision Training
training_args = TrainingArguments(
    fp16=True,   # NVIDIA GPUs
    # or
    bf16=True,   # Ampere or newer GPUs (A100, RTX 30 series)
)
Gradient Checkpointing
# Saves GPU memory at the cost of extra compute
model.gradient_checkpointing_enable()
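The same behavior can be requested through TrainingArguments, in which case the Trainer enables checkpointing on the model for you:

training_args = TrainingArguments(
    output_dir="./results",
    gradient_checkpointing=True,  # Trainer calls gradient_checkpointing_enable() internally
)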
Complete Training Examples
Text Classification
import torch
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# 1. Load data and model
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
"bert-base-uncased",
num_labels=2
)
# 2. Preprocess the data
def preprocess(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False,   # dynamic padding is handled by DataCollatorWithPadding below
        max_length=512
    )
tokenized_dataset = dataset.map(preprocess, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# 3. Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# 4. Evaluation metrics
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
precision, recall, f1, _ = precision_recall_fscore_support(
labels, preds, average='binary'
)
acc = accuracy_score(labels, preds)
return {
'accuracy': acc,
'f1': f1,
'precision': precision,
'recall': recall
}
# 5. Training arguments
training_args = TrainingArguments(
output_dir="./imdb-classifier",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="f1",
fp16=True,
logging_steps=100,
)
# 6. Create the Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(2000)),
eval_dataset=tokenized_dataset["test"].shuffle(seed=42).select(range(500)),
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
# 7. Train
trainer.train()
# 8. Evaluate
results = trainer.evaluate()
print(results)
# 9. Save the model
trainer.save_model("./imdb-classifier-best")
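A quick way to sanity-check the saved classifier is to load it into a pipeline (the directory comes from the save step above; labels appear as LABEL_0/LABEL_1 unless id2label is set on the config):

from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="./imdb-classifier-best")
print(classifier("This movie was absolutely wonderful!"))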
Named Entity Recognition (NER)
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer,
DataCollatorForTokenClassification,
)
import numpy as np
# Load the data
dataset = load_dataset("conll2003")
label_list = dataset["train"].features["ner_tags"].feature.names
# Load the model and tokenizer
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
model_name,
num_labels=len(label_list)
)
# Preprocessing
def tokenize_and_align_labels(examples):
tokenized_inputs = tokenizer(
examples["tokens"],
truncation=True,
is_split_into_words=True
)
labels = []
for i, label in enumerate(examples["ner_tags"]):
word_ids = tokenized_inputs.word_ids(batch_index=i)
label_ids = []
previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)             # special tokens: ignored by the loss
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # first subword carries the word's label
            else:
                label_ids.append(-100)             # remaining subwords: ignored
            previous_word_idx = word_idx
labels.append(label_ids)
tokenized_inputs["labels"] = labels
return tokenized_inputs
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Evaluation metrics (seqeval computes entity-level F1)
from seqeval.metrics import f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Drop ignored positions (-100) and map ids back to label names
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {"f1": f1_score(true_labels, true_predictions)}
# Train
training_args = TrainingArguments(
output_dir="./ner-model",
evaluation_strategy="epoch",
learning_rate=5e-5,
per_device_train_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
trainer.train()
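To run the fine-tuned tagger on raw text, one option is a token-classification pipeline; a sketch that first maps label ids back to the conll2003 tag names so the output is readable:

from transformers import pipeline

# Map label ids back to conll2003 tag names for readable output
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {l: i for i, l in enumerate(label_list)}

ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge subword pieces into whole entities
)
print(ner("Hugging Face is based in New York City."))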
Saving and Loading Models
Saving a Model
# Save the model and tokenizer
trainer.save_model("./my-model")
# Save the trainer state (for resuming training)
trainer.save_state()
# Save only the model weights
model.save_pretrained("./my-model-weights")
tokenizer.save_pretrained("./my-model-weights")
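Models and tokenizers can also be shared on the Hugging Face Hub (this assumes you are logged in, e.g. via huggingface-cli login; the repo name is a placeholder):

model.push_to_hub("your-username/my-model")
tokenizer.push_to_hub("your-username/my-model")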
Loading a Model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the saved model
model = AutoModelForSequenceClassification.from_pretrained("./my-model")
tokenizer = AutoTokenizer.from_pretrained("./my-model")
# Resume training
trainer = Trainer(model=model, ...)
trainer.train(resume_from_checkpoint="./my-model/checkpoint-1000")
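Passing resume_from_checkpoint=True instead of an explicit path makes the Trainer resume from the most recent checkpoint found in output_dir:

trainer.train(resume_from_checkpoint=True)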
Distributed Training
Using torchrun
# Single node, multiple GPUs
torchrun --nproc_per_node=4 train.py
# Multiple nodes, multiple GPUs
torchrun \
--nnodes=2 \
--node_rank=0 \
--master_addr="192.168.1.1" \
--master_port=12345 \
--nproc_per_node=4 \
train.py
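No changes to the training script are needed: Trainer reads the distributed environment variables that torchrun sets (RANK, LOCAL_RANK, WORLD_SIZE), and per_device_train_batch_size then applies to each GPU. If you prefer Accelerate's launcher, an equivalent single-node command (assuming 4 GPUs) would be:

accelerate launch --num_processes=4 train.py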
DeepSpeed Integration
# deepspeed_config.json
{
"fp16": {
"enabled": true
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu"
}
},
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}
# Training arguments
training_args = TrainingArguments(
deepspeed="./deepspeed_config.json",
# ...
)
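The script is then started with the deepspeed launcher rather than plain python (4 GPUs here is an arbitrary choice):

deepspeed --num_gpus=4 train.py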
Next Steps
Having mastered model training, you can move on to: