微调实战
本章将通过实际案例展示如何对预训练模型进行微调,包括文本分类、命名实体识别、问答系统等常见任务的完整流程。
微调概述
什么是微调?
微调(Fine-tuning)是在预训练模型的基础上,使用特定任务的数据继续训练,使模型适应下游任务的过程。
预训练模型 微调后模型
┌─────────────┐ ┌─────────────┐
│ 通用知识 │ + 微调 → │ 任务专用 │
│ 语言理解 │ 数据 │ 能力增强 │
└─────────────┘ └─────────────┘
优势:
- 利用预训练知识,减少训练数据需求
- 训练速度更快,效果更好
- 泛化能力更强
微调策略
| 策略 | 说明 | 适用场景 |
|---|---|---|
| 全参数微调 | 训练所有参数 | 数据充足,计算资源充足 |
| 冻结部分层 | 只训练顶层 | 数据较少,防止过拟合 |
| LoRA | 低秩适配 | 大模型微调,节省显存 |
| Prompt Tuning | 只训练提示嵌入 | 超大模型,极少数据 |
案例1:文本分类
任务描述
对电影评论进行情感分类(正面/负面)。
完整代码
#!/usr/bin/env python3
"""
文本分类微调示例 - IMDB 情感分析
"""
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
DataCollatorWithPadding,
EarlyStoppingCallback,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# 1. Load the IMDB movie-review dataset (binary sentiment labels).
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")
# Print split sizes and one raw sample as a sanity check.
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")
print(f"Example: {dataset['train'][0]}")
# 2. Load the pretrained model and tokenizer.
model_name = "distilbert-base-uncased"
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 attaches a freshly initialized 2-way classification head;
# id2label/label2id make pipeline predictions human-readable.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)
def preprocess_function(examples):
    """Tokenize a batch of review texts.

    Truncates each text to 512 tokens and skips padding here so the
    data collator can pad each batch dynamically instead.
    """
    tokenize_kwargs = {
        "truncation": True,
        "max_length": 512,
        "padding": False,  # defer padding to DataCollatorWithPadding
    }
    return tokenizer(examples["text"], **tokenize_kwargs)
print("Preprocessing data...")
# Tokenize in batches; drop the raw "text" column since the model only needs token ids.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"]
)
# Trainer expects the target column to be named "labels".
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
# 4. 评估指标
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for binary classification."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    metrics = {"accuracy": accuracy_score(labels, preds)}
    metrics.update({"precision": precision, "recall": recall, "f1": f1})
    return metrics
# 5. Training configuration.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./imdb_classifier",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,              # typical full fine-tuning LR for BERT-family models
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,                # linear LR warmup over the first 10% of steps
    save_strategy="steps",           # must match eval strategy for load_best_model_at_end
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=100,
    fp16=True,                       # mixed precision; requires a CUDA GPU
    report_to="tensorboard",
)
# 6. Build the Trainer on a 10k/2k subsample for a quick demonstration run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(10000)),
    eval_dataset=tokenized_dataset["test"].shuffle(seed=42).select(range(2000)),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),  # pads each batch to its longest sample
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # stop after 3 evals with no F1 gain
)
# 7. Train.
print("Starting training...")
trainer.train()
# 8. Evaluate on the held-out eval subset.
print("Evaluating...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
# 9. Save model + tokenizer for later inference.
trainer.save_model("./imdb_classifier_final")
print("Model saved to ./imdb_classifier_final")
# 10. Quick inference smoke test with a pipeline.
from transformers import pipeline
classifier = pipeline(
    "sentiment-analysis",
    model="./imdb_classifier_final",
    tokenizer=tokenizer
)
test_texts = [
    "This movie was absolutely fantastic!",
    "I hated every minute of this film.",
    "It was okay, nothing special."
]
print("\nTest predictions:")
for text in test_texts:
    result = classifier(text)
    print(f"{text} -> {result[0]['label']} ({result[0]['score']:.4f})")
案例2:命名实体识别(NER)
任务描述
识别文本中的人名、地名、组织名等实体。
#!/usr/bin/env python3
"""
命名实体识别微调示例
"""
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer,
DataCollatorForTokenClassification,
)
import numpy as np
# 1. Load the CoNLL-2003 NER dataset (pre-split tokens + per-token NER tag ids).
dataset = load_dataset("conll2003")
# Map tag ids to their string names (O, B-PER, I-PER, B-ORG, ...).
label_list = dataset["train"].features["ner_tags"].feature.names
print(f"Labels: {label_list}")
# 2. Load model and tokenizer.
model_name = "bert-base-cased"  # NER typically uses a cased model: capitalization is an entity cue
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fresh token-classification head with one logit per NER tag.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list)
)
# 3. 数据预处理(处理子词对齐)
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align per-word NER tags to subword tokens.

    Only the first subword of each word keeps the word's tag; special tokens
    ([CLS]/[SEP]/[PAD]) and continuation subwords get -100 so the loss
    function ignores them.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    all_labels = []
    for batch_index, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        aligned = []
        previous = None
        for word_id in word_ids:
            if word_id is None or word_id == previous:
                # Special token, or a continuation subword of the same word.
                aligned.append(-100)
            else:
                # First subword of a new word: carry the word-level tag.
                aligned.append(word_tags[word_id])
            previous = word_id
        all_labels.append(aligned)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs
print("Preprocessing data...")
# Drop all original columns; the model only needs input_ids/attention_mask/labels.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)
# 4. 评估指标
def compute_metrics(eval_pred):
    """Token-level accuracy over non-ignored positions (simplified NER metric)."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)
    correct = 0
    total = 0
    for pred_row, label_row in zip(predictions, labels):
        for p, l in zip(pred_row, label_row):
            if l == -100:
                # Special tokens / continuation subwords are excluded from scoring.
                continue
            total += 1
            if label_list[p] == label_list[l]:
                correct += 1
    accuracy = correct / total if total > 0 else 0
    return {"accuracy": accuracy}
# 5. Training.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    # Pads input_ids and the labels column (with -100) to the batch max length.
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)
print("Starting training...")
trainer.train()
trainer.save_model("./ner_model_final")
# 6. Inference smoke test.
from transformers import pipeline
ner_pipeline = pipeline(
    "ner",
    model="./ner_model_final",
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # merge subword pieces into whole-entity spans
)
text = "Apple Inc. is planning to open a new store in Paris."
results = ner_pipeline(text)
print(f"\nNER results for: {text}")
for entity in results:
    print(f" {entity['word']} -> {entity['entity_group']}")
案例3:问答系统
任务描述
在给定上下文中抽取答案。
#!/usr/bin/env python3
"""
抽取式问答微调示例
"""
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForQuestionAnswering,
TrainingArguments,
Trainer,
default_data_collator,
)
import numpy as np
from collections import defaultdict
# 1. Load SQuAD v1.1 (extractive QA: every answer is a span inside the context).
dataset = load_dataset("squad")
# 2. Load model and tokenizer; the QA head predicts answer start/end token positions.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# 3. Preprocessing: tokenize (question, context) pairs and convert the
# character-level answer span into start/end token positions.
def preprocess_function(examples):
    """Tokenize QA examples and compute start/end token positions of answers.

    Long contexts are split into overlapping windows (stride=128); each window
    becomes its own training feature. Windows that do not contain the answer
    get the [CLS] index as both start and end position.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]
    # Encode question + context; truncate only the context ("only_second")
    # and emit overflowing windows plus char offsets for span mapping.
    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # overflow_to_sample_mapping: feature index -> original example index.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # sequence_ids: None for special tokens, 0 for question, 1 for context.
        sequence_ids = tokenized.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        # Unanswerable example: point both positions at [CLS].
        if len(answer["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            # Locate the first and last tokens that belong to the context.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # If the answer span falls outside this window, label it with [CLS].
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Walk inward to the exact start/end tokens of the answer span.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized
print("Preprocessing data...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)
# 4. Training on a subsample for a quick demo run.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./qa_model",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=100,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(10000)),
    eval_dataset=tokenized_dataset["validation"].select(range(1000)),
    tokenizer=tokenizer,
    # Inputs were padded to max_length above, so the default collator suffices.
    data_collator=default_data_collator,
)
print("Starting training...")
trainer.train()
trainer.save_model("./qa_model_final")
# 5. Inference smoke test.
from transformers import pipeline
qa_pipeline = pipeline(
    "question-answering",
    model="./qa_model_final",
    tokenizer=tokenizer
)
context = """
Hugging Face is a company that develops tools for building applications using machine learning.
It is most notable for its Transformers library built for natural language processing applications.
"""
question = "What is Hugging Face most notable for?"
result = qa_pipeline(question=question, context=context)
print(f"\nQ: {question}")
print(f"A: {result['answer']}")
参数高效微调(PEFT)
LoRA 微调
#!/usr/bin/env python3
"""
使用 LoRA 进行参数高效微调
"""
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
# 1. Load the dataset.
dataset = load_dataset("imdb")
# 2. Load the base model and tokenizer.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# 3. Configure LoRA: inject trainable low-rank adapters into the chosen
# submodules while the original weights stay frozen.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # sequence classification task
    inference_mode=False,        # training mode: adapters are trainable
    r=16,                        # rank of the low-rank update matrices
    lora_alpha=32,               # scaling factor (effective scale = alpha / r)
    lora_dropout=0.1,            # dropout applied to adapter inputs
    target_modules=["query", "key", "value", "dense"],  # BERT submodule names to adapt
)
# 4. Wrap the model with the LoRA adapters.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Example output: trainable params: 1,234,567 (0.42% of all parameters)
# 5. Preprocessing.
def preprocess(examples):
    """Tokenize a batch of texts, truncating each to at most 512 tokens."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, max_length=512)
# Tokenize, drop the raw text column, and rename label -> labels for the Trainer.
tokenized = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch")
# 6. Training configuration.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./lora_imdb",
    evaluation_strategy="epoch",
    learning_rate=1e-3,  # LoRA typically uses a much larger LR than full fine-tuning
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=100,
    fp16=True,  # mixed precision; requires a CUDA GPU
)
def compute_metrics(eval_pred):
    """Return accuracy for the binary classification eval set."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"].select(range(5000)),  # subsample for a quick demo
    eval_dataset=tokenized["test"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),  # dynamic per-batch padding
    compute_metrics=compute_metrics,
)
print("Starting LoRA training...")
trainer.train()
# 7. Save only the LoRA adapter weights (small file, not the full model).
model.save_pretrained("./lora_imdb_final")
# 8. Inference: load the base model, attach the adapters (optionally merge).
from peft import PeftModel
# Load the frozen base model.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Attach the trained LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, "./lora_imdb_final")
# Optional: fold the adapters into the base weights to get a standalone model.
model = model.merge_and_unload()
微调最佳实践
1. 学习率选择
# Suggested learning rates per fine-tuning strategy.
# Full fine-tuning (all parameters).
learning_rate = 2e-5
# Partially frozen (train only the classification head).
learning_rate = 5e-5
# LoRA fine-tuning.
learning_rate = 1e-3
# Prompt tuning.
learning_rate = 1e-2
2. 层冻结策略
# Freeze every parameter except the classification head.
for param in model.base_model.parameters():
    param.requires_grad = False
# Then unfreeze only the last two encoder layers.
# NOTE(review): `base_model.encoder.layer` is a BERT-style attribute path —
# confirm the equivalent path for other architectures.
for param in model.base_model.encoder.layer[-2:].parameters():
    param.requires_grad = True
3. 数据增强
from transformers import DataCollatorForLanguageModeling
# NOTE(review): the import above is not used by the techniques listed below —
# verify whether it belongs to a fuller example.
# Common text data-augmentation techniques:
# 1. Back translation
# 2. Synonym replacement
# 3. Random deletion / swap
4. 早停和检查点
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    # ...
    load_best_model_at_end=True,  # restore the best checkpoint after training
    metric_for_best_model="f1",   # "best" is judged by eval F1
    greater_is_better=True,       # higher F1 is better
    save_total_limit=3,           # keep at most 3 checkpoints on disk
)
trainer = Trainer(
    # ...
    # Stop when the metric fails to improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
下一步
掌握微调技能后,你可以继续学习:
- 模型部署 - 将微调后的模型部署到生产环境(敬请期待)
- 速查表 - 常用 API 和代码片段