
Fine-tuning in Practice

This chapter walks through fine-tuning pretrained models with hands-on examples, covering the complete workflow for common tasks such as text classification, named entity recognition, and question answering.

Fine-tuning Overview

What is Fine-tuning?

Fine-tuning continues training a pretrained model on task-specific data so that the model adapts to a downstream task.

Pretrained model                       Fine-tuned model
┌───────────────────┐                  ┌───────────────────┐
│ General knowledge │   + task data    │ Task-specific     │
│ Language ability  │   ──────────►    │ capabilities      │
└───────────────────┘                  └───────────────────┘

Advantages:
- Leverages pretrained knowledge, so far less task data is needed
- Faster training with better results
- Stronger generalization

Fine-tuning Strategies

Strategy            Description                       When to use
Full fine-tuning    Train all parameters              Ample data and compute
Freeze some layers  Train only the top layers         Limited data; helps prevent overfitting
LoRA                Low-rank adaptation               Large-model fine-tuning; saves GPU memory
Prompt Tuning       Train only the prompt embeddings  Very large models; very little data

The sketch below makes the difference between full fine-tuning and head-only training concrete by counting trainable parameters.
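
A minimal sketch, assuming distilbert-base-uncased as the backbone (any sequence classification checkpoint works the same way, and the helper name count_trainable is just for illustration):

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

def count_trainable(m):
    # Count parameters that would receive gradient updates
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

print(f"Full fine-tuning: {count_trainable(model):,} trainable parameters")

# Freeze the backbone so only the classification head is trained
for param in model.base_model.parameters():
    param.requires_grad = False

print(f"Head only:        {count_trainable(model):,} trainable parameters")

LoRA shrinks the trainable count further still; see the PEFT section later in this chapter.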

Case Study 1: Text Classification

Task

Classify movie reviews by sentiment (positive/negative).

Complete Code

#!/usr/bin/env python3
"""
Text classification fine-tuning example: IMDB sentiment analysis
"""

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Load the data
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")

# Inspect a sample
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")
print(f"Example: {dataset['train'][0]}")

# 2. Load the model and tokenizer
model_name = "distilbert-base-uncased"
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)

# 3. Preprocess the data
def preprocess_function(examples):
    """Tokenize the raw review text."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding=False,  # dynamic padding is handled by the data collator
    )

print("Preprocessing data...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"]
)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

# 4. Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, and F1."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# 5. Training configuration
training_args = TrainingArguments(
    output_dir="./imdb_classifier",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=100,
    fp16=True,
    report_to="tensorboard",
)

# 6. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(10000)),
    eval_dataset=tokenized_dataset["test"].shuffle(seed=42).select(range(2000)),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 7. Train
print("Starting training...")
trainer.train()

# 8. Evaluate
print("Evaluating...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# 9. Save the model
trainer.save_model("./imdb_classifier_final")
print("Model saved to ./imdb_classifier_final")

# 10. Test inference
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model="./imdb_classifier_final",
    tokenizer=tokenizer
)

test_texts = [
    "This movie was absolutely fantastic!",
    "I hated every minute of this film.",
    "It was okay, nothing special."
]

print("\nTest predictions:")
for text in test_texts:
    result = classifier(text)
    print(f"{text} -> {result[0]['label']} ({result[0]['score']:.4f})")

Case Study 2: Named Entity Recognition (NER)

Task

Identify entities such as person names, locations, and organizations in text.

#!/usr/bin/env python3
"""
Named entity recognition fine-tuning example
"""

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
import numpy as np

# 1. Load the data
dataset = load_dataset("conll2003")

# Inspect the label set
label_list = dataset["train"].features["ner_tags"].feature.names
print(f"Labels: {label_list}")

# 2. Load the model and tokenizer
model_name = "bert-base-cased"  # NER usually benefits from a cased model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list)
)

# 3. Preprocessing (handle subword alignment)
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the NER labels with the subwords."""
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens: [CLS], [SEP], [PAD]
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a new word
                label_ids.append(label[word_idx])
            else:
                # Remaining subwords of the same word
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

print("Preprocessing data...")
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# 4. Evaluation metrics
def compute_metrics(eval_pred):
    """Compute a simple token-level accuracy for NER."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    # Drop special tokens (label -100) and map ids back to tag names
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Simplified metric: token-level accuracy
    # (an entity-level alternative is sketched after this example)
    correct = sum(
        sum(p == l for p, l in zip(pred, lab))
        for pred, lab in zip(true_predictions, true_labels)
    )
    total = sum(len(lab) for lab in true_labels)
    accuracy = correct / total if total > 0 else 0

    return {"accuracy": accuracy}

# 5. Training
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting training...")
trainer.train()
trainer.save_model("./ner_model_final")

# 6. Test
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model="./ner_model_final",
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

text = "Apple Inc. is planning to open a new store in Paris."
results = ner_pipeline(text)
print(f"\nNER results for: {text}")
for entity in results:
    print(f"  {entity['word']} -> {entity['entity_group']}")

Case Study 3: Question Answering

Task

Extract the answer span from a given context.

#!/usr/bin/env python3
"""
Extractive question answering fine-tuning example
"""

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import numpy as np
from collections import defaultdict

# 1. Load the data
dataset = load_dataset("squad")

# 2. Load the model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# 3. Preprocessing
def preprocess_function(examples):
    """Preprocess question answering data."""
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    # Encode question and context pairs
    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Compute the start and end positions of each answer
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer = answers[sample_index]

        # No answer for this example
        if len(answer["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Locate the span of context tokens in this feature
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Check whether the answer lies inside this span
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Move to the start and end tokens of the answer
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized

print("Preprocessing data...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# 4. Training
training_args = TrainingArguments(
    output_dir="./qa_model",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(10000)),
    eval_dataset=tokenized_dataset["validation"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

print("Starting training...")
trainer.train()
trainer.save_model("./qa_model_final")

# 5. Test
from transformers import pipeline

qa_pipeline = pipeline(
    "question-answering",
    model="./qa_model_final",
    tokenizer=tokenizer
)

context = """
Hugging Face is a company that develops tools for building applications using machine learning.
It is most notable for its Transformers library built for natural language processing applications.
"""

question = "What is Hugging Face most notable for?"
result = qa_pipeline(question=question, context=context)
print(f"\nQ: {question}")
print(f"A: {result['answer']}")

Parameter-Efficient Fine-Tuning (PEFT)

LoRA Fine-tuning

#!/usr/bin/env python3
"""
Parameter-efficient fine-tuning with LoRA
"""

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

# 1. Load the data
dataset = load_dataset("imdb")

# 2. Load the model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 3. Configure LoRA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,   # sequence classification task
    inference_mode=False,
    r=16,                         # LoRA rank
    lora_alpha=32,                # LoRA alpha (scaling)
    lora_dropout=0.1,             # dropout
    target_modules=["query", "key", "value", "dense"],  # modules to adapt
)

# 4. Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# e.g. trainable params: 1,234,567 (0.42%)

# 5. Preprocess
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch")

# 6. Training
training_args = TrainingArguments(
    output_dir="./lora_imdb",
    evaluation_strategy="epoch",
    learning_rate=1e-3,  # LoRA usually tolerates a larger learning rate
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=100,
    fp16=True,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"].select(range(5000)),
    eval_dataset=tokenized["test"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting LoRA training...")
trainer.train()

# 7. Save the LoRA weights
model.save_pretrained("./lora_imdb_final")

# 8. Inference (merge the weights, or use the adapter directly)
from peft import PeftModel

# Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load the LoRA weights on top of it
model = PeftModel.from_pretrained(base_model, "./lora_imdb_final")

# Optional: merge the weights to obtain a standalone model
model = model.merge_and_unload()
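
After merging, the result behaves like an ordinary Transformers model. A short sketch of what inference could look like (the save path is only an example, and labels print as LABEL_0/LABEL_1 because id2label was not set on this config):

# Save the merged model alongside its tokenizer, then use it like any checkpoint
model.save_pretrained("./lora_imdb_merged")
tokenizer.save_pretrained("./lora_imdb_merged")

from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier("One of the best films I have seen in years."))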

Fine-tuning Best Practices

1. Learning Rate Selection

# Suggested learning rates for different fine-tuning strategies

# Full fine-tuning
learning_rate = 2e-5

# Partial freezing (train only the classification head)
learning_rate = 5e-5

# LoRA fine-tuning
learning_rate = 1e-3

# Prompt Tuning
learning_rate = 1e-2

2. Layer Freezing

# Freeze all parameters except the classification head
for param in model.base_model.parameters():
    param.requires_grad = False

# Unfreeze only the last few layers (BERT-style attribute path)
for param in model.base_model.encoder.layer[-2:].parameters():
    param.requires_grad = True

3. Data Augmentation

# Common data augmentation techniques for text:
# 1. Back translation
# 2. Synonym replacement
# 3. Random deletion / swapping
# (a minimal sketch of technique 3 follows below)
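
Back translation and synonym replacement usually need an extra translation model or a synonym lexicon; the sketch below only illustrates random deletion and swapping in plain Python. The helper names and rates are arbitrary choices for illustration, not part of any library.

import random

def random_deletion(text, p=0.1):
    """Drop each word with probability p; keep at least one word."""
    words = text.split()
    if len(words) <= 1:
        return text
    kept = [w for w in words if random.random() > p]
    return " ".join(kept) if kept else random.choice(words)

def random_swap(text, n_swaps=1):
    """Swap randomly chosen word pairs n_swaps times."""
    words = text.split()
    for _ in range(n_swaps):
        if len(words) < 2:
            break
        i, j = random.sample(range(len(words)), 2)
        words[i], words[j] = words[j], words[i]
    return " ".join(words)

print(random_deletion("this movie was surprisingly good"))
print(random_swap("this movie was surprisingly good"))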

4. Early Stopping and Checkpoints

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    # ...
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=3,  # keep at most 3 checkpoints on disk
)

trainer = Trainer(
    # ...
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Next Steps

Once you are comfortable with fine-tuning, you can continue with:

  • Model Deployment - deploying fine-tuned models to production (coming soon)
  • Cheat Sheet - common APIs and code snippets
