# Transformers Cheat Sheet

A quick reference for common Hugging Face Transformers APIs, code snippets, and best practices.
## Quick Start

### Installation

```bash
# Basic installation
pip install transformers

# Full installation with the common companion libraries
pip install transformers datasets evaluate accelerate

# With conda
conda install -c huggingface transformers
```
### Simplest Usage

```python
from transformers import pipeline

# Sentiment analysis
classifier = pipeline("sentiment-analysis")
result = classifier("I love Transformers!")

# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("Hello, world!")
```
## Core Classes at a Glance

### Model Loading

```python
from transformers import AutoModel, AutoTokenizer, AutoConfig

# Automatic loading
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")

# Task-specific models
from transformers import (
    AutoModelForSequenceClassification,  # sequence classification
    AutoModelForTokenClassification,     # token classification
    AutoModelForQuestionAnswering,       # question answering
    AutoModelForCausalLM,                # causal language modeling
    AutoModelForMaskedLM,                # masked language modeling
    AutoModelForSeq2SeqLM,               # sequence-to-sequence
)
```
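A task-specific class adds the matching head on top of the base model. A minimal sketch, where the label count is just a placeholder for illustration:

```python
from transformers import AutoModelForSequenceClassification

# Loads pretrained BERT weights plus a freshly initialized classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,  # placeholder label count
)
```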
### Tokenizer Usage

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Basic encoding
encoded = tokenizer("Hello world", return_tensors="pt")

# Batch encoding
encoded = tokenizer(
    ["text1", "text2"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# Decoding
decoded = tokenizer.decode(encoded['input_ids'][0])

# Sentence pairs
encoded = tokenizer("question?", "context here", return_tensors="pt")
```
## Pipeline Tasks

### Natural Language Processing

| Task | Pipeline name | Example |
|---|---|---|
| Sentiment analysis | sentiment-analysis | pipeline("sentiment-analysis") |
| Text classification | text-classification | pipeline("text-classification") |
| Named entity recognition | ner | pipeline("ner") |
| Question answering | question-answering | pipeline("question-answering") |
| Text generation | text-generation | pipeline("text-generation", model="gpt2") |
| Summarization | summarization | pipeline("summarization") |
| Translation | translation_xx_to_yy | pipeline("translation_en_to_fr") |
| Fill-mask | fill-mask | pipeline("fill-mask", model="bert-base-uncased") |
| Zero-shot classification | zero-shot-classification | pipeline("zero-shot-classification") |
| Feature extraction | feature-extraction | pipeline("feature-extraction") |
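Some of these pipelines take more than a single string. A couple of illustrative calls (the input texts and candidate labels are made-up examples):

```python
from transformers import pipeline

# Zero-shot classification: you supply the candidate labels yourself
zero_shot = pipeline("zero-shot-classification")
result = zero_shot(
    "The new phone has an amazing camera.",
    candidate_labels=["technology", "sports", "politics"],
)

# Question answering: pass the question and the context separately
qa = pipeline("question-answering")
result = qa(
    question="What does the phone have?",
    context="The new phone has an amazing camera and a large battery.",
)
```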
### Computer Vision

| Task | Pipeline name |
|---|---|
| Image classification | image-classification |
| Object detection | object-detection |
| Image segmentation | image-segmentation |
### Audio

| Task | Pipeline name |
|---|---|
| Speech recognition | automatic-speech-recognition |
| Audio classification | audio-classification |
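Vision and audio pipelines are called the same way and accept file paths, URLs, or arrays. A rough sketch; the file paths and the Whisper checkpoint are placeholders, and decoding audio files requires ffmpeg:

```python
from transformers import pipeline

# Image classification from a local file or URL
image_classifier = pipeline("image-classification")
print(image_classifier("cat.jpg"))  # placeholder image path

# Speech recognition (speech-to-text)
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
print(asr("speech.wav"))  # placeholder audio path
```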
## Training

### Basic Trainer Configuration

```python
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",   # renamed to eval_strategy in recent transformers versions
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    fp16=True,                     # mixed precision
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("./model")
```
### Data Collators

```python
from transformers import (
    DataCollatorWithPadding,             # dynamic padding
    DataCollatorForTokenClassification,  # token classification
    DataCollatorForLanguageModeling,     # language modeling
    DataCollatorForSeq2Seq,              # sequence-to-sequence
    default_data_collator,               # default
)
```
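A collator is built from the tokenizer and passed to the Trainer, which uses it to batch and pad examples on the fly. A minimal sketch (the masking probability is just the commonly used default):

```python
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling

# Pads each batch to the length of its longest example
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# For masked language modeling the collator also creates the random masks
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
```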
## Model Configuration

### Configuration Parameters

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-uncased")

# Common configuration attributes
config.hidden_size               # hidden dimension
config.num_hidden_layers         # number of Transformer layers
config.num_attention_heads       # number of attention heads
config.vocab_size                # vocabulary size
config.max_position_embeddings   # maximum number of position embeddings
```
### Modifying the Configuration

```python
# Change the number of labels
config.num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=config
)

# Create a configuration from scratch
from transformers import BertConfig

config = BertConfig(
    vocab_size=30000,
    hidden_size=768,
    num_hidden_layers=12,
)
```
## Inference Optimization

### GPU Inference

```python
import torch

# Move the model to the GPU
model = model.to("cuda")

# Move the inputs to the GPU
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

# Inference without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
```
### Quantized Loading

Requires the bitsandbytes package (`pip install bitsandbytes`). Recent transformers versions configure quantization through `BitsAndBytesConfig` (see the sketch below); the shortcut keyword arguments are shown here for reference.

```python
# 8-bit quantization
model = AutoModel.from_pretrained(
    "model_name",
    load_in_8bit=True
)

# 4-bit quantization
model = AutoModel.from_pretrained(
    "model_name",
    load_in_4bit=True
)
```
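A sketch of the `BitsAndBytesConfig` route, which is where options such as the 4-bit compute dtype belong ("model_name" remains a placeholder model id):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the dequantized matmuls
)

model = AutoModelForCausalLM.from_pretrained(
    "model_name",                    # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",               # let accelerate place layers on available devices
)
```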
### Batched Inference

```python
# Batching is faster than a Python loop
texts = ["text1", "text2", "text3"]

# Fast
inputs = tokenizer(texts, padding=True, return_tensors="pt")
outputs = model(**inputs)

# Slow
for text in texts:
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
```
## Saving and Loading Models

### Saving a Model

```python
# Save locally
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

# Push to the Hugging Face Hub
model.push_to_hub("username/model-name")
tokenizer.push_to_hub("username/model-name")

# Save via the Trainer
trainer.save_model("./my_model")
```
### Loading a Model

```python
# Load from a local directory
model = AutoModel.from_pretrained("./my_model")

# Load from the Hub
model = AutoModel.from_pretrained("username/model-name")

# Pin a specific revision
model = AutoModel.from_pretrained(
    "bert-base-uncased",
    revision="v1.0.0"
)
```
## Common Code Snippets

### Feature Extraction

```python
from transformers import AutoModel, AutoTokenizer
import torch

model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Example text"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# [CLS] token representation
cls_embedding = outputs.last_hidden_state[:, 0, :]

# Mean pooling over non-padding tokens
attention_mask = inputs['attention_mask']
mask_expanded = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
sum_embeddings = torch.sum(outputs.last_hidden_state * mask_expanded, 1)
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
mean_pooled = sum_embeddings / sum_mask
```
### Text Generation Parameters

```python
generator = pipeline("text-generation", model="gpt2")

results = generator(
    prompt,
    max_length=100,           # maximum total length
    min_length=10,            # minimum length
    temperature=0.7,          # temperature (randomness)
    top_k=50,                 # top-k sampling
    top_p=0.95,               # top-p (nucleus) sampling
    num_return_sequences=3,   # number of sequences to return
    do_sample=True,           # enable sampling
    repetition_penalty=1.2,   # repetition penalty
    early_stopping=True,      # early stopping (only relevant for beam search)
)
```
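The same generation parameters can be passed to `model.generate()` when you are not using a pipeline. A minimal sketch with a made-up prompt:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,   # length of the newly generated continuation
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```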
### Custom Loss Function

```python
import torch
import torch.nn as nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Custom loss: class-weighted cross entropy
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0]).to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```
### Custom Evaluation Metrics

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }
```
## Common Models

### BERT Family

| Model | Parameters | Notes |
|---|---|---|
| bert-base-uncased | 110M | Base model, lowercased |
| bert-base-cased | 110M | Base model, case-sensitive |
| bert-large-uncased | 340M | Large model |
| distilbert-base-uncased | 66M | Distilled, faster |
| roberta-base | 125M | Robustly optimized BERT variant |
### GPT Family

| Model | Parameters | Notes |
|---|---|---|
| gpt2 | 124M | GPT-2 base model |
| gpt2-medium | 355M | Medium |
| gpt2-large | 774M | Large |
| gpt2-xl | 1.5B | Extra large |
### T5 Family

| Model | Parameters | Notes |
|---|---|---|
| t5-small | 60M | Small |
| t5-base | 220M | Base |
| t5-large | 770M | Large |
### Chinese Models

| Model | Description |
|---|---|
| bert-base-chinese | Base Chinese BERT |
| hfl/chinese-roberta-wwm-ext | Chinese RoBERTa (whole-word masking) |
| uer/roberta-base-finetuned-chinanews-chinese | Chinese news classification |
## Troubleshooting

### Out of GPU Memory

```python
# Option 1: use a smaller model
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Option 2: use quantization
model = AutoModel.from_pretrained("bert-base-uncased", load_in_8bit=True)

# Option 3: run on the CPU
model = AutoModel.from_pretrained("bert-base-uncased", device_map="cpu")

# Option 4: gradient checkpointing (trades extra compute for memory during training)
model.gradient_checkpointing_enable()

# Option 5: free cached GPU memory
import torch
torch.cuda.empty_cache()
```
### Model Download Issues

```python
# Use a mirror endpoint (for users in mainland China)
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Use only the local cache (no network access)
model = AutoModel.from_pretrained("model_name", local_files_only=True)

# Specify a cache directory
model = AutoModel.from_pretrained(
    "model_name",
    cache_dir="./huggingface_cache"
)
```
### Handling Long Text

```python
# Truncation
tokenizer(text, max_length=512, truncation=True)

# Sliding window (overlapping chunks)
tokenizer(
    text,
    max_length=512,
    truncation=True,
    return_overflowing_tokens=True,
    stride=128
)
```
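With `return_overflowing_tokens=True`, a fast tokenizer returns one row per chunk plus an `overflow_to_sample_mapping` entry that tells you which input each chunk came from. A rough sketch of running a sequence-classification model over the chunks and averaging the logits, assuming `model` and `tokenizer` are a classification model and its fast tokenizer; mean aggregation is just one simple choice:

```python
import torch

encoded = tokenizer(
    text,
    max_length=512,
    truncation=True,
    return_overflowing_tokens=True,
    stride=128,
    padding=True,          # chunks can differ in length, so pad before stacking
    return_tensors="pt",
)
# Which original sample each chunk belongs to (all zeros for a single text)
print(encoded["overflow_to_sample_mapping"])

with torch.no_grad():
    # Pass only the model inputs, not the mapping tensor
    outputs = model(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
    )

# Average the per-chunk logits to get one prediction for the whole document
doc_logits = outputs.logits.mean(dim=0)
```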