跳到主要内容

Transformers 知识速查表

快速参考 Hugging Face Transformers 的常用 API、代码片段和最佳实践。

快速开始

安装

# 基础安装
pip install transformers

# 完整安装
pip install transformers datasets evaluate accelerate

# 使用 conda
conda install -c huggingface transformers

最简单的使用方式

from transformers import pipeline

# 情感分析
classifier = pipeline("sentiment-analysis")
result = classifier("I love Transformers!")

# 文本生成
generator = pipeline("text-generation", model="gpt2")
result = generator("Hello, world!")

核心类速查

模型加载

from transformers import AutoModel, AutoTokenizer, AutoConfig

# 自动加载
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")

# 任务特定模型
from transformers import (
AutoModelForSequenceClassification, # 分类
AutoModelForTokenClassification, # Token 分类
AutoModelForQuestionAnswering, # 问答
AutoModelForCausalLM, # 因果语言模型
AutoModelForMaskedLM, # 掩码语言模型
AutoModelForSeq2SeqLM, # 序列到序列
)

分词器使用

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 基本编码
encoded = tokenizer("Hello world", return_tensors="pt")

# 批量编码
encoded = tokenizer(
["text1", "text2"],
padding=True,
truncation=True,
max_length=512,
return_tensors="pt"
)

# 解码
decoded = tokenizer.decode(encoded['input_ids'][0])

# 句子对
encoded = tokenizer("question?", "context here", return_tensors="pt")

Pipeline 任务列表

自然语言处理

| 任务 | Pipeline 名称 | 示例 |
| --- | --- | --- |
| 情感分析 | sentiment-analysis | `pipeline("sentiment-analysis")` |
| 文本分类 | text-classification | `pipeline("text-classification")` |
| 命名实体识别 | ner | `pipeline("ner")` |
| 问答 | question-answering | `pipeline("question-answering")` |
| 文本生成 | text-generation | `pipeline("text-generation", model="gpt2")` |
| 摘要 | summarization | `pipeline("summarization")` |
| 翻译 | translation_xx_to_yy | `pipeline("translation_en_to_fr")` |
| 填充掩码 | fill-mask | `pipeline("fill-mask", model="bert-base-uncased")` |
| 零样本分类 | zero-shot-classification | `pipeline("zero-shot-classification")` |
| 特征提取 | feature-extraction | `pipeline("feature-extraction")` |

计算机视觉

| 任务 | Pipeline 名称 |
| --- | --- |
| 图像分类 | image-classification |
| 目标检测 | object-detection |
| 图像分割 | image-segmentation |

音频

| 任务 | Pipeline 名称 |
| --- | --- |
| 语音识别 | automatic-speech-recognition |
| 音频分类 | audio-classification |

训练相关

Trainer 基本配置

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=3,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
learning_rate=2e-5,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
logging_steps=100,
fp16=True, # 混合精度
)

trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("./model")

数据整理器

from transformers import (
DataCollatorWithPadding, # 动态填充
DataCollatorForTokenClassification, # Token 分类
DataCollatorForLanguageModeling, # 语言建模
DataCollatorForSeq2Seq, # 序列到序列
default_data_collator, # 默认
)

模型配置

配置参数

from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-uncased")

# 常见配置项
config.hidden_size # 隐藏层维度
config.num_hidden_layers # Transformer 层数
config.num_attention_heads # 注意力头数
config.vocab_size # 词汇表大小
config.max_position_embeddings # 最大位置编码

修改配置

# 修改分类数
config.num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(
"bert-base-uncased",
config=config
)

# 从头创建配置
from transformers import BertConfig
config = BertConfig(
vocab_size=30000,
hidden_size=768,
num_hidden_layers=12,
)

推理优化

GPU 推理

import torch

# 移动模型到 GPU
model = model.to("cuda")

# 移动输入到 GPU
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

# 推理
with torch.no_grad():
outputs = model(**inputs)

量化加载

# 8-bit 量化
model = AutoModel.from_pretrained(
"model_name",
load_in_8bit=True
)

# 4-bit 量化
model = AutoModel.from_pretrained(
"model_name",
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16
)

批量推理

# 批量处理比循环快
texts = ["text1", "text2", "text3"]

# 快
inputs = tokenizer(texts, padding=True, return_tensors="pt")
outputs = model(**inputs)

# 慢
for text in texts:
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

模型保存与加载

保存模型

# 保存到本地
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

# 推送到 Hub
model.push_to_hub("username/model-name")
tokenizer.push_to_hub("username/model-name")

# Trainer 保存
trainer.save_model("./my_model")

加载模型

# 从本地加载
model = AutoModel.from_pretrained("./my_model")

# 从 Hub 加载
model = AutoModel.from_pretrained("username/model-name")

# 指定版本
model = AutoModel.from_pretrained(
"bert-base-uncased",
revision="v1.0.0"
)

常用代码片段

特征提取

from transformers import AutoModel, AutoTokenizer
import torch

model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Example text"
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
outputs = model(**inputs)

# [CLS] token 表示
cls_embedding = outputs.last_hidden_state[:, 0, :]

# 平均池化
attention_mask = inputs['attention_mask']
mask_expanded = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
sum_embeddings = torch.sum(outputs.last_hidden_state * mask_expanded, 1)
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
mean_pooled = sum_embeddings / sum_mask

文本生成参数

generator = pipeline("text-generation", model="gpt2")

results = generator(
prompt,
max_length=100, # 最大长度
min_length=10, # 最小长度
temperature=0.7, # 温度(随机性)
top_k=50, # Top-K 采样
top_p=0.95, # Top-P 采样
num_return_sequences=3, # 返回数量
do_sample=True, # 是否采样
repetition_penalty=1.2, # 重复惩罚
early_stopping=True, # 早停
)

自定义损失函数

from transformers import Trainer
import torch.nn as nn

class CustomTrainer(Trainer):
    """Trainer subclass that applies a class-weighted cross-entropy loss.

    Overrides ``compute_loss`` so minority classes can be up-weighted
    (here class 1 counts twice as much as class 0).
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Pop (not get) the labels so the model's forward pass does not also
        # compute -- and then discard -- its own built-in loss on them.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The weight tensor must live on the same device as the logits.
        # NOTE(review): the weights are hard-coded for 2 classes while the
        # reshape below uses config.num_labels -- confirm they agree.
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0]).to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

自定义评估指标

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Return accuracy and weighted-average F1 for one evaluation pass.

    ``eval_pred`` is the (predictions, labels) pair handed over by the
    Trainer; predictions are raw per-class scores, so the predicted class
    is the argmax along the class axis.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

常用模型列表

BERT 系列

| 模型 | 大小 | 特点 |
| --- | --- | --- |
| bert-base-uncased | 110M | 基础版,小写 |
| bert-base-cased | 110M | 基础版,保留大小写 |
| bert-large-uncased | 340M | 大版本 |
| distilbert-base-uncased | 66M | 蒸馏版,更快 |
| roberta-base | 125M | 优化版 BERT |

GPT 系列

| 模型 | 大小 | 特点 |
| --- | --- | --- |
| gpt2 | 124M | GPT-2 基础版 |
| gpt2-medium | 355M | 中等版本 |
| gpt2-large | 774M | 大版本 |
| gpt2-xl | 1.5B | 超大版本 |

T5 系列

| 模型 | 大小 | 特点 |
| --- | --- | --- |
| t5-small | 60M | 小版本 |
| t5-base | 220M | 基础版 |
| t5-large | 770M | 大版本 |

中文模型

| 模型 | 说明 |
| --- | --- |
| bert-base-chinese | 基础中文 BERT |
| hfl/chinese-roberta-wwm-ext | 中文 RoBERTa |
| uer/roberta-base-finetuned-chinanews-chinese | 中文新闻分类 |

常见问题解决

显存不足

# 方案1:使用更小的模型
model = AutoModel.from_pretrained("distilbert-base-uncased")

# 方案2:使用量化
model = AutoModel.from_pretrained("bert-base-uncased", load_in_8bit=True)

# 方案3:使用 CPU
model = AutoModel.from_pretrained("bert-base-uncased", device_map="cpu")

# 方案4:梯度检查点
model.gradient_checkpointing_enable()

# 方案5:清空缓存
import torch
torch.cuda.empty_cache()

模型下载问题

# 设置镜像(国内用户)
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# 使用本地缓存
model = AutoModel.from_pretrained("model_name", local_files_only=True)

# 指定缓存目录
model = AutoModel.from_pretrained(
"model_name",
cache_dir="./huggingface_cache"
)

长文本处理

# 截断
tokenizer(text, max_length=512, truncation=True)

# 滑动窗口
tokenizer(
text,
max_length=512,
truncation=True,
return_overflowing_tokens=True,
stride=128
)

参考链接