速查表
快速参考 Hugging Face Transformers 的常用 API 和代码片段。
快速开始
安装
# 基础安装
pip install transformers
# 完整安装
pip install transformers datasets evaluate accelerate
# 特定功能
pip install transformers[vision] # 视觉
pip install transformers[audio] # 音频
pip install sentencepiece # SentencePiece 分词器
基础使用
from transformers import pipeline
# 情感分析
classifier = pipeline("sentiment-analysis")
result = classifier("I love Transformers!")
# 文本生成
generator = pipeline("text-generation", model="gpt2")
result = generator("The future of AI is")
# 问答
qa = pipeline("question-answering")
result = qa(question="Who created Transformers?", context="Hugging Face created Transformers.")
模型和分词器
加载模型
from transformers import AutoModel, AutoTokenizer, AutoConfig
# 自动加载
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")
# 任务特定模型
from transformers import (
AutoModelForSequenceClassification, # 分类
AutoModelForTokenClassification, # Token 分类
AutoModelForQuestionAnswering, # 问答
AutoModelForCausalLM, # 因果语言模型
AutoModelForMaskedLM, # 掩码语言模型
AutoModelForSeq2SeqLM, # 序列到序列
)
分词器操作
# 编码
tokens = tokenizer.encode("Hello world")
encoding = tokenizer("Hello world", padding=True, truncation=True, max_length=512)
# 批量编码
encodings = tokenizer(
["Text 1", "Text 2"],
padding=True,
truncation=True,
return_tensors="pt"
)
# 解码
text = tokenizer.decode([101, 7592, 2088, 102])
text = tokenizer.decode(tokens, skip_special_tokens=True)
# 获取词表
vocab_size = tokenizer.vocab_size
token_id = tokenizer.convert_tokens_to_ids("[CLS]")
token = tokenizer.convert_ids_to_tokens(101)
训练
TrainingArguments 常用参数
from transformers import TrainingArguments
args = TrainingArguments(
# 基础配置
output_dir="./results",
num_train_epochs=3,
# 批次配置
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
gradient_accumulation_steps=4,
# 优化器配置
learning_rate=5e-5,
weight_decay=0.01,
warmup_ratio=0.1,
# 评估和保存
eval_strategy="epoch",  # transformers < 4.41 中参数名为 evaluation_strategy
save_strategy="epoch",
load_best_model_at_end=True,
# 日志
logging_steps=100,
# 性能
fp16=True,
dataloader_num_workers=4,
)
Trainer 基础模板
from transformers import Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()
trainer.save_model("./my-model")
自定义训练循环
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=100,
num_training_steps=1000
)
model.train()
for batch in dataloader:
outputs = model(**batch)
loss = outputs.loss
loss.backward()
optimizer.step()
scheduler.step()
optimizer.zero_grad()
数据整理器
from transformers import (
DataCollatorWithPadding, # 动态填充
DataCollatorForTokenClassification, # Token 分类
DataCollatorForLanguageModeling, # 语言模型
DataCollatorForSeq2Seq, # 序列到序列
)
# 动态填充(最常用)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# MLM 数据整理器
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=True,
mlm_probability=0.15
)
参数高效微调 (PEFT)
LoRA
from peft import LoraConfig, get_peft_model, TaskType
lora_config = LoraConfig(
task_type=TaskType.SEQ_CLS,
r=16,
lora_alpha=32,
lora_dropout=0.1,
target_modules=["query", "key", "value"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
加载 LoRA 模型
from peft import PeftModel
base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = PeftModel.from_pretrained(base_model, "./lora-weights")
推理优化
模型量化
# 8-bit 量化(新版 transformers 建议通过 BitsAndBytesConfig / quantization_config 配置)
model = AutoModel.from_pretrained(
"bert-base-uncased",
load_in_8bit=True,
device_map="auto"
)
# 4-bit 量化(新版 transformers 建议通过 BitsAndBytesConfig / quantization_config 配置)
model = AutoModel.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16
)
半精度推理
import torch
model = AutoModel.from_pretrained(
"bert-base-uncased",
torch_dtype=torch.float16
)
常用数据集操作
from datasets import load_dataset, Dataset
# 加载数据集
dataset = load_dataset("imdb")
dataset = load_dataset("csv", data_files="data.csv")
# 数据集操作
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(1000))
dataset = dataset.filter(lambda x: len(x["text"]) > 100)
# 映射处理
def preprocess(examples):
    """Tokenize the ``text`` column of a batch of dataset examples."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)
dataset = dataset.map(preprocess, batched=True)
# 保存和加载
dataset.save_to_disk("./my-dataset")
dataset = Dataset.load_from_disk("./my-dataset")
评估指标
import evaluate
# 加载指标(datasets.load_metric 已在 datasets v3.0 中移除,统一使用 evaluate 库)
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# 计算
results = accuracy.compute(predictions=preds, references=labels)
# 使用 evaluate 库
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
回调函数
from transformers import EarlyStoppingCallback, TrainerCallback
# 早停
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
# 自定义回调
class CustomCallback(TrainerCallback):
    """Minimal custom callback that reports progress at the end of each epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        # state.epoch is maintained by the Trainer; control is passed through unchanged.
        print(f"Epoch {state.epoch} completed")
        return control
配置和日志
模型配置
from transformers import BertConfig, BertModel
# 创建配置
config = BertConfig(
vocab_size=30000,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12
)
# 从配置创建模型
model = BertModel(config)
# 修改现有配置
model.config.hidden_dropout_prob = 0.2
日志配置
import logging
from transformers import logging as transformers_logging
# 设置日志级别
transformers_logging.set_verbosity_info()
transformers_logging.set_verbosity_debug()
transformers_logging.set_verbosity_warning()
# 禁用日志
transformers_logging.set_verbosity_error()
实用代码片段
计算模型参数量
def count_parameters(model):
    """Print and return the total / trainable parameter counts of *model*.

    Args:
        model: any object whose ``parameters()`` yields tensors exposing
            ``numel()`` and ``requires_grad`` (e.g. ``torch.nn.Module``).

    Returns:
        tuple[int, int]: ``(total, trainable)`` parameter counts.
    """
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"总参数量: {total:,}")
    print(f"可训练参数量: {trainable:,}")
    # Guard against ZeroDivisionError for parameter-less models.
    ratio = 100 * trainable / total if total else 0.0
    print(f"可训练比例: {ratio:.2f}%")
    return total, trainable
count_parameters(model)
梯度累积计算
def calculate_effective_batch_size(
    per_device_batch_size,
    gradient_accumulation_steps,
    num_devices=1
):
    """Return the effective global batch size.

    Effective batch = per-device batch * gradient accumulation steps * devices.
    """
    per_device_effective = per_device_batch_size * gradient_accumulation_steps
    return per_device_effective * num_devices
学习率调度
from transformers import (
get_linear_schedule_with_warmup,
get_cosine_schedule_with_warmup,
get_constant_schedule_with_warmup,
)
# 线性衰减
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=100,
num_training_steps=1000
)
# 余弦衰减
scheduler = get_cosine_schedule_with_warmup(
optimizer,
num_warmup_steps=100,
num_training_steps=1000
)
混合精度训练检查
import torch
def check_fp16_support():
    """Report whether the current CUDA GPU supports FP16 / BF16 training."""
    if not torch.cuda.is_available():
        print("无 GPU 可用")
        return
    capability = torch.cuda.get_device_capability()
    print(f"GPU 计算能力: {capability}")
    major = capability[0]
    if major >= 7:  # Volta and newer architectures
        print("支持 FP16 训练")
    if major >= 8:  # Ampere and newer architectures
        print("支持 BF16 训练")
check_fp16_support()
常见任务 Pipeline
| 任务 | Pipeline 名称 | 示例 |
|---|---|---|
| 情感分析 | sentiment-analysis | pipeline("sentiment-analysis") |
| 文本生成 | text-generation | pipeline("text-generation", model="gpt2") |
| 问答 | question-answering | pipeline("question-answering") |
| 命名实体识别 | ner | pipeline("ner", grouped_entities=True) |
| 文本摘要 | summarization | pipeline("summarization") |
| 翻译 | translation_xx_to_yy | pipeline("translation_en_to_fr") |
| 填空 | fill-mask | pipeline("fill-mask", model="bert-base-uncased") |
| 零样本分类 | zero-shot-classification | pipeline("zero-shot-classification") |
| 特征提取 | feature-extraction | pipeline("feature-extraction") |
| 图像分类 | image-classification | pipeline("image-classification") |
| 目标检测 | object-detection | pipeline("object-detection") |
| 语音识别 | automatic-speech-recognition | pipeline("automatic-speech-recognition") |
常用模型名称
| 模型 | 用途 | 模型名称 |
|---|---|---|
| BERT | 理解任务 | bert-base-uncased |
| RoBERTa | 理解任务 | roberta-base |
| DistilBERT | 轻量级理解 | distilbert-base-uncased |
| GPT-2 | 文本生成 | gpt2 |
| T5 | 序列到序列 | t5-small |
| BART | 序列到序列 | facebook/bart-base |
| ELECTRA | 理解任务 | google/electra-base-discriminator |
| ALBERT | 轻量级理解 | albert-base-v2 |
| DeBERTa | 理解任务 | microsoft/deberta-base |
| LLaMA | 文本生成 | meta-llama/Llama-2-7b-hf |