Cheatsheet

A quick reference for common Hugging Face Transformers APIs and code snippets.

Quick Start

Installation

# Basic installation
pip install transformers

# Full installation
pip install transformers datasets evaluate accelerate

# Feature-specific extras
pip install transformers[vision]  # vision
pip install transformers[audio]   # audio
pip install sentencepiece          # SentencePiece tokenizer

Basic Usage

from transformers import pipeline

# Sentiment analysis
classifier = pipeline("sentiment-analysis")
result = classifier("I love Transformers!")

# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("The future of AI is")

# Question answering
qa = pipeline("question-answering")
result = qa(question="Who created Transformers?", context="Hugging Face created Transformers.")

Models and Tokenizers

Loading a Model

from transformers import AutoModel, AutoTokenizer, AutoConfig

# Auto classes
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")

# Task-specific model classes
from transformers import (
    AutoModelForSequenceClassification,  # sequence classification
    AutoModelForTokenClassification,     # token classification
    AutoModelForQuestionAnswering,       # question answering
    AutoModelForCausalLM,                # causal language modeling
    AutoModelForMaskedLM,                # masked language modeling
    AutoModelForSeq2SeqLM,               # sequence-to-sequence
)
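
These task-specific classes are loaded the same way as AutoModel; a minimal sketch for a two-label classification head (the num_labels value is illustrative):

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # size of the classification head
)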

Tokenizer Operations

# Encoding
tokens = tokenizer.encode("Hello world")
encoding = tokenizer("Hello world", padding=True, truncation=True, max_length=512)

# Batch encoding
encodings = tokenizer(
    ["Text 1", "Text 2"],
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Decoding
text = tokenizer.decode([101, 7592, 2088, 102])
text = tokenizer.decode(tokens, skip_special_tokens=True)

# Vocabulary access
vocab_size = tokenizer.vocab_size
token_id = tokenizer.convert_tokens_to_ids("[CLS]")
token = tokenizer.convert_ids_to_tokens(101)

Training

Common TrainingArguments Parameters

from transformers import TrainingArguments

training_args = TrainingArguments(
    # Basic configuration
    output_dir="./results",
    num_train_epochs=3,

    # Batch configuration
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,

    # Optimizer configuration
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,

    # Evaluation and saving
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Logging
    logging_steps=100,

    # Performance
    fp16=True,
    dataloader_num_workers=4,
)

Basic Trainer Template

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
trainer.save_model("./my-model")
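
The compute_metrics function referenced above is not defined in this cheatsheet; a minimal sketch for classification, using the evaluate library, could look like:

import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # eval_pred is a (logits, labels) pair provided by the Trainer
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)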

Custom Training Loop

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=1000
)

model.train()
for batch in dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

Data Collators

from transformers import (
    DataCollatorWithPadding,             # dynamic padding
    DataCollatorForTokenClassification,  # token classification
    DataCollatorForLanguageModeling,     # language modeling
    DataCollatorForSeq2Seq,              # sequence-to-sequence
)

# Dynamic padding (most common)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# MLM data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)
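
A collator is wired into training through the Trainer's data_collator argument; a minimal sketch reusing the objects defined above:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,  # pads each batch dynamically
)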

Parameter-Efficient Fine-Tuning (PEFT)

LoRA

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "key", "value"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading a LoRA Model

from peft import PeftModel

base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = PeftModel.from_pretrained(base_model, "./lora-weights")
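
To fold the LoRA weights back into the base model for plain inference, PEFT provides merge_and_unload(); a short sketch (the output path is illustrative):

merged_model = model.merge_and_unload()   # returns the base model with adapters merged
merged_model.save_pretrained("./merged-model")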

Inference Optimization

Model Quantization

import torch
from transformers import BitsAndBytesConfig

# 8-bit quantization (requires bitsandbytes)
model = AutoModel.from_pretrained(
    "bert-base-uncased",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)

# 4-bit quantization
model = AutoModel.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    ),
    device_map="auto"
)

Half-Precision Inference

import torch

model = AutoModel.from_pretrained(
    "bert-base-uncased",
    torch_dtype=torch.float16
)
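
A minimal half-precision inference call, assuming a CUDA GPU and the tokenizer loaded earlier:

model = model.to("cuda")
inputs = tokenizer("Hello world", return_tensors="pt").to("cuda")

with torch.no_grad():  # disable gradient tracking for inference
    outputs = model(**inputs)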

Common Dataset Operations

from datasets import load_dataset, Dataset

# Load a dataset
dataset = load_dataset("imdb")
dataset = load_dataset("csv", data_files="data.csv")

# Dataset operations
dataset = dataset.shuffle(seed=42)
dataset = dataset["train"].select(range(1000))  # select() works on a single split
dataset = dataset.filter(lambda x: len(x["text"]) > 100)

# Map / preprocessing
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True)

dataset = dataset.map(preprocess, batched=True)

# Save and load
dataset.save_to_disk("./my-dataset")
dataset = Dataset.load_from_disk("./my-dataset")
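
To carve a single split into train/eval portions (assuming dataset is a Dataset rather than a DatasetDict), a short sketch:

split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]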

Evaluation Metrics

import evaluate

# Load metrics (the evaluate library replaces the deprecated datasets.load_metric)
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Compute
results = accuracy.compute(predictions=preds, references=labels)

Callbacks

from transformers import EarlyStoppingCallback, TrainerCallback

# Early stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Custom callback
class CustomCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} completed")
        return control
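
Callbacks are passed to the Trainer via its callbacks argument; note that EarlyStoppingCallback also expects an evaluation strategy plus load_best_model_at_end=True and a metric_for_best_model in TrainingArguments. A minimal sketch:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping, CustomCallback()],
)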

Configuration and Logging

Model Configuration

from transformers import BertConfig, BertModel

# Create a configuration
config = BertConfig(
    vocab_size=30000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12
)

# Build a model from the configuration (randomly initialized weights)
model = BertModel(config)

# Modify an existing configuration
model.config.hidden_dropout_prob = 0.2

Logging Configuration

from transformers import logging as transformers_logging

# Set the verbosity level
transformers_logging.set_verbosity_info()
transformers_logging.set_verbosity_debug()
transformers_logging.set_verbosity_warning()

# Show errors only (quietest setting)
transformers_logging.set_verbosity_error()

Useful Code Snippets

Count Model Parameters

def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total:,}")
    print(f"Trainable parameters: {trainable:,}")
    print(f"Trainable ratio: {100 * trainable / total:.2f}%")

count_parameters(model)

Effective Batch Size with Gradient Accumulation

def calculate_effective_batch_size(
    per_device_batch_size,
    gradient_accumulation_steps,
    num_devices=1
):
    return per_device_batch_size * gradient_accumulation_steps * num_devices
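
With the TrainingArguments values shown earlier (per-device batch size 16, 4 accumulation steps) on a single device, the effective batch size works out to 64:

effective = calculate_effective_batch_size(16, 4, num_devices=1)
print(effective)  # 64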

Learning Rate Schedules

from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_constant_schedule_with_warmup,
)

# Linear decay
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=1000
)

# Cosine decay
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=1000
)

Mixed-Precision Support Check

import torch

def check_fp16_support():
    if torch.cuda.is_available():
        capability = torch.cuda.get_device_capability()
        print(f"GPU compute capability: {capability}")
        if capability[0] >= 7:  # Volta and newer
            print("FP16 training supported")
        if capability[0] >= 8:  # Ampere and newer
            print("BF16 training supported")
    else:
        print("No GPU available")

check_fp16_support()

Common Task Pipelines

| Task | Pipeline name | Example |
| --- | --- | --- |
| Sentiment analysis | sentiment-analysis | pipeline("sentiment-analysis") |
| Text generation | text-generation | pipeline("text-generation", model="gpt2") |
| Question answering | question-answering | pipeline("question-answering") |
| Named entity recognition | ner | pipeline("ner", grouped_entities=True) |
| Summarization | summarization | pipeline("summarization") |
| Translation | translation_xx_to_yy | pipeline("translation_en_to_fr") |
| Fill-mask | fill-mask | pipeline("fill-mask", model="bert-base-uncased") |
| Zero-shot classification | zero-shot-classification | pipeline("zero-shot-classification") |
| Feature extraction | feature-extraction | pipeline("feature-extraction") |
| Image classification | image-classification | pipeline("image-classification") |
| Object detection | object-detection | pipeline("object-detection") |
| Speech recognition | automatic-speech-recognition | pipeline("automatic-speech-recognition") |
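
Some pipelines take extra arguments at call time; for example, zero-shot classification expects a candidate_labels list (the labels below are illustrative):

from transformers import pipeline

classifier = pipeline("zero-shot-classification")
result = classifier(
    "This laptop has great battery life",
    candidate_labels=["electronics", "sports", "cooking"]
)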

Common Model Names

| Model | Typical use | Checkpoint name |
| --- | --- | --- |
| BERT | understanding tasks | bert-base-uncased |
| RoBERTa | understanding tasks | roberta-base |
| DistilBERT | lightweight understanding | distilbert-base-uncased |
| GPT-2 | text generation | gpt2 |
| T5 | sequence-to-sequence | t5-small |
| BART | sequence-to-sequence | facebook/bart-base |
| ELECTRA | understanding tasks | google/electra-base-discriminator |
| ALBERT | lightweight understanding | albert-base-v2 |
| DeBERTa | understanding tasks | microsoft/deberta-base |
| LLaMA | text generation | meta-llama/Llama-2-7b-hf |
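
Each checkpoint name is passed to from_pretrained with a matching Auto class; a minimal sketch using t5-small for sequence-to-sequence tasks:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")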

Resource Links