NLP 知识速查表
本文档提供自然语言处理常用技术、API 和概念的快速参考。
常用库安装
# 基础库
pip install numpy pandas
# NLP 核心库
pip install nltk spacy jieba
# 词向量
pip install gensim
# 深度学习
pip install torch tensorflow
# Hugging Face
pip install transformers datasets tokenizers
# 评估工具
pip install seqeval sklearn-crfsuite evaluate sacrebleu
# 中文处理
pip install opencc # 繁简转换
# 高效微调
pip install peft bitsandbytes
文本预处理
分词
# NLTK 英文分词
from nltk.tokenize import word_tokenize
tokens = word_tokenize("Hello, world!")
# spaCy 分词
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world!")
tokens = [token.text for token in doc]
# jieba 中文分词
import jieba
seg_list = jieba.cut("自然语言处理", cut_all=False)
停用词
# NLTK 英文停用词
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# 中文停用词(自定义)
stop_words = {'的', '了', '在', '是', '我', '有', '和', '就'}
文本清洗
import re
# 去除 HTML 标签
clean_text = re.sub(r'<[^>]+>', '', html_text)
# 去除特殊字符
clean_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)
# 合并多余空格
clean_text = re.sub(r'\s+', ' ', text)
词向量
Word2Vec
from gensim.models import Word2Vec
# 训练
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)
# 获取词向量
vector = model.wv['词语']
# 相似度
similarity = model.wv.similarity('词1', '词2')
# 最相似词
similar_words = model.wv.most_similar('词语', topn=10)
FastText
from gensim.models import FastText
model = FastText(sentences, vector_size=100, window=5, min_n=3, max_n=6)
# 可处理未登录词
vector = model.wv['未登录词']
Hugging Face Transformers
Pipeline 快速使用
from transformers import pipeline
# 情感分析
classifier = pipeline("sentiment-analysis")
result = classifier("I love this!")
# 命名实体识别
ner = pipeline("ner", aggregation_strategy="simple")
result = ner("Apple is based in Cupertino.")
# 文本生成
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time")
# 问答
qa = pipeline("question-answering")
result = qa(question="What is NLP?", context="NLP is...")
# 翻译
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
result = translator("Hello, world!")
# 摘要
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer("Long text to summarize...")
# 填空
fill = pipeline("fill-mask", model="bert-base-chinese")
result = fill("自然语言处理是[MASK]智能的分支。")
加载模型
from transformers import AutoModel, AutoTokenizer
model_name = "bert-base-chinese"
# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# 分词
inputs = tokenizer("自然语言处理", return_tensors="pt")
# 获取输出
outputs = model(**inputs)
文本分类
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)
# 预测
inputs = tokenizer("文本内容", return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax().item()
文本生成
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("gpt2")
# 生成
inputs = tokenizer("Prompt text", return_tensors="pt")
outputs = model.generate(**inputs, max_length=100, do_sample=True, temperature=0.7)
generated_text = tokenizer.decode(outputs[0])
序列到序列任务
from transformers import AutoModelForSeq2SeqLM
# 翻译、摘要等任务
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
inputs = tokenizer("translate English to French: Hello", return_tensors="pt")
outputs = model.generate(**inputs)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
问答模型
from transformers import AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)
# 获取答案
start_idx = outputs.start_logits.argmax().item()
end_idx = outputs.end_logits.argmax().item()
answer = tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx+1])
注意力机制与 Transformer
多头注意力
import torch
import torch.nn as nn
import math
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention (Vaswani et al., 2017).

    Projects the input into `num_heads` parallel Q/K/V subspaces of size
    d_k = d_model // num_heads, attends in each head, then concatenates the
    heads and applies a final output projection.

    Args:
        d_model: total model (embedding) dimension.
        num_heads: number of attention heads; must divide d_model.

    Raises:
        ValueError: if d_model is not divisible by num_heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Self-attend over x of shape (batch, seq_len, d_model).

        Args:
            x: input tensor, shape (batch, seq_len, d_model).
            mask: optional tensor broadcastable to the attention-score shape
                (batch, num_heads, seq_len, seq_len); positions where mask == 0
                are excluded from attention.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        batch_size = x.size(0)
        # Split each projection into heads: (batch, num_heads, seq_len, d_k).
        Q = self.W_q(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Scale by sqrt(d_k) to keep softmax gradients well-conditioned.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, V)
        # Merge heads back to (batch, seq_len, d_model).
        # BUG FIX: the original called .view(batch_size, -1, -1); torch allows
        # only ONE inferred (-1) dimension per view, so forward() always raised
        # a RuntimeError. The last dimension must be given explicitly.
        output = output.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.d_k)
        return self.W_o(output)
位置编码
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Precomputes a (1, max_len, d_model) table where even feature indices use
    sine and odd indices use cosine, at geometrically spaced frequencies.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        positions = torch.arange(0, max_len).unsqueeze(1).float()
        # Frequencies 1/10000^(2i/d_model) for each even feature index 2i.
        freqs = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * freqs)
        table[:, 1::2] = torch.cos(positions * freqs)
        # Buffer: tracked by .to(device)/state_dict, but not a trainable parameter.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first x.size(1) positions to x."""
        return x + self.pe[:, :x.size(1)]
序列标注
BIO 标签
| 标签 | 含义 |
|---|---|
| B-X | 实体 X 开始 |
| I-X | 实体 X 内部 |
| O | 非实体 |
spaCy NER
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple CEO Tim Cook announced new products.")
for ent in doc.ents:
print(ent.text, ent.label_)
评估指标
from seqeval.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
文本分类
传统方法
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', LogisticRegression())
])
pipeline.fit(texts, labels)
predictions = pipeline.predict(new_texts)
深度学习评估
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names=class_names))
机器翻译
使用 Pipeline
from transformers import pipeline
# 英译中
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
result = translator("Natural language processing is fascinating.")
# 多语言翻译
translator = pipeline("translation", model="facebook/nllb-200-distilled-600M")
result = translator("Hello", src_lang="eng_Latn", tgt_lang="zho_Hans")
评估指标
import evaluate
bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=predictions, references=references)
print(f"BLEU: {results['score']}")
问答系统
抽取式问答
from transformers import pipeline
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
result = qa(question="What is AI?", context="AI is artificial intelligence...")
print(f"答案: {result['answer']}, 置信度: {result['score']}")
评估指标
import evaluate
squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=predictions, references=references)
print(f"Exact Match: {results['exact_match']}, F1: {results['f1']}")
文本摘要
抽取式摘要(TextRank)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def text_rank_summary(sentences, top_k=3):
    """Extractive summary via TextRank over TF-IDF cosine similarity.

    Args:
        sentences: list of sentence strings.
        top_k: number of sentences to keep.

    Returns:
        The top_k highest-scoring sentences, in their original document order.
        Returns all sentences if top_k >= len(sentences), [] for empty input.
    """
    if not sentences:
        return []
    n = len(sentences)
    if top_k >= n:
        return list(sentences)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    similarity = cosine_similarity(tfidf_matrix)
    # PageRank-style power iteration. FIXES vs. the original:
    # 1) Column-normalize the similarity matrix so each sentence distributes a
    #    total weight of 1 (the original divided the product by ROW sums,
    #    which is not the PageRank update).
    # 2) Adopt new_scores BEFORE breaking on convergence (the original broke
    #    first and discarded the converged values).
    # 3) Use the standard damping factor 0.85 from TextRank/PageRank.
    col_sums = similarity.sum(axis=0)
    col_sums[col_sums == 0] = 1.0  # isolated sentence: avoid division by zero
    transition = similarity / col_sums
    damping = 0.85
    scores = np.full(n, 1.0 / n)
    for _ in range(100):
        new_scores = (1 - damping) / n + damping * (transition @ scores)
        converged = np.abs(new_scores - scores).sum() < 1e-6
        scores = new_scores
        if converged:
            break
    top_indices = np.argsort(scores)[-top_k:]
    # sorted() restores document order for readability of the summary.
    return [sentences[i] for i in sorted(top_indices)]
生成式摘要
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer(long_text, max_length=130, min_length=30, do_sample=False)
评估指标(ROUGE)
import evaluate
rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)
print(f"ROUGE-1: {results['rouge1']}, ROUGE-2: {results['rouge2']}, ROUGE-L: {results['rougeL']}")
信息抽取
命名实体识别
from transformers import pipeline
ner = pipeline("ner", model="ckiplab/bert-base-chinese-ner", aggregation_strategy="simple")
entities = ner("张三在北京的清华大学学习")
for ent in entities:
print(f"{ent['word']} [{ent['entity_group']}]")
关系抽取
def extract_relation(text, entity1, entity2, model, tokenizer):
    """Classify the relation between two entities mentioned in `text`.

    The entity pair and the sentence are packed into one sequence separated
    by [SEP] markers, run through a sequence-classification model, and the
    argmax label id is mapped back to a relation name.
    """
    packed = f"{entity1} [SEP] {entity2} [SEP] {text}"
    encoded = tokenizer(packed, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**encoded).logits
    label_id = torch.argmax(logits, dim=1).item()
    # NOTE(review): id2relation is assumed to be a module-level mapping from
    # label id to relation name — it is not defined in this snippet; confirm
    # it exists at the call site.
    return id2relation[label_id]
语言模型
困惑度
import math
def perplexity(loss, n):
    """Return exp(loss / n), the perplexity of a language model.

    Assumes `loss` is the total (summed) negative log-likelihood over `n`
    tokens, so loss / n is the average per-token NLL.
    """
    avg_nll = loss / n
    return math.exp(avg_nll)
文本生成参数
| 参数 | 说明 | 推荐值 |
|---|---|---|
| temperature | 控制随机性 | 0.7-1.0 |
| top_k | 保留前 k 个候选 | 50 |
| top_p | 核采样阈值 | 0.9-0.95 |
| max_length | 最大生成长度 | 根据任务 |
| do_sample | 是否采样 | True |
| num_beams | 束搜索大小 | 4-5 |
常见中文模型
| 模型 | 用途 | Hugging Face ID |
|---|---|---|
| BERT 中文 | 通用理解 | bert-base-chinese |
| RoBERTa 中文 | 通用理解 | hfl/chinese-roberta-wwm-ext |
| GPT2 中文 | 文本生成 | uer/gpt2-chinese-cluecorpussmall |
| BART 中文 | 序列到序列 | fnlp/bart-base-chinese |
| T5 中文 | 序列到序列 | Langboat/Mengzi-T5-base |
| Qwen | 对话生成 | Qwen/Qwen2-1.5B-Instruct |
常见任务模型选择
| 任务 | 推荐模型架构 | 示例 |
|---|---|---|
| 文本分类 | BERT、RoBERTa | BERT + 分类头 |
| 序列标注 | BERT + CRF | BERT-CRF |
| 文本生成 | GPT、BART | GPT-2 |
| 机器翻译 | T5、mBART、NLLB | T5 |
| 问答 | BERT | BERT for QA |
| 摘要生成 | BART、T5 | BART |
| 信息抽取 | BERT | BERT + NER头 |
训练参数推荐
学习率
| 模型类型 | 推荐学习率 |
|---|---|
| BERT 微调 | 2e-5 ~ 5e-5 |
| GPT 微调 | 1e-5 ~ 5e-5 |
| 从头训练 | 1e-4 ~ 1e-3 |
批次大小
| 场景 | 推荐批次 |
|---|---|
| 微调 | 16-32 |
| 预训练 | 256-512 |
| 大模型 | 1-4(配合梯度累积) |
常见问题解决
CUDA 内存不足
# 减小批次大小
batch_size = 4
# 使用梯度累积
gradient_accumulation_steps = 4
# 使用混合精度
from torch.cuda.amp import autocast
# 使用 4-bit 量化
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
处理长文本
# 截断
inputs = tokenizer(text, max_length=512, truncation=True)
# 滑动窗口
inputs = tokenizer(text, max_length=512, stride=128, return_overflowing_tokens=True)
# 分块处理
chunks = [text[i:i+512] for i in range(0, len(text), 512)]
未登录词处理
# 使用 FastText
from gensim.models import FastText
model = FastText(sentences)
vector = model.wv['未登录词']
# 使用子词分词
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
性能优化
模型量化
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
LoRA 微调
from peft import LoraConfig, get_peft_model, TaskType
config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8,
lora_alpha=32,
lora_dropout=0.1,
target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
常用评估指标
| 任务 | 指标 |
|---|---|
| 分类 | Accuracy、F1、Precision、Recall |
| 序列标注 | Entity-level F1 |
| 生成 | BLEU、ROUGE |
| 语言模型 | Perplexity |
| 问答 | Exact Match、F1 |
| 翻译 | BLEU、COMET |
| 摘要 | ROUGE-1/2/L |
资源链接
- Hugging Face 文档: https://huggingface.co/docs
- Hugging Face 课程: https://huggingface.co/learn
- spaCy 文档: https://spacy.io/api/doc
- NLTK 文档: https://www.nltk.org/
- jieba 文档: https://github.com/fxsjy/jieba
- Gensim 文档: https://radimrehurek.com/gensim/
- PyTorch 文档: https://pytorch.org/docs/