跳到主要内容

NLP 知识速查表

本文档提供自然语言处理常用技术、API 和概念的快速参考。

常用库安装

# 基础库
pip install numpy pandas

# NLP 核心库
pip install nltk spacy jieba

# 词向量
pip install gensim

# 深度学习
pip install torch tensorflow

# Hugging Face
pip install transformers datasets tokenizers

# 评估工具
pip install seqeval sklearn-crfsuite evaluate sacrebleu

# 中文处理
pip install opencc # 繁简转换

# 高效微调
pip install peft bitsandbytes

文本预处理

分词

# NLTK 英文分词
from nltk.tokenize import word_tokenize
tokens = word_tokenize("Hello, world!")

# spaCy 分词
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world!")
tokens = [token.text for token in doc]

# jieba 中文分词
import jieba
seg_list = jieba.cut("自然语言处理", cut_all=False)

停用词

# NLTK 英文停用词
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# 中文停用词(自定义)
stop_words = {'的', '了', '在', '是', '我', '有', '和', '就'}

文本清洗

import re

# 去除 HTML 标签
clean_text = re.sub(r'<[^>]+>', '', html_text)

# 去除特殊字符
clean_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)

# 合并多余空格
clean_text = re.sub(r'\s+', ' ', text)

词向量

Word2Vec

from gensim.models import Word2Vec

# 训练
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

# 获取词向量
vector = model.wv['词语']

# 相似度
similarity = model.wv.similarity('词1', '词2')

# 最相似词
similar_words = model.wv.most_similar('词语', topn=10)

FastText

from gensim.models import FastText

model = FastText(sentences, vector_size=100, window=5, min_n=3, max_n=6)

# 可处理未登录词
vector = model.wv['未登录词']

Hugging Face Transformers

Pipeline 快速使用

from transformers import pipeline

# 情感分析
classifier = pipeline("sentiment-analysis")
result = classifier("I love this!")

# 命名实体识别
ner = pipeline("ner", aggregation_strategy="simple")
result = ner("Apple is based in Cupertino.")

# 文本生成
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time")

# 问答
qa = pipeline("question-answering")
result = qa(question="What is NLP?", context="NLP is...")

# 翻译
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
result = translator("Hello, world!")

# 摘要
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer("Long text to summarize...")

# 填空
fill = pipeline("fill-mask", model="bert-base-chinese")
result = fill("自然语言处理是[MASK]智能的分支。")

加载模型

from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-chinese"

# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# 分词
inputs = tokenizer("自然语言处理", return_tensors="pt")

# 获取输出
outputs = model(**inputs)

文本分类

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

# 预测
inputs = tokenizer("文本内容", return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax().item()

文本生成

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")

# 生成
inputs = tokenizer("Prompt text", return_tensors="pt")
outputs = model.generate(**inputs, max_length=100, temperature=0.7)
generated_text = tokenizer.decode(outputs[0])

序列到序列任务

from transformers import AutoModelForSeq2SeqLM

# 翻译、摘要等任务
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

inputs = tokenizer("translate English to French: Hello", return_tensors="pt")
outputs = model.generate(**inputs)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

问答模型

from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

# 获取答案
start_idx = outputs.start_logits.argmax().item()
end_idx = outputs.end_logits.argmax().item()
answer = tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx+1])

注意力机制与 Transformer

多头注意力

import torch
import torch.nn as nn
import math

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: total embedding dimension; must be divisible by num_heads.
        num_heads: number of parallel attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")
        self.d_k = d_model // num_heads  # per-head dimension
        self.num_heads = num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to x of shape (batch, seq_len, d_model).

        mask: optional tensor broadcastable to (batch, heads, seq, seq);
        positions where mask == 0 are excluded from attention.
        Returns a tensor of the same shape as x.
        """
        batch_size = x.size(0)

        # Project, then split into heads: (batch, heads, seq, d_k).
        Q = self.W_q(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product scores: (batch, heads, seq, seq).
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, V)

        # Merge heads back to (batch, seq, d_model).
        # BUG FIX: original used .view(batch_size, -1, -1); PyTorch allows
        # only a single inferred (-1) dimension per view call, so that line
        # raises at runtime. The merged width is num_heads * d_k == d_model.
        output = output.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.d_k)
        return self.W_o(output)

位置编码

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding added to token embeddings.

    Precomputes a (1, max_len, d_model) table where even columns hold
    sin(pos * freq) and odd columns cos(pos * freq), with frequencies
    decaying geometrically from 1 to 1/10000 (Vaswani et al., 2017).
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        positions = torch.arange(0, max_len).unsqueeze(1).float()
        freqs = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * freqs)
        table[:, 1::2] = torch.cos(positions * freqs)
        # register_buffer: saved/moved with the module, not a trainable parameter.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Return x plus the first seq_len rows of the encoding table.

        x: (batch, seq_len, d_model); seq_len must not exceed max_len.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

序列标注

BIO 标签

| 标签 | 含义 |
| --- | --- |
| B-X | 实体 X 开始 |
| I-X | 实体 X 内部 |
| O | 非实体 |

spaCy NER

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple CEO Tim Cook announced new products.")

for ent in doc.ents:
print(ent.text, ent.label_)

评估指标

from seqeval.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

文本分类

传统方法

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', LogisticRegression())
])

pipeline.fit(texts, labels)
predictions = pipeline.predict(new_texts)

深度学习评估

from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, target_names=class_names))

机器翻译

使用 Pipeline

from transformers import pipeline

# 英译中
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
result = translator("Natural language processing is fascinating.")

# 多语言翻译
translator = pipeline("translation", model="facebook/nllb-200-distilled-600M")
result = translator("Hello", src_lang="eng_Latn", tgt_lang="zho_Hans")

评估指标

import evaluate

bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=predictions, references=references)
print(f"BLEU: {results['score']}")

问答系统

抽取式问答

from transformers import pipeline

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
result = qa(question="What is AI?", context="AI is artificial intelligence...")
print(f"答案: {result['answer']}, 置信度: {result['score']}")

评估指标

import evaluate

squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=predictions, references=references)
print(f"Exact Match: {results['exact_match']}, F1: {results['f1']}")

文本摘要

抽取式摘要(TextRank)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def text_rank_summary(sentences, top_k=3):
    """Extractive summary: return the top_k highest-ranked sentences.

    Sentences are scored by power iteration over a row-normalized TF-IDF
    cosine-similarity graph (a simplified TextRank, without damping),
    stopping early once scores converge. Selected sentences are returned
    in their original document order.
    """
    tfidf = TfidfVectorizer().fit_transform(sentences)
    sim_matrix = cosine_similarity(tfidf)

    n = len(sentences)
    ranks = np.full(n, 1.0 / n)  # uniform initial score
    for _ in range(100):  # iteration cap in case convergence stalls
        updated = sim_matrix @ ranks / sim_matrix.sum(axis=1)
        if np.abs(updated - ranks).sum() < 1e-6:
            break
        ranks = updated

    chosen = sorted(np.argsort(ranks)[-top_k:])  # restore document order
    return [sentences[i] for i in chosen]

生成式摘要

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer(long_text, max_length=130, min_length=30, do_sample=False)

评估指标(ROUGE)

import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)
print(f"ROUGE-1: {results['rouge1']}, ROUGE-2: {results['rouge2']}, ROUGE-L: {results['rougeL']}")

信息抽取

命名实体识别

from transformers import pipeline

ner = pipeline("ner", model="ckiplab/bert-base-chinese-ner", aggregation_strategy="simple")
entities = ner("张三在北京的清华大学学习")

for ent in entities:
print(f"{ent['word']} [{ent['entity_group']}]")

关系抽取

def extract_relation(text, entity1, entity2, model, tokenizer):
    """Classify the relation between entity1 and entity2 within text.

    The entity pair and sentence are packed into one "[SEP]"-joined input,
    scored by the sequence classifier, and the argmax label id is mapped
    through the module-level id2relation table.
    """
    packed = " [SEP] ".join([entity1, entity2, text])
    encoded = tokenizer(packed, return_tensors="pt", truncation=True)

    with torch.no_grad():  # inference only — skip autograd bookkeeping
        logits = model(**encoded).logits

    label_id = logits.argmax(dim=1).item()
    return id2relation[label_id]

语言模型

困惑度

import math

def perplexity(loss, n):
    """Return exp(loss / n): perplexity from a summed NLL loss over n tokens."""
    average_nll = loss / n
    return math.exp(average_nll)

文本生成参数

| 参数 | 说明 | 推荐值 |
| --- | --- | --- |
| temperature | 控制随机性 | 0.7-1.0 |
| top_k | 保留前 k 个候选 | 50 |
| top_p | 核采样阈值 | 0.9-0.95 |
| max_length | 最大生成长度 | 根据任务 |
| do_sample | 是否采样 | True |
| num_beams | 束搜索大小 | 4-5 |

常见中文模型

| 模型 | 用途 | Hugging Face ID |
| --- | --- | --- |
| BERT 中文 | 通用理解 | bert-base-chinese |
| RoBERTa 中文 | 通用理解 | hfl/chinese-roberta-wwm-ext |
| GPT2 中文 | 文本生成 | uer/gpt2-chinese-cluecorpussmall |
| BART 中文 | 序列到序列 | fnlp/bart-base-chinese |
| T5 中文 | 序列到序列 | Langboat/Mengzi-T5-base |
| Qwen | 对话生成 | Qwen/Qwen2-1.5B-Instruct |

常见任务模型选择

| 任务 | 推荐模型 | 架构示例 |
| --- | --- | --- |
| 文本分类 | BERT、RoBERTa | BERT + 分类头 |
| 序列标注 | BERT + CRF | BERT-CRF |
| 文本生成 | GPT、BART | GPT-2 |
| 机器翻译 | T5、mBART、NLLB | T5 |
| 问答 | BERT | BERT for QA |
| 摘要生成 | BART、T5 | BART |
| 信息抽取 | BERT | BERT + NER 头 |

训练参数推荐

学习率

| 模型类型 | 推荐学习率 |
| --- | --- |
| BERT 微调 | 2e-5 ~ 5e-5 |
| GPT 微调 | 1e-5 ~ 5e-5 |
| 从头训练 | 1e-4 ~ 1e-3 |

批次大小

| 场景 | 推荐批次 |
| --- | --- |
| 微调 | 16-32 |
| 预训练 | 256-512 |
| 大模型 | 1-4(配合梯度累积) |

常见问题解决

CUDA 内存不足

# 减小批次大小
batch_size = 4

# 使用梯度累积
gradient_accumulation_steps = 4

# 使用混合精度
from torch.cuda.amp import autocast

# 使用 4-bit 量化
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

处理长文本

# 截断
inputs = tokenizer(text, max_length=512, truncation=True)

# 滑动窗口
inputs = tokenizer(text, max_length=512, stride=128, return_overflowing_tokens=True)

# 分块处理
chunks = [text[i:i+512] for i in range(0, len(text), 512)]

未登录词处理

# 使用 FastText
from gensim.models import FastText
model = FastText(sentences)
vector = model.wv['未登录词']

# 使用子词分词
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

性能优化

模型量化

from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

LoRA 微调

from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8,
lora_alpha=32,
lora_dropout=0.1,
target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

常用评估指标

| 任务 | 指标 |
| --- | --- |
| 分类 | Accuracy、F1、Precision、Recall |
| 序列标注 | Entity-level F1 |
| 生成 | BLEU、ROUGE |
| 语言模型 | Perplexity |
| 问答 | Exact Match、F1 |
| 翻译 | BLEU、COMET |
| 摘要 | ROUGE-1/2/L |

资源链接