NLP 知识速查表

本文档提供自然语言处理常用技术、API 和概念的快速参考。

常用库安装

# 基础库
pip install numpy pandas

# NLP 核心库
pip install nltk spacy jieba

# 词向量
pip install gensim

# 深度学习
pip install torch tensorflow

# Hugging Face
pip install transformers datasets tokenizers

# Evaluation tools (rouge_score is needed by evaluate.load("rouge") used in the ROUGE section)
pip install seqeval sklearn-crfsuite evaluate sacrebleu rouge_score

# 中文处理
pip install opencc  # 繁简转换

# 高效微调
pip install peft bitsandbytes

文本预处理

分词

# NLTK 英文分词
from nltk.tokenize import word_tokenize
tokens = word_tokenize("Hello, world!")

# spaCy 分词
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world!")
tokens = [token.text for token in doc]

# jieba 中文分词
import jieba
seg_list = jieba.cut("自然语言处理", cut_all=False)

停用词

# NLTK 英文停用词
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# 中文停用词（自定义）
stop_words = {'的', '了', '在', '是', '我', '有', '和', '就'}

文本清洗

import re

# 去除 HTML 标签
clean_text = re.sub(r'<[^>]+>', '', html_text)

# 去除特殊字符
clean_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)

# 合并多余空格
clean_text = re.sub(r'\s+', ' ', text)

词向量

Word2Vec

from gensim.models import Word2Vec

# 训练
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

# 获取词向量
vector = model.wv['词语']

# 相似度
similarity = model.wv.similarity('词1', '词2')

# 最相似词
similar_words = model.wv.most_similar('词语', topn=10)

FastText

from gensim.models import FastText

model = FastText(sentences, vector_size=100, window=5, min_n=3, max_n=6)

# 可处理未登录词
vector = model.wv['未登录词']

Hugging Face Transformers

Pipeline 快速使用

from transformers import pipeline

# 情感分析
classifier = pipeline("sentiment-analysis")
result = classifier("I love this!")

# 命名实体识别
ner = pipeline("ner", aggregation_strategy="simple")
result = ner("Apple is based in Cupertino.")

# 文本生成
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time")

# 问答
qa = pipeline("question-answering")
result = qa(question="What is NLP?", context="NLP is...")

# 翻译
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
result = translator("Hello, world!")

# 摘要
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer("Long text to summarize...")

# 填空
fill = pipeline("fill-mask", model="bert-base-chinese")
result = fill("自然语言处理是[MASK]智能的分支。")

加载模型

from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-chinese"

# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# 分词
inputs = tokenizer("自然语言处理", return_tensors="pt")

# 获取输出
outputs = model(**inputs)

文本分类

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

# 预测
inputs = tokenizer("文本内容", return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax().item()

文本生成

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer from the same checkpoint as the model; a tokenizer from a
# different checkpoint (e.g. bert-base-chinese) produces ids GPT-2 cannot use.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Generate
inputs = tokenizer("Prompt text", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_length=100,
    do_sample=True,  # temperature is only applied when sampling is enabled
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no pad token
)
generated_text = tokenizer.decode(outputs[0])

序列到序列任务

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Translation, summarization and similar tasks.
# The tokenizer is loaded from the same checkpoint as the model so that the
# vocabulary and special tokens match.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

inputs = tokenizer("translate English to French: Hello", return_tensors="pt")
outputs = model.generate(**inputs)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

问答模型

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# The tokenizer is loaded from the same checkpoint as the model so that the
# vocabulary and special tokens match.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

# `question` and `context` are strings supplied by the caller
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

# Extract the answer span: most likely start and end token positions
start_idx = outputs.start_logits.argmax().item()
end_idx = outputs.end_logits.argmax().item()
answer = tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx+1])

注意力机制与 Transformer

多头注意力

import torch
import torch.nn as nn
import math

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input.

    Args:
        d_model: Size of the input and output feature dimension.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, seq_len, seq_len)``; positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        batch_size = x.size(0)

        # Project, then split the feature dimension into heads:
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)
        Q = self.W_q(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Scale by sqrt(d_k) before the softmax
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, V)

        # Merge heads back: (batch, num_heads, seq_len, d_k) -> (batch, seq_len, d_model).
        # view() can infer only one dimension, so the feature size is given explicitly.
        output = output.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.d_k
        )
        return self.W_o(output)

位置编码

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Registered as a buffer: stored with the module but not a parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.
        """
        return x + self.pe[:, :x.size(1)]

序列标注

BIO 标签

标签	含义
B-X	实体 X 开始
I-X	实体 X 内部
O	非实体

spaCy NER

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple CEO Tim Cook announced new products.")

for ent in doc.ents:
    print(ent.text, ent.label_)

评估指标

from seqeval.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

文本分类

传统方法

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

pipeline.fit(texts, labels)
predictions = pipeline.predict(new_texts)

深度学习评估

from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, target_names=class_names))

机器翻译

使用 Pipeline

from transformers import pipeline

# 英译中
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
result = translator("Natural language processing is fascinating.")

# 多语言翻译
translator = pipeline("translation", model="facebook/nllb-200-distilled-600M")
result = translator("Hello", src_lang="eng_Latn", tgt_lang="zho_Hans")

评估指标

import evaluate

bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=predictions, references=references)
print(f"BLEU: {results['score']}")

问答系统

抽取式问答

from transformers import pipeline

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
result = qa(question="What is AI?", context="AI is artificial intelligence...")
print(f"答案: {result['answer']}, 置信度: {result['score']}")

评估指标

import evaluate

squad_metric = evaluate.load("squad")
results = squad_metric.compute(predictions=predictions, references=references)
print(f"Exact Match: {results['exact_match']}, F1: {results['f1']}")

文本摘要

抽取式摘要（TextRank）

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def text_rank_summary(sentences, top_k=3, damping=0.85, max_iter=100, tol=1e-6):
    """Pick the ``top_k`` most central sentences with TextRank.

    Sentences are embedded with TF-IDF, connected by cosine similarity and
    ranked with the damped, weighted PageRank iteration.

    Args:
        sentences: Sequence of sentence strings. TF-IDF uses its default
            tokenizer, so Chinese text should be word-segmented and
            space-joined beforehand.
        top_k: Number of sentences to keep.
        damping: PageRank damping factor.
        max_iter: Maximum number of power-iteration steps.
        tol: L1 change in the score vector below which iteration stops.

    Returns:
        The selected sentences in their original order. Empty when
        ``sentences`` is empty or ``top_k`` is not positive; all sentences
        when there are no more than ``top_k`` of them.
    """
    sentences = list(sentences)
    if not sentences or top_k <= 0:
        return []
    n = len(sentences)
    if n <= top_k:
        # Nothing to rank: every sentence is selected
        return sentences

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    similarity = cosine_similarity(tfidf_matrix)
    # A sentence should not vote for itself
    np.fill_diagonal(similarity, 0.0)

    # Row-normalise into transition probabilities; a sentence with no
    # similar neighbours spreads its weight uniformly instead of dividing by 0.
    out_weight = similarity.sum(axis=1, keepdims=True)
    transition = np.divide(
        similarity,
        out_weight,
        out=np.full_like(similarity, 1.0 / n),
        where=out_weight > 0,
    )

    scores = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        # Each sentence collects score from the sentences pointing at it
        # (transition.T); multiplying by the row-stochastic matrix itself
        # would leave a uniform vector unchanged.
        new_scores = (1.0 - damping) / n + damping * (transition.T @ scores)
        converged = np.abs(new_scores - scores).sum() < tol
        scores = new_scores
        if converged:
            break

    top_indices = np.argsort(scores)[-top_k:]
    return [sentences[i] for i in sorted(top_indices)]

生成式摘要

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer(long_text, max_length=130, min_length=30, do_sample=False)

评估指标（ROUGE）

import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)
print(f"ROUGE-1: {results['rouge1']}, ROUGE-2: {results['rouge2']}, ROUGE-L: {results['rougeL']}")

信息抽取

命名实体识别

from transformers import pipeline

ner = pipeline("ner", model="ckiplab/bert-base-chinese-ner", aggregation_strategy="simple")
entities = ner("张三在北京的清华大学学习")

for ent in entities:
    print(f"{ent['word']} [{ent['entity_group']}]")

关系抽取

def extract_relation(text, entity1, entity2, model, tokenizer):
    """Predict the relation between two entities mentioned in ``text``.

    The entities and the text are joined with ``[SEP]`` markers into a single
    string, tokenized with truncation, and passed to ``model``. The index of
    the largest logit along dim 1 is looked up in the module-level
    ``id2relation`` mapping, which must be defined by the surrounding code.
    """
    encoded = tokenizer(
        f"{entity1} [SEP] {entity2} [SEP] {text}",
        return_tensors="pt",
        truncation=True,
    )

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**encoded).logits

    label_id = torch.argmax(logits, dim=1).item()
    return id2relation[label_id]

语言模型

困惑度

import math

def perplexity(loss, n):
    """Return ``exp(loss / n)``.

    This equals the perplexity when ``loss`` is the total negative
    log-likelihood (natural log) accumulated over ``n`` tokens.
    """
    mean_loss = loss / n
    return math.exp(mean_loss)

文本生成参数

参数	说明	推荐值
temperature	控制随机性	0.7-1.0
top_k	保留前 k 个候选	50
top_p	核采样阈值	0.9-0.95
max_length	最大生成长度	根据任务
do_sample	是否采样	True
num_beams	束搜索大小	4-5

常见中文模型

模型	用途	Hugging Face ID
BERT 中文	通用理解	bert-base-chinese
RoBERTa 中文	通用理解	hfl/chinese-roberta-wwm-ext
GPT2 中文	文本生成	uer/gpt2-chinese-cluecorpussmall
BART 中文	序列到序列	fnlp/bart-base-chinese
T5 中文	序列到序列	Langboat/Mengzi-T5-base
Qwen	对话生成	Qwen/Qwen2-1.5B-Instruct

常见任务模型选择

任务	推荐模型架构	示例
文本分类	BERT、RoBERTa	BERT + 分类头
序列标注	BERT + CRF	BERT-CRF
文本生成	GPT、BART	GPT-2
机器翻译	T5、mBART、NLLB	T5
问答	BERT	BERT for QA
摘要生成	BART、T5	BART
信息抽取	BERT	BERT + NER头

训练参数推荐

学习率

模型类型	推荐学习率
BERT 微调	2e-5 ~ 5e-5
GPT 微调	1e-5 ~ 5e-5
从头训练	1e-4 ~ 1e-3

批次大小

场景	推荐批次
微调	16-32
预训练	256-512
大模型	1-4（配合梯度累积）

常见问题解决

CUDA 内存不足

# 减小批次大小
batch_size = 4

# 使用梯度累积
gradient_accumulation_steps = 4

# 使用混合精度
from torch.cuda.amp import autocast

# 使用 4-bit 量化
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

处理长文本

# 截断
inputs = tokenizer(text, max_length=512, truncation=True)

# 滑动窗口
inputs = tokenizer(text, max_length=512, stride=128, return_overflowing_tokens=True)

# 分块处理
chunks = [text[i:i+512] for i in range(0, len(text), 512)]

未登录词处理

# 使用 FastText
from gensim.models import FastText
model = FastText(sentences)
vector = model.wv['未登录词']

# 使用子词分词
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

性能优化

模型量化

from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

LoRA 微调

from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

常用评估指标

任务	指标
分类	Accuracy、F1、Precision、Recall
序列标注	Entity-level F1
生成	BLEU、ROUGE
语言模型	Perplexity
问答	Exact Match、F1
翻译	BLEU、COMET
摘要	ROUGE-1/2/L

资源链接

Hugging Face 文档: https://huggingface.co/docs
Hugging Face 课程: https://huggingface.co/learn
spaCy 文档: https://spacy.io/api
NLTK 文档: https://www.nltk.org/
jieba 文档: https://github.com/fxsjy/jieba
Gensim 文档: https://radimrehurek.com/gensim/
PyTorch 文档: https://pytorch.org/docs/

常用库安装​

文本预处理​

分词​

停用词​

文本清洗​

词向量​

Word2Vec​

FastText​

Hugging Face Transformers​

Pipeline 快速使用​

加载模型​

文本分类​

文本生成​

序列到序列任务​

问答模型​

注意力机制与 Transformer​

多头注意力​

位置编码​

序列标注​

BIO 标签​

spaCy NER​

评估指标​

文本分类​

传统方法​

深度学习评估​

机器翻译​

使用 Pipeline​

评估指标​

问答系统​

抽取式问答​

评估指标​

文本摘要​

抽取式摘要（TextRank）​

生成式摘要​

评估指标（ROUGE）​

信息抽取​

命名实体识别​

关系抽取​

语言模型​

困惑度​

文本生成参数​

常见中文模型​

常见任务模型选择​

训练参数推荐​

学习率​

批次大小​

常见问题解决​

CUDA 内存不足​

处理长文本​

未登录词处理​

性能优化​

模型量化​

LoRA 微调​

常用评估指标​

资源链接​

常用库安装

文本预处理

分词

停用词

文本清洗

词向量

Word2Vec

FastText

Hugging Face Transformers

Pipeline 快速使用

加载模型

文本分类

文本生成

序列到序列任务

问答模型

注意力机制与 Transformer

多头注意力

位置编码

序列标注

BIO 标签

spaCy NER

评估指标

文本分类

传统方法

深度学习评估

机器翻译

使用 Pipeline

评估指标

问答系统

抽取式问答

评估指标

文本摘要

抽取式摘要（TextRank）

生成式摘要

评估指标（ROUGE）

信息抽取

命名实体识别

关系抽取

语言模型

困惑度

文本生成参数

常见中文模型

常见任务模型选择

训练参数推荐

学习率

批次大小

常见问题解决

CUDA 内存不足

处理长文本

未登录词处理

性能优化

模型量化

LoRA 微调

常用评估指标

资源链接