语言模型
语言模型(Language Model)是自然语言处理的核心技术之一,用于计算文本序列的概率或预测下一个词。语言模型的发展经历了从统计方法到神经网络方法的演进。
什么是语言模型
语言模型的核心任务是估计文本序列的概率。给定一个词序列 $w_1, w_2, \ldots, w_T$,语言模型计算该序列出现的概率 $P(w_1, w_2, \ldots, w_T)$:
根据链式法则,可以分解为:$P(w_1, w_2, \ldots, w_T) = \prod_{t=1}^{T} P(w_t \mid w_1, \ldots, w_{t-1})$
语言模型的核心应用是预测下一个词:给定前面的词,预测下一个词的概率分布。
N-gram 语言模型
N-gram 是最经典的统计语言模型,基于马尔可夫假设:当前词只依赖于前面 n-1 个词。
原理
对于 n-gram 模型,用马尔可夫假设近似条件概率:$P(w_t \mid w_1, \ldots, w_{t-1}) \approx P(w_t \mid w_{t-n+1}, \ldots, w_{t-1})$
常见的 n-gram 模型:
- Unigram(1-gram):不考虑上下文,$P(w_t)$
- Bigram(2-gram):只考虑前一个词,$P(w_t \mid w_{t-1})$
- Trigram(3-gram):考虑前两个词,$P(w_t \mid w_{t-2}, w_{t-1})$
实现
from collections import defaultdict
import numpy as np
class NGramLM:
    """Count-based n-gram language model with maximum-likelihood estimates.

    Each training sentence is padded with (n-1) '<s>' start symbols and a
    single '</s>' end symbol before counting.
    """

    def __init__(self, n=2):
        self.n = n
        self.ngram_counts = defaultdict(int)    # full n-gram counts
        self.context_counts = defaultdict(int)  # (n-1)-gram context counts
        self.vocab = set()

    def train(self, corpus):
        """Accumulate n-gram counts from *corpus* (an iterable of token lists)."""
        for sentence in corpus:
            tokens = ['<s>'] * (self.n - 1) + sentence + ['</s>']
            self.vocab.update(sentence)
            for i in range(self.n - 1, len(tokens)):
                ngram = tuple(tokens[i - self.n + 1:i + 1])
                context = tuple(tokens[i - self.n + 1:i])
                self.ngram_counts[ngram] += 1
                self.context_counts[context] += 1

    def probability(self, word, context):
        """Return the MLE conditional probability P(word | context).

        Falls back to a uniform distribution over the vocabulary for an
        unseen context. Returns 0.0 for an untrained model (empty vocab),
        which previously raised ZeroDivisionError.
        """
        ngram = context + (word,)
        context_count = self.context_counts.get(context, 0)
        if context_count == 0:
            # Unseen context: uniform fallback; guard the empty-vocab case.
            return 1 / len(self.vocab) if self.vocab else 0.0
        return self.ngram_counts.get(ngram, 0) / context_count

    def predict_next(self, context, top_k=5):
        """Return the *top_k* most probable next words as (word, prob) pairs.

        Bug fix: for a unigram model (n=1) the lookup context must be the
        empty tuple. The old slice ``context[-(self.n - 1):]`` evaluates to
        ``context[-0:]`` — the WHOLE context — which never matched the empty
        context used during training and silently degraded to the uniform
        fallback.
        """
        context = tuple(context[-(self.n - 1):]) if self.n > 1 else ()
        probs = {word: self.probability(word, context) for word in self.vocab}
        # Highest-probability candidates first.
        ranked = sorted(probs.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_k]

    def sentence_probability(self, sentence):
        """Return P(sentence) as a product of conditional probabilities.

        NOTE: computed in probability space, so long sentences can underflow
        to 0.0; prefer log-probabilities for serious scoring.
        """
        tokens = ['<s>'] * (self.n - 1) + sentence + ['</s>']
        prob = 1.0
        for i in range(self.n - 1, len(tokens)):
            context = tuple(tokens[i - self.n + 1:i])
            prob *= self.probability(tokens[i], context)
        return prob
# Training example: fit a bigram model on a tiny toy corpus
corpus = [
    ['我', '喜欢', '学习', '自然语言处理'],
    ['我', '喜欢', '学习', '机器学习'],
    ['自然语言处理', '是', '人工智能', '的', '分支'],
    ['机器学习', '是', '人工智能', '的', '重要', '技术']
]
lm = NGramLM(n=2)
lm.train(corpus)
# Predict the next word; a bigram model only uses the last context word
context = ['我', '喜欢']
predictions = lm.predict_next(context)
print(f"给定 {' '.join(context)},预测下一个词:")
for word, prob in predictions:
    print(f"  {word}: {prob:.4f}")
平滑技术
N-gram 模型面临数据稀疏问题:很多 n-gram 组合在训练语料中未出现。平滑技术用于处理未登录的 n-gram。
class NGramLMWithSmoothing(NGramLM):
    """N-gram model with additive (Laplace / add-k) smoothing."""

    def __init__(self, n=2, alpha=1.0):
        super().__init__(n)
        self.alpha = alpha  # additive smoothing constant (1.0 = Laplace)

    def probability(self, word, context):
        """Return the add-alpha smoothed probability P(word | context).

        P = (count(ngram) + alpha) / (count(context) + alpha * |V|)
        """
        seen = self.ngram_counts.get(context + (word,), 0)
        total = self.context_counts.get(context, 0)
        vocab_size = len(self.vocab)
        return (seen + self.alpha) / (total + self.alpha * vocab_size)
常见的平滑方法:
| 方法 | 公式 | 特点 |
|---|---|---|
| Laplace 平滑 | $P(w \mid c) = \frac{\mathrm{count}(c, w) + 1}{\mathrm{count}(c) + V}$ | 简单,但概率质量分配不均 |
| Add-k 平滑 | $P(w \mid c) = \frac{\mathrm{count}(c, w) + k}{\mathrm{count}(c) + kV}$ | 可调节参数 k |
| Good-Turing | 用低频计数估计未见事件 | 理论基础好 |
| Kneser-Ney | 回退 + 折扣 | 效果最好 |
N-gram 的局限性
维度灾难:n 增大时,参数量指数增长。
长距离依赖:无法捕捉长距离的词依赖关系。
泛化能力差:无法利用词之间的语义相似性。
神经语言模型
神经语言模型使用神经网络来建模语言,能够克服 n-gram 的局限性。
前馈神经网络语言模型
最早由 Bengio 等人于 2003 年提出,使用词嵌入和前馈网络。
import torch
import torch.nn as nn
class FNNLanguageModel(nn.Module):
    """Feed-forward neural LM (Bengio et al., 2003): a fixed-size window of
    context words is embedded, concatenated, and mapped to vocab logits."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, context_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(context_size * embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Map context ids (batch, context_size) to logits (batch, vocab_size)."""
        batch = x.size(0)
        # Embed each context word, then concatenate along the feature axis.
        flat = self.embedding(x).reshape(batch, -1)
        return self.fc2(torch.relu(self.fc1(flat)))
# Usage example
vocab_size = 10000
embed_dim = 128
hidden_dim = 256
context_size = 3  # predict from the previous 3 words
model = FNNLanguageModel(vocab_size, embed_dim, hidden_dim, context_size)
print(model)
循环神经网络语言模型
RNN 能够处理变长序列,捕捉长距离依赖。
class RNNLanguageModel(nn.Module):
    """Vanilla RNN language model: embed -> RNN -> per-step vocab logits."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.hidden_dim = hidden_dim

    def forward(self, x, hidden=None):
        """Map ids (batch, seq_len) to (logits (batch, seq_len, vocab), hidden)."""
        states, hidden = self.rnn(self.embedding(x), hidden)
        return self.fc(states), hidden
# An improved variant that swaps the plain RNN for an LSTM
class LSTMLanguageModel(nn.Module):
    """LSTM language model with inter-layer dropout."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Map ids (batch, seq_len) to (logits (batch, seq_len, vocab), (h, c))."""
        states, hidden = self.lstm(self.embedding(x), hidden)
        return self.fc(states), hidden
# Usage example
vocab_size = 10000
model = LSTMLanguageModel(vocab_size, embed_dim=256, hidden_dim=512)
print(model)
训练神经语言模型
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """Sliding-window LM dataset.

    Each item is (input ids, target ids) where the targets are the inputs
    shifted one position to the right; windows advance by one token.
    Unknown words map to vocab['<UNK>'].
    """

    def __init__(self, texts, vocab, seq_length=20):
        self.vocab = vocab
        self.seq_length = seq_length
        unk = vocab['<UNK>']
        self.data = []
        for text in texts:
            ids = [vocab.get(tok, unk) for tok in text]
            self.data.extend(
                (ids[start:start + seq_length],
                 ids[start + 1:start + seq_length + 1])
                for start in range(len(ids) - seq_length)
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs, targets = self.data[idx]
        return torch.tensor(inputs), torch.tensor(targets)
def train_model(model, dataloader, epochs=10, lr=0.001):
    """Train a next-token language model with Adam and cross-entropy loss.

    *model* must return (logits, hidden) for a (batch, seq_len) id tensor;
    *dataloader* yields (inputs, targets) pairs of matching shape.
    Prints the mean batch loss after every epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()
    for epoch in range(epochs):
        running = 0.0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            logits, _ = model(inputs)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
            loss = criterion(logits.view(-1, logits.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()
            running += loss.item()
        avg_loss = running / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
# Text generation by ancestral sampling
def generate_text(model, vocab, start_tokens, max_length=50):
    """Sample a continuation of *start_tokens* from a recurrent LM.

    At each step the distribution over the next token is the softmax of the
    logits at the last position; sampling stops early on '<EOS>'.
    NOTE(review): the model is re-fed the last 20 tokens every step while
    *hidden* is also carried across steps, so recent context is effectively
    seen twice — kept as-is to preserve the original behavior.
    """
    model.eval()
    idx_to_word = {idx: word for word, idx in vocab.items()}
    tokens = [vocab.get(word, vocab['<UNK>']) for word in start_tokens]
    hidden = None
    with torch.no_grad():
        for _ in range(max_length):
            window = torch.tensor([tokens[-20:]])  # last 20 ids, batch of 1
            logits, hidden = model(window, hidden)
            # Distribution over the vocabulary at the final position.
            distribution = torch.softmax(logits[0, -1], dim=-1)
            choice = torch.multinomial(distribution, 1).item()
            tokens.append(choice)
            if idx_to_word.get(choice) == '<EOS>':
                break
    return [idx_to_word.get(idx, '<UNK>') for idx in tokens]
Transformer 语言模型
Transformer 架构彻底改变了语言模型的发展,成为现代大语言模型的基础。
Transformer 架构
Transformer 使用自注意力机制替代循环结构,能够并行计算并捕捉长距离依赖。
import torch
import torch.nn as nn
import math
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        """Self-attend over *x* of shape (batch, seq_len, embed_dim).

        *mask* (broadcastable to the score tensor) zeroes attention where it
        equals 0 — used for causal masking in language models.
        """
        batch, length, _ = x.shape

        def split_heads(t):
            # (batch, length, embed) -> (batch, heads, length, head_dim)
            return t.view(batch, length, self.num_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj(x))
        k = split_heads(self.k_proj(x))
        v = split_heads(self.v_proj(x))
        # Scaled dot-product attention scores.
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        if mask is not None:
            # -inf before softmax => exactly zero attention weight.
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = torch.softmax(scores, dim=-1)
        context = weights @ v
        # Merge heads back and apply the output projection.
        merged = context.transpose(1, 2).contiguous().view(batch, length, self.embed_dim)
        return self.out_proj(merged)
class TransformerBlock(nn.Module):
    """Post-norm Transformer layer: self-attention, then a position-wise
    feed-forward network, each wrapped in dropout + residual + LayerNorm."""

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply one Transformer layer; *mask* is forwarded to attention."""
        # Attention sub-layer with residual connection and normalization.
        x = self.norm1(x + self.dropout(self.attention(x, mask)))
        # Feed-forward sub-layer, same wrapping.
        return self.norm2(x + self.dropout(self.ff(x)))
class TransformerLM(nn.Module):
    """Decoder-only (GPT-style) Transformer language model with learned
    positional embeddings and a causal self-attention mask."""

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, max_seq_len=512):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_dim, vocab_size)
        # Lower-triangular causal mask, shaped (1, 1, L, L) so it broadcasts
        # over batch and heads; a buffer so it follows the module's device.
        causal = torch.tril(torch.ones(max_seq_len, max_seq_len))
        self.register_buffer('causal_mask', causal.unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        """Map ids (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        length = x.size(1)
        # Sum of token and (learned) position embeddings.
        positions = torch.arange(length, device=x.device).unsqueeze(0)
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        # Slice the causal mask to the current sequence length.
        mask = self.causal_mask[:, :, :length, :length]
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.fc_out(hidden)
# Usage example
vocab_size = 10000
model = TransformerLM(
    vocab_size=vocab_size,
    embed_dim=512,
    num_heads=8,
    num_layers=6,
    ff_dim=2048
)
print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")
使用 Hugging Face 语言模型
Hugging Face 提供了大量预训练语言模型,可以直接使用。
GPT-2 文本生成
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load the pretrained model and its tokenizer
model_name = "gpt2"  # or "gpt2-medium", "gpt2-large", "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Text generation with the module-level GPT-2 model
def generate_text(prompt, max_length=100):
    """Generate a sampled continuation of *prompt* and return it as a string.

    Uses temperature plus top-k and nucleus (top-p) filtering to keep the
    sampled text coherent.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    generated_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# Generate a continuation of an English prompt
prompt = "Natural language processing is"
generated = generate_text(prompt)
print(generated)
中文语言模型
from transformers import BertForMaskedLM, BertTokenizer
# Load the Chinese BERT masked language model
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
# Cloze (fill-in-the-blank) with the module-level masked LM
def fill_mask(text):
    """Print the top-5 predictions for the [MASK] token in *text*."""
    encoded = tokenizer(text, return_tensors="pt")
    # Locate the [MASK] position(s) within the input ids.
    mask_positions = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]
    with torch.no_grad():
        logits = model(**encoded).logits
    # Rank candidate tokens at the masked position(s).
    candidates = torch.topk(logits[0, mask_positions, :], 5, dim=1).indices[0].tolist()
    print(f"原文: {text}")
    print("预测结果:")
    for token_id in candidates:
        predicted_text = text.replace(tokenizer.mask_token, tokenizer.decode([token_id]))
        print(f"  {predicted_text}")
# Usage example
fill_mask("自然语言处理是[MASK]智能的重要分支。")
语言模型评估
困惑度(Perplexity)
困惑度是评估语言模型的标准指标,定义为:$\mathrm{PPL} = \exp\!\left(-\frac{1}{T}\sum_{t=1}^{T}\log P(w_t \mid w_1, \ldots, w_{t-1})\right)$
困惑度越低,模型越好。
import torch
import math
def calculate_perplexity(model, dataloader, criterion):
    """Return exp(mean per-token loss) of *model* over *dataloader*.

    Batch losses are weighted by their flattened target count, so batches of
    unequal size average correctly before exponentiation. Lower is better.
    """
    model.eval()
    loss_sum = 0.0
    token_count = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            logits, _ = model(inputs)
            flat_logits = logits.view(-1, logits.size(-1))
            flat_targets = targets.view(-1)
            batch_loss = criterion(flat_logits, flat_targets)
            loss_sum += batch_loss.item() * flat_targets.size(0)
            token_count += flat_targets.size(0)
    return math.exp(loss_sum / token_count)
语言模型的发展历程
| 年份 | 模型 | 特点 |
|---|---|---|
| 2003 | NNLM | 首个神经语言模型 |
| 2013 | Word2Vec | 高效词向量训练 |
| 2017 | Transformer | 自注意力机制 |
| 2018 | BERT | 双向预训练 |
| 2018 | GPT | 自回归生成 |
| 2020 | GPT-3 | 大规模涌现能力 |
| 2022 | ChatGPT | 对话能力突破 |
| 2023 | GPT-4 | 多模态能力 |
总结
语言模型是 NLP 的核心技术,本章介绍了:
- N-gram 模型:基于统计的经典方法,简单但能力有限
- 神经语言模型:使用 RNN/LSTM 建模序列
- Transformer 语言模型:现代大语言模型的基础架构
- 预训练语言模型:BERT、GPT 等模型的使用
语言模型的发展推动了整个 NLP 领域的进步,从简单的统计方法到如今的大语言模型,语言模型的能力不断提升。下一章将介绍序列标注任务。