跳到主要内容

序列标注

序列标注(Sequence Labeling)是自然语言处理中的一类重要任务,目标是为输入序列中的每个元素分配一个标签。常见的序列标注任务包括词性标注、命名实体识别、分词等。

什么是序列标注

给定一个输入序列 $x = (x_1, x_2, \ldots, x_n)$,序列标注的目标是找到一个标签序列 $y = (y_1, y_2, \ldots, y_n)$,使得条件概率 $P(y \mid x)$ 最大。

与普通分类任务不同,序列标注需要考虑标签之间的依赖关系。例如,在命名实体识别中,"B-PER"(人名开始)后面通常不会跟着 "B-ORG"(组织名开始)。

词性标注

词性标注(Part-of-Speech Tagging,POS Tagging)是为每个词标注其语法类别,如名词、动词、形容词等。

使用 NLTK

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download the tokenizer data and the perceptron POS tagger model.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

text = "The quick brown fox jumps over the lazy dog."

# Tokenize into words.
tokens = word_tokenize(text)

# POS-tag each token (Penn Treebank tag set).
tagged = pos_tag(tokens)

# NOTE: nltk.help.upenn_tagset() PRINTS the tag description and returns
# None, so the original `nltk.help.upenn_tagset(tag)[:50]` raised
# TypeError ('NoneType' is not subscriptable). Print word and tag only.
for word, tag in tagged:
    print(f"{word:<15} {tag:<5}")

使用 spaCy

import spacy

# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")

text = "Apple is looking at buying U.K. startup for $1 billion."

# Print coarse POS, fine-grained tag, and a human-readable explanation.
for token in nlp(text):
    print(f"{token.text:<15} {token.pos_:<10} {token.tag_:<5} {spacy.explain(token.tag_)}")

中文词性标注

import jieba.posseg as pseg

text = "我爱自然语言处理"

# pseg.cut yields (word, POS-flag) pairs for Chinese text.
for word, flag in pseg.cut(text):
    print(f"{word:<10} {flag}")

jieba 词性标注标记集:

| 标记 | 含义 |
| --- | --- |
| a | 形容词 |
| d | 副词 |
| n | 名词 |
| v | 动词 |
| r | 代词 |
| m | 数词 |
| p | 介词 |
| c | 连词 |

命名实体识别

命名实体识别(Named Entity Recognition,NER)是从文本中识别出具有特定意义的实体,如人名、地名、机构名等。

使用 spaCy

import spacy

nlp = spacy.load("en_core_web_sm")

text = "Apple CEO Tim Cook announced new products in California yesterday."

# doc.ents holds the model's predicted entity spans.
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text:<25} {ent.label_:<10} {spacy.explain(ent.label_)}")

spaCy 支持的实体类型:

| 标签 | 含义 |
| --- | --- |
| PERSON | 人名 |
| ORG | 组织机构 |
| GPE | 地理政治实体(国家、城市) |
| DATE | 日期 |
| MONEY | 金额 |
| PRODUCT | 产品 |

使用 Hugging Face

from transformers import pipeline

# Build an NER pipeline; "simple" aggregation merges subword pieces
# back into whole-entity spans.
ner = pipeline("ner", aggregation_strategy="simple")

text = "Apple CEO Tim Cook announced new products in California."

for result in ner(text):
    print(f"实体: {result['word']:<15} 类型: {result['entity_group']:<8} 置信度: {result['score']:.4f}")

中文命名实体识别

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Chinese NER model published by the CKIP lab.
model_name = "ckiplab/bert-base-chinese-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner = pipeline("ner", model=model, tokenizer=tokenizer)

text = "张三在北京的清华大学学习"

# Each result carries the matched span, its tag, and char offsets.
for result in ner(text):
    print(f"实体: {result['word']:<10} 类型: {result['entity']:<10} 位置: {result['start']}-{result['end']}")

序列标注模型

BIO 标注方案

BIO(Beginning-Inside-Outside)是最常用的序列标注方案:

  • B-X:实体 X 的开始
  • I-X:实体 X 的内部
  • O:不属于任何实体
# Example: BIO tagging for named entity recognition.
# Each token gets exactly ONE label, so the two lists must have equal
# length. The original had 7 labels for 6 tokens ("清华大学" is a single
# token, so it takes B-ORG alone), and zip() silently truncated,
# shifting "I-ORG" onto "学习".
sentence = ["张三", "在", "北京", "的", "清华大学", "学习"]
labels = ["B-PER", "O", "B-LOC", "O", "B-ORG", "O"]

for word, label in zip(sentence, labels):
    print(f"{word:<10} {label}")

条件随机场(CRF)

CRF 是序列标注的经典方法,能够建模标签之间的依赖关系。

from sklearn_crfsuite import CRF

def word2features(sent, i):
    """Build the hand-crafted CRF feature dict for token *i* of *sent*.

    Features cover the token itself (case, suffixes, digit-ness) plus a
    one-token window on each side; sentence boundaries are marked with
    BOS/EOS flags instead of neighbor features.
    """
    token = sent[i]
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    if i == 0:
        feats['BOS'] = True
    else:
        prev = sent[i - 1]
        feats['-1:word.lower()'] = prev.lower()
        feats['-1:word.istitle()'] = prev.istitle()

    if i == len(sent) - 1:
        feats['EOS'] = True
    else:
        nxt = sent[i + 1]
        feats['+1:word.lower()'] = nxt.lower()
        feats['+1:word.istitle()'] = nxt.istitle()

    return feats

def sent2features(sent):
    """Extract the per-position feature dicts for the whole sentence."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]

# Training data
train_sents = [
    ['Apple', 'is', 'based', 'in', 'Cupertino'],
    ['Google', 'was', 'founded', 'in', 'California'],
]

train_labels = [
    ['B-ORG', 'O', 'O', 'O', 'B-LOC'],
    ['B-ORG', 'O', 'O', 'O', 'B-LOC'],
]

# Featurize every training sentence.
X_train = [sent2features(s) for s in train_sents]

# Fit a linear-chain CRF (L-BFGS with L1/L2 regularization).
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100)
crf.fit(X_train, train_labels)

# Tag an unseen sentence.
test_sent = ['Microsoft', 'is', 'located', 'in', 'Washington']
predictions = crf.predict([sent2features(test_sent)])

for word, label in zip(test_sent, predictions[0]):
    print(f"{word:<15} {label}")

BiLSTM-CRF 模型

BiLSTM-CRF 是深度学习时代序列标注的经典架构,结合了 BiLSTM 的序列建模能力和 CRF 的标签依赖建模能力。

import torch
import torch.nn as nn
from torchcrf import CRF

class BiLSTMCRF(nn.Module):
    """Bidirectional LSTM encoder with a CRF decoding layer.

    Training call (tags given) returns the negative log-likelihood;
    inference call (tags=None) returns the Viterbi-decoded tag paths.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_tags):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Half the hidden size per direction so the concatenated
        # bidirectional output is exactly hidden_dim wide.
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim // 2,
            num_layers=2, bidirectional=True, batch_first=True
        )
        self.hidden2tag = nn.Linear(hidden_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        encoded, _ = self.lstm(self.embedding(x))
        emissions = self.hidden2tag(encoded)

        if tags is None:
            # Inference: best tag sequence per example.
            return self.crf.decode(emissions, mask=mask)
        # Training: negative log-likelihood of the gold tags.
        return -self.crf(emissions, tags, mask=mask)

# Usage example
vocab_size = 10000
embed_dim = 128
hidden_dim = 256
num_tags = 9  # O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC

model = BiLSTMCRF(vocab_size, embed_dim, hidden_dim, num_tags)

# Fake a batch of token ids, gold tags, and an all-ones mask.
batch_size, seq_len = 2, 10
x = torch.randint(0, vocab_size, (batch_size, seq_len))
tags = torch.randint(0, num_tags, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)

# Training step: forward with tags returns the CRF loss.
loss = model(x, tags, mask)
print(f"Loss: {loss.item():.4f}")

# Inference: forward without tags returns decoded tag sequences.
predictions = model(x, mask=mask)
print(f"预测序列: {predictions}")

使用 Transformers 进行序列标注

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
import torch

# 加载预训练模型
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
model_name,
num_labels=9 # 根据任务调整
)

# Data preparation
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to subwords.

    Only the first subword of each word keeps the word's label; special
    tokens (word id None) and continuation subwords are set to -100 so
    the loss function ignores them.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    all_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)
        aligned = []
        prev_idx = None
        for widx in word_ids:
            if widx is not None and widx != prev_idx:
                # First subword of a new word: keep its label.
                aligned.append(word_labels[widx])
            else:
                # Special token or continuation subword: ignore in loss.
                aligned.append(-100)
            prev_idx = widx
        all_labels.append(aligned)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Prediction
def predict_ner(text, model, tokenizer, id2label):
    """Tag *text* with *model* and return a list of (token, label) pairs."""
    inputs = tokenizer(text, return_tensors="pt")

    # No gradients needed at inference time.
    with torch.no_grad():
        logits = model(**inputs).logits

    pred_ids = logits.argmax(dim=2)[0]

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    tags = [id2label[i.item()] for i in pred_ids]

    return list(zip(tokens, tags))

# Usage example: map class ids back to BIO tags.
id2label = dict(enumerate([
    "O", "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
]))

text = "张三在北京的清华大学学习"
# results = predict_ner(text, model, tokenizer, id2label)

序列标注评估指标

序列标注任务通常使用精确率、召回率和 F1 值进行评估。

from seqeval.metrics import precision_score, recall_score, f1_score
from seqeval.metrics import classification_report

# Gold and predicted tag sequences (one list per sentence).
y_true = [
    ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-LOC'],
    ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']
]

y_pred = [
    ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-LOC'],
    ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']
]

# Entity-level scores: seqeval groups BIO tags into whole entities
# before scoring, so a partially-matched span counts as wrong.
print("精确率:", precision_score(y_true, y_pred))
print("召回率:", recall_score(y_true, y_pred))
print("F1 值:", f1_score(y_true, y_pred))

# Per-entity-type breakdown.
print(classification_report(y_true, y_pred))

实战:构建中文 NER 系统

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
# AdamW was deprecated and then removed from transformers (v4.38+);
# the drop-in replacement lives in torch.optim.
from torch.optim import AdamW
from tqdm import tqdm

class NERDataset(Dataset):
    """Token-classification dataset pairing raw texts with BIO tag lists.

    Each item is a dict of fixed-length tensors (input_ids,
    attention_mask, labels) with label ids aligned to wordpiece tokens.
    """

    def __init__(self, texts, labels, tokenizer, label2id, max_length=128):
        # texts: list of raw strings; labels: parallel list of BIO tag
        # lists; tokenizer must be a *fast* tokenizer (word_ids() is
        # used in __getitem__).
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        # Tokenize with padding/truncation to a fixed max_length.
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Align labels to tokens: special/padding tokens (word id None)
        # get -100 so the loss ignores them; unknown tags fall back to 0.
        label_ids = []
        word_ids = encoding.word_ids()

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                # NOTE(review): assumes word_idx indexes directly into
                # the per-character label list — verify the tokenizer
                # yields one word id per Chinese character, else this
                # can misalign or raise IndexError.
                label_ids.append(self.label2id.get(labels[word_idx], 0))

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_ids)
        }

# Label mapping
label2id = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6,
    'B-TIME': 7, 'I-TIME': 8
}
id2label = {v: k for k, v in label2id.items()}

# Example data: labeling is per CHARACTER, so each label list must have
# exactly as many entries as its text has characters. The original
# lists were shorter (6/9/5 labels for 7/11/7 characters), which would
# misalign the tags and can raise IndexError in NERDataset.__getitem__.
texts = [
    "张三在北京工作",
    "李四去了上海的阿里巴巴",
    "王五明天去广州"
]

labels = [
    ['B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC', 'O', 'O'],
    ['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG'],
    ['B-PER', 'I-PER', 'B-TIME', 'I-TIME', 'O', 'B-LOC', 'I-LOC']
]

# Build the dataset and a shuffled loader.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
dataset = NERDataset(texts, labels, tokenizer, label2id)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Token-classification head on Chinese BERT, sized to our tag set.
model = BertForTokenClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Training
optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(3):
    total_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()

        # Passing labels makes the model compute the token-level loss.
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )

        outputs.loss.backward()
        optimizer.step()
        total_loss += outputs.loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

# Prediction function
def predict(text):
    """Tag *text* with the fine-tuned model and return extracted entities.

    Returns a list of {'text': ..., 'type': ...} dicts assembled from
    the per-token BIO tags: a B- tag opens an entity, I- tags extend it,
    anything else closes it.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt')

    with torch.no_grad():
        logits = model(**encoding).logits

    tag_ids = torch.argmax(logits, dim=2)[0]
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    tags = [id2label[t.item()] for t in tag_ids]

    entities = []
    current = None

    for token, tag in zip(tokens, tags):
        if tag.startswith('B-'):
            # A new entity starts; flush any entity in progress.
            if current:
                entities.append(current)
            current = {'text': token, 'type': tag[2:]}
        elif tag.startswith('I-') and current:
            current['text'] += token
        else:
            if current:
                entities.append(current)
            current = None

    if current:
        entities.append(current)

    return entities

# Quick sanity check of the prediction pipeline.
entities = predict("张三在北京的清华大学学习")
print("识别的实体:", entities)

总结

序列标注是 NLP 的基础任务,本章介绍了:

  • 词性标注:为词标注语法类别
  • 命名实体识别:识别文本中的实体
  • BIO 标注方案:序列标注的标准格式
  • CRF 模型:经典的统计方法
  • BiLSTM-CRF:深度学习方法
  • 基于 Transformers 的方法:使用预训练模型

序列标注技术广泛应用于信息抽取、知识图谱构建等领域。下一章将介绍文本分类任务。