信息抽取
信息抽取(Information Extraction,IE)是从非结构化文本中自动提取结构化信息的技术。它是将海量文本数据转化为可用知识的关键步骤,广泛应用于知识图谱构建、智能搜索、数据分析等领域。
什么是信息抽取
信息抽取的目标是从自然语言文本中识别和提取特定类型的信息,并将其组织成结构化的形式。例如,从新闻文本中提取公司并购事件:谁收购了谁、收购金额是多少、收购时间是什么时候。
信息抽取的核心任务包括:
命名实体识别(NER):识别文本中的实体,如人名、地名、机构名、时间、数值等。
关系抽取(RE):识别实体之间的语义关系,如"张三是阿里巴巴的员工"中的雇佣关系。
事件抽取(EE):识别文本中描述的事件,包括事件类型、参与者、时间、地点等要素。
共指消解(Coreference Resolution):识别文本中指向同一实体的不同表述,如"张三"和"他"。
命名实体识别
命名实体识别是信息抽取的基础任务,目标是从文本中识别出特定类别的实体。
实体类型
常见的实体类型包括:
| 类型 | 说明 | 示例 |
|---|---|---|
| PER | 人名 | 张三、李四 |
| ORG | 组织机构 | 阿里巴巴、清华大学 |
| LOC | 地理位置 | 北京、上海 |
| TIME | 时间表达 | 2024年、昨天 |
| NUM | 数值 | 100万、50% |
使用预训练模型
from transformers import pipeline
# Build an NER pipeline on a pretrained Chinese BERT model;
# aggregation_strategy="simple" merges wordpiece tokens into whole entity spans.
ner = pipeline("ner", model="ckiplab/bert-base-chinese-ner", aggregation_strategy="simple")
text = "张三在北京的清华大学学习,他计划明年去上海工作。"
results = ner(text)
print("识别的实体:")
# Each result dict carries the merged entity text, its label and a confidence score.
for result in results:
    entity_type = result['entity_group']
    entity_text = result['word']
    confidence = result['score']
    print(f" {entity_text} [{entity_type}] (置信度: {confidence:.4f})")
使用 spaCy
import spacy
# Load the small Chinese pipeline
# (install first: python -m spacy download zh_core_web_sm).
nlp = spacy.load("zh_core_web_sm")
text = "苹果公司在加州库比蒂诺发布了新款iPhone,CEO蒂姆·库克出席了发布会。"
doc = nlp(text)
print("识别的实体:")
# doc.ents holds the entity spans produced by the pipeline's NER component.
for ent in doc.ents:
    print(f" {ent.text} [{ent.label_}]")
自定义 NER 模型
对于特定领域的实体识别,可能需要训练自定义模型:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import torch
# BIO tag set: O = outside any entity, B-/I- = beginning/inside of PER/ORG/LOC.
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
# Toy character-level training data: one BIO tag per character.
train_data = [
    {
        "tokens": ["张", "三", "在", "北", "京", "工", "作"],
        "labels": ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "O", "O"]
    },
    {
        "tokens": ["阿", "里", "巴", "巴", "是", "科", "技", "公", "司"],
        "labels": ["B-ORG", "I-ORG", "I-ORG", "I-ORG", "O", "O", "O", "O", "O"]
    }
]
# Load the base model and tokenizer; the classification head is sized to the
# tag set and given the id<->label mapping so predictions are human-readable.
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)
# Preprocessing: tokenize pre-split characters and align BIO labels to wordpieces.
def tokenize_and_align_labels(examples):
    """Tokenize `examples["tokens"]` and align `examples["labels"]` to wordpieces.

    Special tokens and continuation wordpieces get label -100 so the loss
    ignores them; only the first wordpiece of each input word keeps its real
    label id.

    NOTE(review): relies on the module-level `tokenizer` and `label2id` —
    confirm they are initialized before calling.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True  # inputs are already split into characters
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        # word_ids maps each wordpiece to its source word index (None = special token).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special token: ignored by the loss
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])  # first piece of a word
            else:
                label_ids.append(-100)  # continuation piece: ignored
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
# Prediction function
def predict_entities(text, model, tokenizer, id2label):
    """Predict the entities in *text*.

    Args:
        text: raw input string.
        model: token-classification model returning per-token logits.
        tokenizer: matching tokenizer, called with return_tensors="pt".
        id2label: mapping from label id to BIO tag string.

    Returns:
        List of {"text": entity_string, "type": entity_type} dicts.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [id2label[p.item()] for p in predictions[0]]
    # Decode the BIO tag sequence into entity spans.
    entities = []
    current_entity = None
    for token, label in zip(tokens, labels):
        # Fix: strip the WordPiece continuation marker "##" so subword tokens
        # concatenate back into the original surface string.
        piece = token[2:] if token.startswith("##") else token
        if label.startswith("B-"):
            if current_entity:
                entities.append(current_entity)
            entity_type = label[2:]
            current_entity = {"text": piece, "type": entity_type}
        elif label.startswith("I-") and current_entity:
            current_entity["text"] += piece
        else:
            if current_entity:
                entities.append(current_entity)
            current_entity = None
    if current_entity:  # flush an entity that runs to the last token
        entities.append(current_entity)
    return entities
# Usage example (requires the model/tokenizer loaded above; the head is
# untrained here, so predictions are arbitrary until fine-tuning).
text = "李四在上海的腾讯公司工作"
entities = predict_entities(text, model, tokenizer, id2label)
for entity in entities:
    print(f"实体: {entity['text']}, 类型: {entity['type']}")
关系抽取
关系抽取的目标是识别文本中实体之间的语义关系。
关系类型
常见的关系类型包括:
| 关系类型 | 说明 | 示例 |
|---|---|---|
| 雇佣 | 人与组织 | 张三在阿里巴巴工作 |
| 位于 | 实体与地点 | 清华大学位于北京 |
| 创始人 | 人与公司 | 马云创立了阿里巴巴 |
| 亲属 | 人与人 | 张三是李四的父亲 |
| 产品 | 公司与产品 | 苹果公司生产iPhone |
基于模式的关系抽取
简单的关系可以通过模式匹配提取:
import re
import jieba
def extract_relations_by_pattern(text):
    """Pattern-based relation extraction.

    Scans *text* with a fixed table of regular expressions and returns a
    list of {'entity1', 'relation', 'entity2'} dicts.
    """
    relations = []
    # Pattern table: (regex, relation type).
    # Fix: the original head groups r'(.+?)' could span sentence boundaries,
    # and patterns ending in a lazy r'(.+?)' captured only a single character
    # (e.g. "位于海" -> "海"); entities are now delimited by punctuation.
    patterns = [
        (r'([^,。;!?]+?)是([^,。;!?]+?)的创始人', '创始人'),
        (r'([^,。;!?]+?)在([^,。;!?]+?)工作', '雇佣'),
        (r'([^,。;!?]+?)位于([^,。;!?]+)', '位于'),
        (r'([^,。;!?]+?)创立了([^,。;!?]+)', '创始人'),
    ]
    for pattern, relation_type in patterns:
        matches = re.finditer(pattern, text)
        for match in matches:
            entity1 = match.group(1).strip()
            entity2 = match.group(2).strip()
            relations.append({
                'entity1': entity1,
                'relation': relation_type,
                'entity2': entity2
            })
    return relations
# Example: extract relations from a few sample sentences.
text = "马云创立了阿里巴巴公司。张三在北京工作。清华大学位于海淀区。"
relations = extract_relations_by_pattern(text)
for rel in relations:
    print(f"{rel['entity1']} --[{rel['relation']}]--> {rel['entity2']}")
基于深度学习的关系抽取
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
class RelationExtractor:
    """BERT-based relation classifier for a pair of entities in a sentence.

    NOTE(review): the classification head is randomly initialized here —
    the model must be fine-tuned on labeled relation data before its
    predictions are meaningful.
    """
    def __init__(self, model_name="bert-base-chinese"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Sequence classifier over 4 relation classes (see id2relation below).
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=4
        )
        # class id -> human-readable relation name
        self.id2relation = {
            0: "雇佣",
            1: "位于",
            2: "创始人",
            3: "无关系"
        }
    def extract(self, text, entity1, entity2):
        """Classify the relation between *entity1* and *entity2* in *text*.

        The entity spans are marked with [E1]...[/E1] / [E2]...[/E2] and the
        marked sentence is fed to the classifier.

        NOTE(review): str.replace marks every occurrence of the entity
        string, and nested marking can occur when one entity is a substring
        of the other — confirm inputs avoid those cases.
        """
        # Mark the entity positions with special brackets.
        marked_text = text.replace(entity1, f"[E1]{entity1}[/E1]")
        marked_text = marked_text.replace(entity2, f"[E2]{entity2}[/E2]")
        inputs = self.tokenizer(
            marked_text,
            return_tensors="pt",
            truncation=True,
            max_length=128
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()
        relation = self.id2relation[predicted_label]
        return {
            'entity1': entity1,
            'entity2': entity2,
            'relation': relation,
            # softmax probability of the predicted class
            'confidence': torch.softmax(outputs.logits, dim=1)[0][predicted_label].item()
        }
# Usage example (untrained head: the predicted relation is arbitrary).
extractor = RelationExtractor()
text = "张三在阿里巴巴公司工作"
result = extractor.extract(text, "张三", "阿里巴巴")
print(f"{result['entity1']} --[{result['relation']}]--> {result['entity2']}")
联合抽取
将实体识别和关系抽取结合:
def joint_extraction(text, ner_model, re_model, tokenizer):
    """Joint entity and relation extraction.

    Runs NER over *text* first, then classifies the relation between every
    ordered pair of extracted entities, keeping only pairs whose predicted
    relation is not '无关系'.

    NOTE(review): uses the module-level `id2label` mapping — confirm it is
    defined before calling.
    """
    # Step 1: entity recognition.
    found_entities = predict_entities(text, ner_model, tokenizer, id2label)
    # Step 2: relation classification over each entity pair.
    found_relations = []
    for idx, head in enumerate(found_entities):
        for tail in found_entities[idx + 1:]:
            prediction = re_model.extract(text, head['text'], tail['text'])
            if prediction['relation'] != '无关系':
                found_relations.append(prediction)
    return found_entities, found_relations
# 示例输出格式
# 实体: [张三 (PER), 阿里巴巴 (ORG), 北京 (LOC)]
# 关系: [张三 --雇佣--> 阿里巴巴, 阿里巴巴 --位于--> 北京]
事件抽取
事件抽取是识别文本中描述的事件,提取事件的类型、触发词、论元等信息。
事件结构
一个事件通常包含:
触发词(Trigger):最能表明事件发生的词,如"收购"、"出生"、"袭击"。
事件类型(Event Type):事件的类别,如"并购"、"出生"、"攻击"。
论元(Argument):事件的参与者,如收购方、被收购方、收购金额等。
事件抽取示例
import re
from dataclasses import dataclass
from typing import List, Dict, Optional
@dataclass
class Event:
    """A single extracted event."""
    event_type: str            # event category, e.g. "收购"
    trigger: str               # the word that signalled the event
    arguments: Dict[str, str]  # role name -> argument text

class EventExtractor:
    """Rule-based event extractor driven by trigger/argument regex tables."""
    def __init__(self):
        # Event schema: per event type, the trigger patterns and the
        # per-role argument patterns (each with one capture group).
        self.event_patterns = {
            "收购": {
                "trigger_patterns": [r'收购', r'并购', r'买下', r'购入'],
                "argument_patterns": {
                    "收购方": r'(.+?)(?:公司|集团|企业)',
                    # Fix: the original pattern ended with only optional
                    # parts after a lazy group, so it always captured a
                    # single character; the entity is now delimited by a
                    # suffix word, punctuation or end-of-string.
                    "被收购方": r'收购(?:了)?(.+?)(?:公司|集团|企业|[,。]|$)',
                    "金额": r'(\d+(?:\.\d+)?(?:亿|万|千万)?(?:美元|人民币|元))'
                }
            },
            "出生": {
                "trigger_patterns": [r'出生', r'生于'],
                "argument_patterns": {
                    "人物": r'(.+?)(?:出生|生于)',
                    "地点": r'(?:出生|生于)(.+?)(?:,|。|年)',
                    "时间": r'(\d{4}年\d{1,2}月\d{1,2}日)'
                }
            }
        }
    def extract(self, text):
        """Extract events from *text*; returns a list of Event objects."""
        events = []
        for event_type, config in self.event_patterns.items():
            # Look for a trigger word of this event type.
            for trigger_pattern in config["trigger_patterns"]:
                trigger_match = re.search(trigger_pattern, text)
                if trigger_match:  # fix: search once instead of twice
                    trigger = trigger_match.group()
                    # Collect whatever arguments match.
                    arguments = {}
                    for arg_name, arg_pattern in config["argument_patterns"].items():
                        match = re.search(arg_pattern, text)
                        if match:
                            arguments[arg_name] = match.group(1).strip()
                    if arguments:  # keep only events with at least one argument
                        events.append(Event(
                            event_type=event_type,
                            trigger=trigger,
                            arguments=arguments
                        ))
                    break
        return events
# Usage example
extractor = EventExtractor()
texts = [
    "腾讯公司收购了搜狗公司,交易金额为12亿美元。",
    "马云出生于1964年,浙江杭州。"
]
for text in texts:
    events = extractor.extract(text)
    for event in events:
        print(f"事件类型: {event.event_type}")
        print(f"触发词: {event.trigger}")
        print(f"论元: {event.arguments}")
        print()
使用深度学习进行事件抽取
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
class DeepEventExtractor:
    """Deep-learning event extraction with two token classifiers.

    One model tags trigger words (event types), the other tags argument
    roles; the two tag sequences are merged into event dicts.

    NOTE(review): both heads are randomly initialized here and must be
    fine-tuned before use; the label counts (10/15) are placeholders.
    """
    def __init__(self, model_name="bert-base-chinese"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Trigger-word tagger.
        self.trigger_model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=10  # assumed: 9 event types + O
        )
        # Argument-role tagger.
        self.argument_model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=15  # assumed: 14 argument roles + O
        )
    def extract(self, text):
        """Extract events from *text*.

        Returns a list of dicts with keys "trigger", "type", "arguments".
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        # Tag trigger words.
        with torch.no_grad():
            trigger_outputs = self.trigger_model(**inputs)
            trigger_predictions = torch.argmax(trigger_outputs.logits, dim=2)
        # Tag argument roles.
        with torch.no_grad():
            arg_outputs = self.argument_model(**inputs)
            arg_predictions = torch.argmax(arg_outputs.logits, dim=2)
        # Merge the two tag sequences (simplified decoding).
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        events = []
        current_event = None
        for i, (token, trig_label, arg_label) in enumerate(
            zip(tokens, trigger_predictions[0], arg_predictions[0])
        ):
            # A non-O trigger tag starts a new event.
            if trig_label.item() > 0:  # not the O label
                if current_event:
                    events.append(current_event)
                current_event = {
                    "trigger": token,
                    "type": f"事件类型{trig_label.item()}",
                    "arguments": {}
                }
            # A non-O argument tag attaches the token to the current event.
            if arg_label.item() > 0 and current_event:
                arg_role = f"角色{arg_label.item()}"
                if arg_role in current_event["arguments"]:
                    current_event["arguments"][arg_role] += token
                else:
                    current_event["arguments"][arg_role] = token
        if current_event:  # flush the last open event
            events.append(current_event)
        return events
共指消解
共指消解是识别文本中指向同一实体的不同表述。
共指类型
代词共指:张三说他会来。("他"指"张三")
名词共指:马云是阿里巴巴的创始人。这位企业家出生于杭州。("这位企业家"指"马云")
名称变体:同一实体的不同名称形式,如"北京大学"和简称"北大"指向同一机构。
共指消解实现
import spacy
from collections import defaultdict
class CoreferenceResolver:
    """spaCy-based coreference resolver.

    NOTE(review): `doc._.coref_clusters` and `token._.in_coref` are provided
    by the neuralcoref extension, not by vanilla spaCy — this class will
    fail unless that extension is installed and added to the pipeline.
    """
    def __init__(self):
        # English model (Chinese coreference support is limited).
        self.nlp = spacy.load("en_core_web_sm")
    def resolve(self, text):
        """Return coreference clusters as {'main': ..., 'mentions': [...]} dicts."""
        doc = self.nlp(text)
        # Collect the coreference chains.
        clusters = []
        for cluster in doc.doc._.coref_clusters:
            mentions = [m.text for m in cluster.mentions]
            main = cluster.main.text
            clusters.append({
                'main': main,
                'mentions': mentions
            })
        return clusters
    def replace_pronouns(self, text):
        """Replace each coreferent pronoun with its cluster's main entity."""
        doc = self.nlp(text)
        resolved_text = text
        for token in doc:
            if token.pos_ == "PRON" and token._.in_coref:
                # Substitute the cluster's main mention for the pronoun.
                main_entity = token._.coref_clusters[0].main.text
                resolved_text = resolved_text.replace(token.text, main_entity, 1)
        return resolved_text
# Simple rule-based coreference resolution for Chinese.
def simple_coreference_resolution(text, entities):
    """Replace each pronoun with the closest preceding compatible entity.

    Args:
        text: the input sentence/paragraph.
        entities: list of {'text': ..., 'type': ...} dicts, e.g. from
            predict_entities().

    Returns:
        The text with the first occurrence of each resolvable pronoun
        replaced by its antecedent.
    """
    # pronoun -> entity types it may refer to ('/'-separated alternatives)
    # NOTE(review): '他' also occurs inside '他们' and is checked first, so
    # plural pronouns may be partially rewritten — same as the original.
    pronouns = {'他': 'PER', '她': 'PER', '它': 'ORG/LOC', '他们': 'PER', '她们': 'PER'}
    resolved_text = text
    for pronoun, entity_types in pronouns.items():
        if pronoun in text:
            # Locate the (first) pronoun and scan for the nearest entity before it.
            pronoun_pos = text.index(pronoun)
            closest_entity = None
            min_distance = float('inf')
            allowed = entity_types.split('/')
            for entity in entities:
                # Fix: the original tested `entity_type in entity['type']`,
                # so compound specs like 'ORG/LOC' could never match an
                # entity; test each alternative against the entity's type.
                if entity.get('type', '') in allowed:
                    entity_pos = text.rfind(entity['text'], 0, pronoun_pos)
                    if entity_pos != -1:
                        distance = pronoun_pos - entity_pos
                        if distance < min_distance:
                            min_distance = distance
                            closest_entity = entity
            if closest_entity:
                resolved_text = resolved_text.replace(pronoun, closest_entity['text'], 1)
    return resolved_text
知识图谱构建
信息抽取的一个重要应用是构建知识图谱。
知识图谱三元组
知识图谱以三元组形式存储知识:(头实体, 关系, 尾实体)
from typing import List, Tuple
from dataclasses import dataclass
@dataclass
class Triple:
    """A knowledge-graph triple: (head entity, relation, tail entity)."""
    head: str
    relation: str
    tail: str
    confidence: float = 1.0  # extraction confidence, defaults to certain

class KnowledgeGraphBuilder:
    """Accumulates knowledge-graph triples and supports queries and CSV export."""

    def __init__(self):
        # All triples collected so far, in insertion order.
        self.triples = []

    def add_triple(self, head, relation, tail, confidence=1.0):
        """Append one (head, relation, tail) triple with an optional confidence."""
        self.triples.append(Triple(head, relation, tail, confidence))

    def add_from_text(self, text, ner_model, re_model):
        """Run NER plus pairwise relation extraction over *text* and store results.

        NOTE(review): relies on the module-level `tokenizer` and `id2label` —
        confirm they are initialized before calling.
        """
        found = predict_entities(text, ner_model, tokenizer, id2label)
        for idx, first in enumerate(found):
            for second in found[idx + 1:]:
                prediction = re_model.extract(text, first['text'], second['text'])
                if prediction['relation'] != '无关系':
                    self.add_triple(
                        first['text'],
                        prediction['relation'],
                        second['text'],
                        prediction['confidence']
                    )

    def query(self, entity=None, relation=None):
        """Return triples matching *entity* (as head or tail) and/or *relation*."""
        def matches(t):
            if entity and entity not in (t.head, t.tail):
                return False
            if relation and t.relation != relation:
                return False
            return True
        return [t for t in self.triples if matches(t)]

    def export_to_csv(self, filepath):
        """Write all triples to a UTF-8 CSV file with a header row."""
        import csv
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['头实体', '关系', '尾实体', '置信度'])
            writer.writerows(
                [t.head, t.relation, t.tail, t.confidence] for t in self.triples
            )
# Usage example
kg = KnowledgeGraphBuilder()
# Add triples by hand.
kg.add_triple("张三", "工作于", "阿里巴巴")
kg.add_triple("阿里巴巴", "位于", "杭州")
kg.add_triple("马云", "创立", "阿里巴巴")
# Query every triple that touches a given entity (as head or tail).
print("与阿里巴巴相关的知识:")
for triple in kg.query(entity="阿里巴巴"):
    print(f" {triple.head} --{triple.relation}--> {triple.tail}")
可视化知识图谱
import networkx as nx
import matplotlib.pyplot as plt
def visualize_knowledge_graph(triples, output_file=None):
    """Draw a knowledge graph with networkx/matplotlib.

    Args:
        triples: iterable of Triple objects.
        output_file: if given, save the figure there instead of showing it.

    NOTE(review): node labels use the 'SimHei' font for Chinese text — it
    must be installed, otherwise labels render as empty boxes.
    """
    G = nx.DiGraph()
    # Nodes are entities; directed edges carry the relation as a label.
    for triple in triples:
        G.add_node(triple.head)
        G.add_node(triple.tail)
        G.add_edge(triple.head, triple.tail, label=triple.relation)
    # Lay out and draw the graph.
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G)
    # Nodes
    nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=2000)
    nx.draw_networkx_labels(G, pos, font_size=12, font_family='SimHei')
    # Edges
    nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True)
    # Edge labels (the relation names)
    edge_labels = {(u, v): d['label'] for u, v, d in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=10)
    plt.axis('off')
    plt.title('知识图谱可视化')
    if output_file:
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
    else:
        plt.show()
    plt.close()
# Usage example
triples = [
    Triple("张三", "工作于", "阿里巴巴"),
    Triple("阿里巴巴", "位于", "杭州"),
    Triple("马云", "创立", "阿里巴巴"),
    Triple("李四", "工作于", "腾讯"),
    Triple("腾讯", "位于", "深圳"),
]
visualize_knowledge_graph(triples)
评估指标
NER 评估
from seqeval.metrics import precision_score, recall_score, f1_score
def evaluate_ner(y_true, y_pred):
    """Compute entity-level precision/recall/F1 via seqeval.

    Both arguments are lists of per-sentence BIO tag sequences.
    """
    scores = {}
    scores['precision'] = precision_score(y_true, y_pred)
    scores['recall'] = recall_score(y_true, y_pred)
    scores['f1'] = f1_score(y_true, y_pred)
    return scores
# Example: identical predictions give precision = recall = F1 = 1.0.
y_true = [['B-PER', 'I-PER', 'O', 'B-LOC', 'O']]
y_pred = [['B-PER', 'I-PER', 'O', 'B-LOC', 'O']]
metrics = evaluate_ner(y_true, y_pred)
print(f"精确率: {metrics['precision']:.4f}")
print(f"召回率: {metrics['recall']:.4f}")
print(f"F1值: {metrics['f1']:.4f}")
关系抽取评估
def evaluate_relations(y_true, y_pred):
    """Evaluate relation extraction with micro precision/recall/F1.

    Each relation dict may use either ('head', 'relation', 'tail') keys or
    the ('entity1', 'relation', 'entity2') keys emitted by the extractors in
    this module — both spellings are accepted (consistency fix: the original
    only understood 'head'/'tail' and raised KeyError on extractor output).
    """
    def to_key(r):
        # Normalize a relation dict to a hashable (head, relation, tail) key.
        head = r.get('head', r.get('entity1'))
        tail = r.get('tail', r.get('entity2'))
        return (head, r['relation'], tail)
    true_set = set(to_key(r) for r in y_true)
    pred_set = set(to_key(r) for r in y_pred)
    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)
    # Guard all three ratios against empty denominators.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {'precision': precision, 'recall': recall, 'f1': f1}
实际应用案例
新闻信息抽取系统
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class NewsEvent:
    """A structured news event (declared for reference; unused by the extractor below)."""
    event_type: str
    participants: List[str]
    time: str
    location: str
    details: Dict
class NewsInformationExtractor:
    """News information extraction: pretrained NER pipeline + regex for times.

    NOTE(review): relies on `pipeline` from transformers being imported at
    module level (see the first snippet in this file).
    """
    def __init__(self):
        self.ner = pipeline("ner", model="ckiplab/bert-base-chinese-ner", aggregation_strategy="simple")
    def extract(self, news_text):
        """Extract persons/organizations/locations/times from a news article."""
        # Entity recognition.
        entities = self.ner(news_text)
        # Bucket entities by type.
        result = {
            'persons': [],
            'organizations': [],
            'locations': [],
            'times': []
        }
        for ent in entities:
            if ent['entity_group'] == 'PER':
                result['persons'].append(ent['word'])
            elif ent['entity_group'] == 'ORG':
                result['organizations'].append(ent['word'])
            elif ent['entity_group'] == 'LOC':
                result['locations'].append(ent['word'])
        # Times via a simple regex (absolute dates plus a few relative words).
        import re
        time_pattern = r'\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{2}-\d{2}|昨天|今天|明天'
        result['times'] = re.findall(time_pattern, news_text)
        return result
# Usage example
news = """
2024年3月15日,阿里巴巴集团在杭州宣布收购饿了么。阿里巴巴CEO张勇出席了发布会。
"""
extractor = NewsInformationExtractor()
result = extractor.extract(news)
print("抽取结果:")
print(f"人物: {result['persons']}")
print(f"组织: {result['organizations']}")
print(f"地点: {result['locations']}")
print(f"时间: {result['times']}")
简历信息抽取
class ResumeExtractor:
"""简历信息抽取"""
def __init__(self):
self.patterns = {
'name': r'姓\s*名[::]\s*(\S+)',
'phone': r'(?:电话|手机|联系)[::]\s*(\d{11}|\d{3,4}-\d{7,8})',
'email': r'[::]?\s*([\w.-]+@[\w.-]+\.\w+)',
'education': r'(?:学历|学位)[::]\s*(\S+)',
'university': r'(?:毕业院校|学校)[::]\s*(\S+)',
'experience': r'(?:工作经历|经历)[::]\s*(.+?)(?=教育|项目|技能|$)'
}
def extract(self, resume_text):
"""从简历中提取信息"""
import re
result = {}
for field, pattern in self.patterns.items():
match = re.search(pattern, resume_text, re.DOTALL)
if match:
result[field] = match.group(1).strip()
return result
# Usage example
resume = """
姓名:张三
联系电话:13812345678
邮箱:[email protected]
学历:硕士
毕业院校:清华大学
工作经历:2018-2022 阿里巴巴软件工程师
"""
extractor = ResumeExtractor()
info = extractor.extract(resume)
for key, value in info.items():
    print(f"{key}: {value}")
总结
信息抽取是将非结构化文本转化为结构化知识的关键技术,本章介绍了:
- 命名实体识别:识别人名、地名、机构名等实体
- 关系抽取:识别实体之间的语义关系
- 事件抽取:识别事件类型、触发词和论元
- 共指消解:识别指向同一实体的不同表述
- 知识图谱构建:将抽取结果组织成结构化知识
- 实际应用:新闻信息抽取、简历解析
信息抽取技术是构建智能系统的核心组件,与问答系统、知识图谱、智能搜索等应用紧密相关。随着预训练语言模型的发展,信息抽取的准确率和召回率不断提升,但领域适应性和长尾实体识别仍是研究热点。