Text Classification
Text classification is one of the most common tasks in natural language processing: the goal is to assign a piece of text to one or more predefined categories. Applications include sentiment analysis, spam detection, news categorization, intent recognition, and more.
Overview of Text Classification
Text classification tasks generally fall into the following types:
Binary classification: the text belongs to one of two classes, e.g. spam detection (spam vs. ham).
Multi-class classification: the text belongs to exactly one of several classes, e.g. news categorization (sports, finance, technology, ...).
Multi-label classification: the text may belong to several classes at once, e.g. predicting tags for an article (the sketch below shows how the label targets differ).
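The three settings differ mainly in how the target is encoded. A minimal sketch with toy labels (purely illustrative, not tied to any dataset):

# Binary / multi-class: one integer class id per sample
binary_target = 1                 # e.g. 0 = ham, 1 = spam
multiclass_target = 2             # e.g. 0 = sports, 1 = finance, 2 = tech
# Multi-label: a multi-hot vector with an independent 0/1 per label
multilabel_target = [1, 0, 1, 0]  # this text carries labels 0 and 2 at once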
Traditional Machine Learning Methods
Feature Extraction
Traditional methods rely on manually engineered text features; a common baseline is TF-IDF vectors fed into a linear classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Example data (Chinese movie-review snippets)
texts = [
    "这部电影非常好看,推荐观看",
    "剧情太无聊了,浪费时间",
    "演员演技很棒,故事感人",
    "特效很差,不推荐",
    "非常喜欢这部电影",
    "太失望了,期待很高但结果很差"
]
labels = ["positive", "negative", "positive", "negative", "positive", "negative"]
# Build a pipeline: TF-IDF features + logistic regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('clf', LogisticRegression())
])
# Train
pipeline.fit(texts, labels)
# Predict on new texts
new_texts = ["这部电影太棒了", "太无聊了"]
predictions = pipeline.predict(new_texts)
for text, pred in zip(new_texts, predictions):
    print(f"Text: {text} -> Sentiment: {pred}")
Comparing Multiple Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}
# Compare the classifiers with 3-fold cross-validation
for name, clf in classifiers.items():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', clf)
    ])
    scores = cross_val_score(pipeline, texts, labels, cv=3, scoring='accuracy')
    print(f"{name}: mean accuracy {scores.mean():.4f} (+/- {scores.std():.4f})")
Deep Learning Methods
Building a Text Classifier with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length=100):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Convert the text into a sequence of vocabulary indices
        indices = [self.vocab.get(w, self.vocab['<UNK>']) for w in text.split()]
        # Truncate or pad to max_length
        if len(indices) > self.max_length:
            indices = indices[:self.max_length]
        else:
            indices = indices + [self.vocab['<PAD>']] * (self.max_length - len(indices))
        return torch.tensor(indices), torch.tensor(label)
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # x: (batch_size, seq_len)
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        lstm_out, (hidden, _) = self.lstm(embedded)
        # Concatenate the final forward and backward hidden states of the top layer
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # (batch_size, hidden_dim * 2)
        hidden = self.dropout(hidden)
        output = self.fc(hidden)
        return output
# Build the vocabulary from whitespace-tokenized texts
def build_vocab(texts):
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for text in texts:
        for word in text.split():
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab
# Training loop
def train_model(model, dataloader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch_x, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}, Accuracy: {correct/total:.4f}")
# Usage example: a toy corpus, already word-segmented with spaces
texts = [
    "这部电影 非常 好看",
    "剧情 太 无聊 了",
    "演员 演技 很 棒",
    "特效 很 差",
    "非常 喜欢 这部 电影",
    "太 失望 了"
]
labels = [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
vocab = build_vocab(texts)
dataset = TextDataset(texts, labels, vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
model = TextClassifier(len(vocab), embed_dim=64, hidden_dim=128, num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_model(model, dataloader, criterion, optimizer, epochs=10)
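Once training finishes, inference reuses the same vocabulary lookup and padding logic as TextDataset. A minimal prediction sketch (the helper name predict_sentiment is ours, not part of the code above):

def predict_sentiment(model, vocab, text, max_length=100):
    model.eval()
    # Same index/pad preprocessing as TextDataset.__getitem__
    indices = [vocab.get(w, vocab['<UNK>']) for w in text.split()][:max_length]
    indices += [vocab['<PAD>']] * (max_length - len(indices))
    with torch.no_grad():
        logits = model(torch.tensor([indices]))  # add a batch dimension
    return 'positive' if logits.argmax(dim=1).item() == 1 else 'negative'

print(predict_sentiment(model, vocab, "非常 好看"))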
Text Classification with a CNN
TextCNN is a classic text classification architecture: it applies convolutional filters of several widths over the embedded token sequence to extract local n-gram features, followed by max-over-time pooling.
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Convolution kernels of several widths (n-gram detectors)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embed_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch_size, seq_len)
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        embedded = embedded.unsqueeze(1)  # (batch_size, 1, seq_len, embed_dim)
        # Convolution + max-over-time pooling for each filter width
        conv_outputs = []
        for conv in self.convs:
            conv_out = torch.relu(conv(embedded)).squeeze(3)  # (batch_size, num_filters, seq_len - fs + 1)
            pooled = torch.max(conv_out, dim=2)[0]  # (batch_size, num_filters)
            conv_outputs.append(pooled)
        # Concatenate the pooled features from all filter widths
        features = torch.cat(conv_outputs, dim=1)  # (batch_size, num_filters * len(filter_sizes))
        features = self.dropout(features)
        output = self.fc(features)
        return output
# Usage example
model = TextCNN(
    vocab_size=len(vocab),
    embed_dim=64,
    num_filters=100,
    filter_sizes=[3, 4, 5],
    num_classes=2
)
print(model)
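Because TextCNN exposes the same interface as the LSTM classifier above (a batch of index sequences in, class logits out), it can be trained with the same train_model loop and dataloader:

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_model(model, dataloader, criterion, optimizer, epochs=10)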
Text Classification with Transformers
Quick Classification with a Pipeline
from transformers import pipeline
# Sentiment analysis (downloads a default English model on first use)
classifier = pipeline("sentiment-analysis")
texts = [
    "I love this movie!",
    "This is terrible.",
    "It's okay, nothing special."
]
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"  Sentiment: {result['label']}, confidence: {result['score']:.4f}")
Chinese Sentiment Analysis
from transformers import pipeline
# Load a Chinese sentiment model. Note: uer/roberta-base-finetuned-chinanews-chinese
# is a news-topic classifier, not a sentiment model; the Dianping-review model below
# is the better fit for sentiment analysis.
classifier = pipeline("sentiment-analysis", model="uer/roberta-base-finetuned-dianping-chinese")
texts = [
    "这部电影非常好看,强烈推荐!",
    "太失望了,浪费时间和金钱。",
    "一般般吧,没有想象中那么好。"
]
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"  Sentiment: {result['label']}, confidence: {result['score']:.4f}")
Fine-tuning BERT for Text Classification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label)
        }
# Prepare the data
texts = [
    "这部电影非常好看",
    "剧情太无聊了",
    "演员演技很棒",
    "特效很差",
    "非常喜欢这部电影",
    "太失望了"
]
labels = [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
# Load the model and tokenizer
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Build the dataset
dataset = TextClassificationDataset(texts, labels, tokenizer)
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    logging_steps=10
)
# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)
trainer.train()
# Prediction helper
def predict(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return "positive" if predicted_class == 1 else "negative"
# Try it out
result = predict("这部电影真的很棒!")
print(f"Prediction: {result}")
Multi-label Classification
In multi-label classification a single sample can belong to several categories at once, so each label gets an independent sigmoid score instead of a single softmax over mutually exclusive classes.
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
class MultiLabelClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        return logits
# Usage example
model_name = "bert-base-chinese"
num_labels = 5  # Assume 5 labels: 科技 (tech), 体育 (sports), 财经 (finance), 娱乐 (entertainment), 教育 (education)
model = MultiLabelClassifier(model_name, num_labels)
tokenizer = BertTokenizer.from_pretrained(model_name)
# Multi-label prediction: an independent sigmoid per label plus a threshold.
# Note: the classifier head is randomly initialized here, so the outputs are
# meaningless until the model is trained (see the loss sketch below).
def predict_multi_label(text, threshold=0.5):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])
        probs = torch.sigmoid(logits)
        predictions = (probs > threshold).int()
    labels = ['科技', '体育', '财经', '娱乐', '教育']
    predicted_labels = [labels[i] for i, p in enumerate(predictions[0]) if p == 1]
    return predicted_labels
# Example
text = "科技公司发布新款智能手机"
labels = predict_multi_label(text)
print(f"Predicted labels: {labels}")
Evaluating Text Classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
def evaluate_classification(y_true, y_pred, labels=None):
    """Print standard classification metrics and plot a confusion matrix."""
    print("=" * 50)
    print("Classification Evaluation Report")
    print("=" * 50)
    # Headline metrics
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 score: {f1_score(y_true, y_pred, average='weighted'):.4f}")
    # Per-class report
    print("\nDetailed classification report:")
    print(classification_report(y_true, y_pred, target_names=labels))
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix')
    plt.show()
# Usage example
y_true = ['positive', 'negative', 'positive', 'negative', 'positive']
y_pred = ['positive', 'negative', 'negative', 'negative', 'positive']
evaluate_classification(y_true, y_pred, labels=['negative', 'positive'])
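The metrics above assume single-label predictions. For the multi-label setting from the previous section, scikit-learn accepts multi-hot indicator arrays directly; a brief sketch with hypothetical predictions:

from sklearn.metrics import f1_score, hamming_loss
import numpy as np

# Multi-hot ground truth and predictions (toy values), shape (n_samples, n_labels)
y_true_ml = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]])
y_pred_ml = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0]])
print(f"Micro F1: {f1_score(y_true_ml, y_pred_ml, average='micro'):.4f}")
print(f"Hamming loss: {hamming_loss(y_true_ml, y_pred_ml):.4f}")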
Hands-on: News Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx])
        }
def train_news_classifier():
    # Toy data (replace with a real dataset in practice)
    data = {
        'text': [
            "科技公司发布新款智能手机",
            "股市今日大涨,投资者信心增强",
            "国足在亚洲杯取得胜利",
            "新电影票房突破十亿",
            "人工智能技术取得重大突破",
            "央行宣布调整利率政策",
            "篮球比赛激烈进行",
            "明星演唱会门票售罄"
        ],
        'category': ['科技', '财经', '体育', '娱乐', '科技', '财经', '体育', '娱乐']
    }
    df = pd.DataFrame(data)
    # Encode the category names as integer label ids
    label2id = {label: i for i, label in enumerate(df['category'].unique())}
    id2label = {i: label for label, i in label2id.items()}
    df['label'] = df['category'].map(label2id)
    # Train/validation split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
    )
    # Load the model and tokenizer
    model_name = "bert-base-chinese"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )
    # Build the datasets
    train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer)
    # Training arguments (note: 'evaluation_strategy' is renamed 'eval_strategy' in recent transformers releases)
    training_args = TrainingArguments(
        output_dir='./news_classifier',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2e-5,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True
    )
    # Train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
    return model, tokenizer, id2label
# Prediction helper
def predict_news_category(text, model, tokenizer, id2label):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_id = torch.argmax(outputs.logits, dim=1).item()
    return id2label[predicted_id]
# Usage
# model, tokenizer, id2label = train_news_classifier()
# category = predict_news_category("新的人工智能模型发布", model, tokenizer, id2label)
# print(f"News category: {category}")
Summary
Text classification is one of the most fundamental NLP applications. This chapter covered:
- Traditional methods: TF-IDF features plus machine learning classifiers
- Deep learning methods: LSTM- and CNN-based models
- Pretrained models: text classification with BERT and similar models
- Multi-label classification: handling texts that belong to several categories at once
- Evaluation: accuracy, precision, recall, F1 score, and related metrics
Text classification is widely used in sentiment analysis, spam detection, news categorization, and many other scenarios. The next chapter introduces pretrained language models.