跳到主要内容

Scikit-learn 知识速查表

本页面汇总了 scikit-learn 的核心知识点,方便快速查阅。

核心概念速查

估计器(Estimator)接口

所有机器学习算法的统一接口:

方法说明
fit(X, y)训练模型
predict(X)预测结果
score(X, y)评估模型
predict_proba(X)预测概率(部分模型支持)

转换器(Transformer)接口

数据预处理的统一接口:

方法说明
fit(X)学习转换参数
transform(X)应用转换
fit_transform(X)一步完成学习和转换

常用导入

# 分类算法
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
AdaBoostClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# 回归算法
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# 聚类算法
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# 降维算法
from sklearn.decomposition import PCA, TruncatedSVD, NMF, FactorAnalysis
from sklearn.manifold import TSNE

# 预处理
from sklearn.preprocessing import (
StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
LabelEncoder, OneHotEncoder, OrdinalEncoder,
Normalizer, PowerTransformer, QuantileTransformer
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion

# 特征提取
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.feature_extraction.text import (
CountVectorizer, TfidfVectorizer, TfidfTransformer, HashingVectorizer
)

# 模型选择
from sklearn.model_selection import (
train_test_split,
cross_val_score,
GridSearchCV,
RandomizedSearchCV,
StratifiedKFold
)

# 模型校准
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# 评估指标
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report,
mean_squared_error, mean_absolute_error, r2_score,
silhouette_score, adjusted_rand_score,
brier_score_loss, log_loss
)

# 流水线
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 高斯过程
from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
from sklearn.gaussian_process.kernels import (
RBF, Matern, RationalQuadratic, ExpSineSquared,
DotProduct, WhiteKernel, ConstantKernel
)

# 多输出学习
from sklearn.multioutput import (
MultiOutputClassifier, MultiOutputRegressor,
ClassifierChain, RegressorChain
)
from sklearn.multiclass import (
OneVsRestClassifier, OneVsOneClassifier,
OutputCodeClassifier
)

数据处理

数据加载

# 内置数据集
from sklearn.datasets import load_iris, load_digits, load_diabetes, load_wine

iris = load_iris()
X, y = iris.data, iris.target

# 生成数据
from sklearn.datasets import make_classification, make_regression, make_blobs

# 分类数据
X, y = make_classification(
n_samples=1000, n_features=20, n_informative=15,
n_classes=3, random_state=42
)

# 回归数据
X, y = make_regression(
n_samples=1000, n_features=20, n_informative=15,
noise=0.1, random_state=42
)

# 聚类数据
X, y = make_blobs(
n_samples=300, centers=4, cluster_std=0.8, random_state=42
)

数据划分

from sklearn.model_selection import train_test_split

# 基本划分
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)

# 分层划分(推荐用于分类)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)

特征缩放

# 标准化(均值0,标准差1)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # 用训练集的参数

# 归一化(缩放到[0,1])
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# 稳健缩放(对异常值不敏感)
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

类别编码

# 标签编码(用于目标变量)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 有序编码(用于有序类别)
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['低', '中', '高']])
X_encoded = oe.fit_transform(X)

# 独热编码(用于无序类别)
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = ohe.fit_transform(X)

缺失值处理

from sklearn.impute import SimpleImputer

# 均值填充
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# 中位数填充
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# 众数填充(类别特征)
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

模型训练与预测

基本流程

from sklearn.ensemble import RandomForestClassifier

# 1. 创建模型
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# 2. 训练
clf.fit(X_train, y_train)

# 3. 预测
y_pred = clf.predict(X_test)

# 4. 评估
accuracy = clf.score(X_test, y_test)

使用 Pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

处理混合类型特征

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 定义列类型
numeric_features = ['age', 'income']
categorical_features = ['city', 'gender']

# 创建预处理器
preprocessor = ColumnTransformer([
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# 与模型组合
pipeline = Pipeline([
('preprocessor', preprocessor),
('clf', RandomForestClassifier())
])

模型评估

分类指标

from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report
)

# 基本指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# 混淆矩阵
cm = confusion_matrix(y_test, y_pred)

# 完整报告
print(classification_report(y_test, y_pred))

回归指标

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

聚类指标

from sklearn.metrics import silhouette_score, adjusted_rand_score

# 无标签评估
score = silhouette_score(X, labels)

# 有标签评估
ari = adjusted_rand_score(y_true, labels)

交叉验证

from sklearn.model_selection import cross_val_score, cross_validate

# 简单交叉验证
scores = cross_val_score(clf, X, y, cv=5)
print(f"平均得分: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

# 多指标评估
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
results = cross_validate(clf, X, y, cv=5, scoring=scoring)

超参数调优

网格搜索

from sklearn.model_selection import GridSearchCV

param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 10, None],
'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
RandomForestClassifier(random_state=42),
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1
)

grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳得分: {grid_search.best_score_:.3f}")

best_model = grid_search.best_estimator_

随机搜索

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
'n_estimators': randint(50, 300),
'max_depth': [3, 5, 10, None],
'min_samples_split': randint(2, 20)
}

random_search = RandomizedSearchCV(
RandomForestClassifier(random_state=42),
param_dist,
n_iter=50,
cv=5,
n_jobs=-1,
random_state=42
)

特征选择

from sklearn.feature_selection import (
VarianceThreshold,
SelectKBest,
f_classif,
RFE,
SelectFromModel
)

# 方差阈值
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)

# K 最佳特征
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# 递归特征消除
from sklearn.linear_model import LogisticRegression
selector = RFE(LogisticRegression(), n_features_to_select=10)
X_selected = selector.fit_transform(X, y)

# 基于模型的特征选择
from sklearn.ensemble import RandomForestClassifier
selector = SelectFromModel(RandomForestClassifier(), threshold='median')
X_selected = selector.fit_transform(X, y)

降维

from sklearn.decomposition import PCA

# PCA
pca = PCA(n_components=0.95) # 保留 95% 方差
X_pca = pca.fit_transform(X)

# 查看解释方差
print(pca.explained_variance_ratio_)

# t-SNE(用于可视化)
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X)

特征提取

文本特征提取

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 词袋模型
count_vec = CountVectorizer(
max_features=5000, # 最大特征数
ngram_range=(1, 2), # 1-gram 和 2-gram
stop_words='english', # 移除停用词
min_df=2 # 最小文档频率
)
X_counts = count_vec.fit_transform(texts)

# TF-IDF
tfidf_vec = TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2)
)
X_tfidf = tfidf_vec.fit_transform(texts)

# 获取词汇表
print(tfidf_vec.get_feature_names_out())

# 大规模文本:使用哈希向量化
from sklearn.feature_extraction.text import HashingVectorizer
hash_vec = HashingVectorizer(n_features=2**20)
X_hash = hash_vec.transform(texts) # 无需 fit

字典特征提取

from sklearn.feature_extraction import DictVectorizer

# 字典数据
data = [{'city': '北京', 'age': 25}, {'city': '上海', 'age': 30}]

vec = DictVectorizer()
X = vec.fit_transform(data)
print(vec.get_feature_names_out()) # ['age', 'city=上海', 'city=北京'](特征名默认按字符排序)

模型校准

from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import brier_score_loss

# 校准模型
calibrated = CalibratedClassifierCV(
model,
method='sigmoid', # 'sigmoid' 或 'isotonic'
cv=5
)
calibrated.fit(X_train, y_train)

# 获取校准后的概率
y_proba = calibrated.predict_proba(X_test)[:, 1]

# 评估校准质量
brier = brier_score_loss(y_test, y_proba) # 越小越好

# 绘制校准曲线
prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)

校准方法选择

  • sigmoid:样本量小(<1000),校准误差对称
  • isotonic:样本量大(>1000),更灵活但易过拟合
  • temperature:多分类问题,只有一个参数(需要 scikit-learn ≥ 1.8)

模型持久化

import joblib

# 保存模型
joblib.dump(model, 'model.joblib')

# 保存带压缩
joblib.dump(model, 'model.joblib', compress=3)

# 加载模型
model = joblib.load('model.joblib')

# 保存 Pipeline(推荐)
joblib.dump(pipeline, 'pipeline.joblib')
pipeline = joblib.load('pipeline.joblib')

# 安全性:只加载可信来源的模型
# pickle/joblib 可以执行任意代码

持久化最佳实践

import json
import sklearn
from datetime import datetime

# 保存模型时记录元数据
def save_model_with_metadata(model, filepath):
    """Persist ``model`` with joblib and write a JSON metadata sidecar.

    The sidecar is written to ``filepath + '.meta'`` and records the
    installed scikit-learn version, the save time in ISO 8601 form, and the
    class name of ``model``.

    Args:
        model: Object to serialize; passed unchanged to ``joblib.dump``.
        filepath: Destination path of the model file, as a string.
    """
    joblib.dump(model, filepath)
    metadata = {
        'sklearn_version': sklearn.__version__,
        'timestamp': datetime.now().isoformat(),
        'model_type': type(model).__name__,
    }
    # Explicit encoding so the sidecar does not depend on the platform's
    # default text encoding.
    with open(filepath + '.meta', 'w', encoding='utf-8') as f:
        json.dump(metadata, f)

# 加载时验证版本
def load_model_with_check(filepath):
    """Load a joblib model and warn if its recorded sklearn version differs.

    Reads the JSON sidecar at ``filepath + '.meta'`` and compares its
    ``'sklearn_version'`` entry with the installed ``sklearn.__version__``.
    A mismatch only prints a warning; the model is returned either way.

    Args:
        filepath: Path of the model file, as a string.

    Returns:
        The object returned by ``joblib.load(filepath)``.
    """
    # SECURITY: joblib/pickle can execute arbitrary code while loading;
    # only call this on files from a trusted source.
    model = joblib.load(filepath)
    # Explicit encoding so reading does not depend on the platform's
    # default text encoding.
    with open(filepath + '.meta', 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    if metadata['sklearn_version'] != sklearn.__version__:
        print(f"警告:版本不一致 {metadata['sklearn_version']} vs {sklearn.__version__}")
    return model

常用数据集

数据集样本数特征数类别数任务类型
Iris15043多分类
Digits17976410多分类
Wine178133多分类
Breast Cancer569302二分类
Diabetes44210-回归

算法选择指南

分类算法选择

场景推荐算法
基线模型逻辑回归
需要可解释性决策树、逻辑回归
追求准确率随机森林、梯度提升
小数据集SVM、KNN
大数据集逻辑回归、随机森林
稀疏特征朴素贝叶斯

聚类算法选择

场景推荐算法
球形簇K-Means
任意形状DBSCAN
需要层次结构层次聚类
大数据集Mini-Batch K-Means

降维方法选择

场景推荐方法
通用降维PCA
可视化t-SNE
稀疏数据截断 SVD
非负数据NMF

模型解释性

特征重要性

from sklearn.ensemble import RandomForestClassifier

# 训练模型
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 内置特征重要性
importances = rf.feature_importances_

# 可视化
import matplotlib.pyplot as plt
import numpy as np

indices = np.argsort(importances)[::-1]
plt.barh(range(len(importances)), importances[indices])
plt.yticks(range(len(importances)), [feature_names[i] for i in indices])
plt.xlabel('重要性')
plt.show()

排列重要性

from sklearn.inspection import permutation_importance

# 计算排列重要性
result = permutation_importance(
model, X_test, y_test,
n_repeats=10,
random_state=42
)

# 结果
print("重要性均值:", result.importances_mean)
print("重要性标准差:", result.importances_std)

部分依赖图

from sklearn.inspection import PartialDependenceDisplay

# 单特征部分依赖图
display = PartialDependenceDisplay.from_estimator(
model, X_train, [0, 1], # 特征索引
feature_names=feature_names
)

# 二维交互图
display = PartialDependenceDisplay.from_estimator(
model, X_train, [(0, 1)], # 特征对
feature_names=feature_names
)

# ICE 图(个体条件期望)
display = PartialDependenceDisplay.from_estimator(
model, X_train, [0],
kind='individual', # 或 'both' 同时显示 PDP
feature_names=feature_names
)

半监督学习

自训练

from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier

# 创建半监督标签:-1 表示无标签
y_semi = y_train.copy()
y_semi[unlabeled_mask] = -1

# 自训练分类器
self_training = SelfTrainingClassifier(
RandomForestClassifier(n_estimators=100),
threshold=0.9, # 置信度阈值
max_iter=10 # 最大迭代次数
)
self_training.fit(X_train, y_semi)

标签传播

from sklearn.semi_supervised import LabelPropagation, LabelSpreading

# 标签传播
lp = LabelPropagation(kernel='rbf', gamma=20)
lp.fit(X_train, y_semi)
y_pred = lp.predict(X_test)

# 标签扩散(更稳健)
ls = LabelSpreading(kernel='rbf', gamma=20, alpha=0.2)
ls.fit(X_train, y_semi)
y_pred = ls.predict(X_test)

高斯过程

高斯过程回归

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel

# 定义核函数
kernel = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)

# 创建模型
gpr = GaussianProcessRegressor(
kernel=kernel,
alpha=0.1, # 噪声水平
n_restarts_optimizer=10, # 优化器重启次数
random_state=42
)

# 训练
gpr.fit(X_train, y_train)

# 预测(包含不确定性)
y_pred, y_std = gpr.predict(X_test, return_std=True)

# 从后验采样(模型已 fit;未拟合时 sample_y 才是从先验采样)
samples = gpr.sample_y(X_test, n_samples=5)

高斯过程分类

from sklearn.gaussian_process import GaussianProcessClassifier

# 创建模型
gpc = GaussianProcessClassifier(
kernel=RBF(length_scale=1.0),
multi_class='one_vs_rest', # 或 'one_vs_one'
random_state=42
)

# 训练
gpc.fit(X_train, y_train)

# 预测概率
y_prob = gpc.predict_proba(X_test)

常用核函数

from sklearn.gaussian_process.kernels import (
RBF, # 径向基函数(平滑)
Matern, # Matérn核(可调平滑度)
RationalQuadratic, # 有理二次核(多尺度)
ExpSineSquared, # 周期核
DotProduct, # 点积核(线性)
WhiteKernel, # 白噪声核
ConstantKernel # 常数核
)

# RBF核(最常用)
rbf = RBF(length_scale=1.0)

# Matérn核(控制平滑度)
matern = Matern(length_scale=1.0, nu=1.5) # nu=0.5,1.5,2.5

# 周期核
periodic = ExpSineSquared(length_scale=1.0, periodicity=2*np.pi)

# 核函数组合
kernel = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)

核函数选择指南

数据特点推荐核函数
平滑函数RBF
有突变或不连续Matérn (低 ν)
周期性数据ExpSineSquared
多尺度变化RationalQuadratic
线性趋势DotProduct 或 RBF + DotProduct

多输出学习

多标签分类

from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.preprocessing import MultiLabelBinarizer

# 多标签目标格式
y_multi = np.array([[1, 0, 1], [0, 1, 1], [0, 0, 0]]) # 每行是一个样本的多个标签

# MultiOutputClassifier(独立训练)
clf = MultiOutputClassifier(RandomForestClassifier(), n_jobs=-1)
clf.fit(X_train, y_multi)
y_pred = clf.predict(X_test)

# 分类器链(利用标签相关性)
chain = ClassifierChain(LogisticRegression(), order='random', random_state=42)
chain.fit(X_train, y_multi)
y_pred = chain.predict(X_test)

多输出回归

from sklearn.multioutput import MultiOutputRegressor, RegressorChain

# 原生支持多输出的模型:RandomForestRegressor, LinearRegression, Ridge等

# MultiOutputRegressor(用于不原生支持的模型)
mor = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=-1)
mor.fit(X_train, Y_train) # Y_train 是 (n_samples, n_outputs)
Y_pred = mor.predict(X_test)

# 回归器链
chain = RegressorChain(Ridge(), order=[1, 0, 2])
chain.fit(X_train, Y_train)
Y_pred = chain.predict(X_test)

多分类策略

from sklearn.multiclass import (
OneVsRestClassifier,
OneVsOneClassifier,
OutputCodeClassifier
)

# 一对多策略(推荐默认使用)
ovr = OneVsRestClassifier(LinearSVC())
ovr.fit(X_train, y_train)

# 一对一策略(核方法推荐)
ovo = OneVsOneClassifier(LinearSVC())
ovo.fit(X_train, y_train)

# 误差纠错输出码
occ = OutputCodeClassifier(LinearSVC(), code_size=2)
occ.fit(X_train, y_train)

多标签评估指标

from sklearn.metrics import (
accuracy_score, # 严格准确率
hamming_loss, # 汉明损失
f1_score, # F1分数
precision_score, # 精确率
recall_score # 召回率
)

# 严格准确率(所有标签都正确)
acc = accuracy_score(y_true, y_pred)

# 汉明损失(错误标签比例)
hl = hamming_loss(y_true, y_pred)

# F1分数
f1 = f1_score(y_true, y_pred, average='micro') # 或 'macro', 'samples'

异常检测

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Isolation Forest
iso = IsolationForest(contamination=0.1, random_state=42)
y_pred = iso.fit_predict(X) # -1 为异常,1 为正常

# LOF
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = lof.fit_predict(X)

# One-Class SVM
ocsvm = OneClassSVM(nu=0.05, kernel='rbf')
ocsvm.fit(X_train) # 训练集应为正常数据
y_pred = ocsvm.predict(X_test)

自定义评估器

自定义转换器

from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Example transformer that subtracts a scaled mean from the input.

    ``fit`` stores the mean of ``X`` along axis 0 as ``mean_``;
    ``transform`` returns ``X - mean_ * param``.

    Args:
        param: Multiplier applied to the learned mean before subtraction.
    """

    def __init__(self, param=1.0):
        self.param = param

    def fit(self, X, y=None):
        """Store the axis-0 mean of ``X`` and return ``self``.

        ``y`` is accepted but not used.
        """
        self.mean_ = X.mean(axis=0)
        return self

    def transform(self, X):
        """Return ``X`` with ``mean_ * param`` subtracted."""
        return X - self.mean_ * self.param

# 使用
pipeline = Pipeline([
('custom', CustomTransformer(param=0.5)),
('clf', LogisticRegression())
])

自定义评分函数

from sklearn.metrics import make_scorer

def custom_score(y_true, y_pred):
    """Custom scoring function: recall of the class labelled ``1``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The value of ``recall_score(y_true, y_pred, pos_label=1)``.
    """
    # Imported inside the function, as in the original snippet.
    from sklearn.metrics import recall_score
    return recall_score(y_true, y_pred, pos_label=1)

# 创建评分器
scorer = make_scorer(custom_score, greater_is_better=True)

# 在交叉验证中使用
cross_val_score(clf, X, y, cv=5, scoring=scorer)

学习曲线与验证曲线

学习曲线

from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
estimator,
X, y,
train_sizes=np.linspace(0.1, 1.0, 10),
cv=5
)

# 可视化
plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='训练得分')
plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label='验证得分')
plt.xlabel('训练样本数')
plt.ylabel('得分')
plt.legend()
plt.show()

验证曲线

from sklearn.model_selection import validation_curve

param_range = [1, 3, 5, 10, 20]
train_scores, test_scores = validation_curve(
estimator,
X, y,
param_name='max_depth',
param_range=param_range,
cv=5
)

# 可视化
plt.plot(param_range, train_scores.mean(axis=1), 'o-', label='训练得分')
plt.plot(param_range, test_scores.mean(axis=1), 'o-', label='验证得分')
plt.xlabel('max_depth')
plt.ylabel('得分')
plt.legend()
plt.show()

常见问题速查

类别不平衡处理

# 方法1:调整类别权重
clf = LogisticRegression(class_weight='balanced')

# 方法2:过采样/欠采样(需要 imbalanced-learn)
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

避免数据泄露

# 错误:在划分前预处理
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # 泄露测试集信息
X_train, X_test = train_test_split(X_scaled, ...)

# 正确:使用 Pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression())
])
cross_val_score(pipeline, X, y, cv=5)

设置随机种子保证可复现

import numpy as np
import random

# 设置所有随机种子
np.random.seed(42)
random.seed(42)

# sklearn 模型
clf = RandomForestClassifier(random_state=42)

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)

性能优化技巧

并行计算

# 模型并行训练
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1) # 使用所有CPU核心

# 交叉验证并行
cross_val_score(clf, X, y, cv=5, n_jobs=-1)

# 网格搜索并行
GridSearchCV(clf, param_grid, cv=5, n_jobs=-1)

增量学习

from sklearn.linear_model import SGDClassifier

# 创建支持增量学习的模型
clf = SGDClassifier(random_state=42)

# 分批训练
for batch_X, batch_y in data_batches:
clf.partial_fit(batch_X, batch_y, classes=[0, 1, 2])

内存优化

# 使用稀疏矩阵
from scipy import sparse
X_sparse = sparse.csr_matrix(X)

# 降低精度
X = X.astype(np.float32)

# 使用稀疏数据的缩放器
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_sparse)

官方资源