跳到主要内容

Scikit-learn 知识速查表

本页面汇总了 scikit-learn 的核心知识点,方便快速查阅。

常用算法速查

分类算法

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

回归算法

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

聚类算法

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

降维算法

from sklearn.decomposition import PCA

数据加载

# 内置数据集
from sklearn.datasets import load_iris, load_digits, load_diabetes

# 生成数据
from sklearn.datasets import make_classification, make_regression, make_blobs

数据划分

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

数据预处理

# Standardization
# NOTE(review): the scaler is fitted on all of X here; when a train/test split
# is used, fit on X_train only and reuse the fitted scaler on X_test so that
# test-set statistics do not leak into training.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalization (min-max scaling)
# `scaler` is rebound below, so the fitted StandardScaler above is no longer
# reachable after this snippet runs.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Label encoding of the target values
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# One-hot encoding
# `sparse_output` is the parameter name in scikit-learn >= 1.2; earlier
# releases call it `sparse`. `encoder` is rebound here, replacing the
# LabelEncoder fitted above.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)

模型训练和预测

# Create the model
clf = LogisticRegression()

# Train on the training split
clf.fit(X_train, y_train)

# Predict labels for the test split
y_pred = clf.predict(X_test)

# Predict probabilities for the test split
y_prob = clf.predict_proba(X_test)

模型评估

from sklearn.metrics import (
    accuracy_score,         # accuracy
    precision_score,        # precision
    recall_score,           # recall
    f1_score,               # F1 score
    confusion_matrix,       # confusion matrix
    classification_report,  # classification report
)

# Basic evaluation: compare the test labels with the predictions
accuracy = accuracy_score(y_test, y_pred)

# Full report, printed to stdout
print(classification_report(y_test, y_pred))

交叉验证

from sklearn.model_selection import cross_val_score

# Cross-validate clf on the full X, y with cv=5, scored by accuracy
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
# Print the mean of the returned scores, formatted to two decimal places
print(f"平均准确率: {scores.mean():.2f}")

超参数调优

from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; the search tries every combination.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
}

# Grid search over a random forest with cv=5, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the training split and print the best combination found.
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")

管道 Pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Chain scaling and the SVM into one estimator so both steps are fitted
# together on the training split and applied together at prediction time.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC()),
    ]
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

特征选择

from sklearn.feature_selection import (
    VarianceThreshold,
    SelectKBest,
    f_classif,
)

# Variance threshold: filter out low-variance features (threshold 0.1)
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)

# K best features: keep 10 features scored with f_classif against y.
# `selector` and `X_selected` are rebound here, replacing the results above.
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)

常用数据集信息

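| 数据集 | 样本数 | 特征数 | 任务类型 |
| --- | --- | --- | --- |
| Iris | 150 | 4 | 多分类 |
| Digits | 1797 | 64 | 多分类 |
| Diabetes | 442 | 10 | 回归 |
| Wine | 178 | 13 | 多分类 |
| Breast Cancer | 569 | 30 | 二分类 |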

版本信息

import sklearn
print(sklearn.__version__) # Print the installed scikit-learn version

官方资源