Scikit-learn 知识速查表
本页面汇总了 scikit-learn 的核心知识点,方便快速查阅。
常用算法速查
分类算法
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
回归算法
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
聚类算法
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
降维算法
from sklearn.decomposition import PCA
数据加载
# 内置数据集
from sklearn.datasets import load_iris, load_digits, load_diabetes
# 生成数据
from sklearn.datasets import make_classification, make_regression, make_blobs
数据划分
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
数据预处理
# 标准化(将每个特征缩放为均值 0、方差 1)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 归一化(默认将每个特征缩放到 [0, 1] 区间)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
# 标签编码(将类别标签转换为从 0 开始的整数)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
# 独热编码
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)
模型训练和预测
# 创建模型
clf = LogisticRegression()
# 训练
clf.fit(X_train, y_train)
# 预测
y_pred = clf.predict(X_test)
# 预测概率
y_prob = clf.predict_proba(X_test)
模型评估
from sklearn.metrics import (
accuracy_score, # 准确率
precision_score, # 精确率
recall_score, # 召回率
f1_score, # F1 分数
confusion_matrix, # 混淆矩阵
classification_report # 分类报告
)
# 基本评估
accuracy = accuracy_score(y_test, y_pred)
# 完整报告
print(classification_report(y_test, y_pred))
交叉验证
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(f"平均准确率: {scores.mean():.2f}")
超参数调优
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(
RandomForestClassifier(),
param_grid,
cv=5,
scoring='accuracy'
)
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
管道 Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
pipeline = Pipeline([
('scaler', StandardScaler()),
('svm', SVC())
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
特征选择
from sklearn.feature_selection import (
VarianceThreshold,
SelectKBest,
f_classif
)
# 方差阈值(移除方差低于阈值的特征)
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)
# 选择得分最高的 K 个特征
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
常用数据集信息
| 数据集 | 样本数 | 特征数 | 任务类型 |
|---|---|---|---|
| Iris | 150 | 4 | 多分类 |
| Digits | 1797 | 64 | 多分类 |
| Diabetes | 442 | 10 | 回归 |
| Wine | 178 | 13 | 多分类 |
| Breast Cancer | 569 | 30 | 二分类 |
版本信息
import sklearn
print(sklearn.__version__) # 显示版本