Python 数据分析速查表

本文档汇总了 Python 数据分析中最常用的命令和技巧，方便快速查阅。

NumPy 速查

数组创建

import numpy as np

# 从列表创建
arr = np.array([1, 2, 3])

# 创建特殊数组
np.zeros(5)           # 全零数组
np.ones(5)            # 全一数组
np.full(5, 7)        # 指定值数组
np.arange(10)         # 序列数组 [0,1,2,...,9]
np.linspace(0, 10, 5) # 等间距数组
np.eye(3)            # 单位矩阵

# 随机数组
np.random.rand(5)            # 0-1均匀分布
np.random.randn(5)           # 标准正态分布
np.random.randint(0, 10, 5)  # [0, 10) 内的随机整数
np.random.choice([1,2,3], 5) # 随机选择

数组属性

arr.ndim        # 维度数量
arr.shape       # 形状，例如 (2, 3)
arr.size        # 元素总数
arr.dtype       # 数据类型
arr.itemsize    # 元素字节数

数组索引

arr[0]           # 单个元素
arr[-1]          # 最后一个元素
arr[1:4]         # 切片
arr[arr > 3]     # 布尔索引
arr[[0, 2, 4]]   # 整数数组索引

# 二维数组
arr[0]           # 第一行
arr[0, 0]         # 第一行第一列
arr[:, 0]        # 第一列
arr[0:2, 0:2]    # 子矩阵

数组运算

# 算术运算
arr + 1          # 加
arr - 1          # 减
arr * 2          # 乘
arr / 2          # 除
arr ** 2          # 幂

# 聚合函数
arr.sum()        # 求和
arr.mean()       # 平均值
arr.std()        # 标准差
arr.min()        # 最小值
arr.max()        # 最大值

# 按轴运算
arr.sum(axis=0)  # 按列
arr.sum(axis=1)  # 按行

数组操作

arr.reshape(3, 4)  # 改变形状
arr.flatten()       # 展平
arr.T               # 转置
np.concatenate([a, b])  # 拼接
np.split(arr, 3)    # 分割

Pandas 速查

DataFrame 创建

import pandas as pd

# 从字典创建
df = pd.DataFrame({
    'name': ['张三', '李四'],
    'age': [25, 30]
})

# 从CSV读取
df = pd.read_csv('file.csv')

# 从Excel读取
df = pd.read_excel('file.xlsx')

# 从字典列表创建
data = [{'name': '张三', 'age': 25}]
df = pd.DataFrame(data)

数据查看

df.head()        # 前5行
df.tail()        # 后5行
df.shape         # 形状 (行, 列)
df.columns       # 列名
df.dtypes        # 数据类型
df.info()        # 信息摘要
df.describe()   # 统计摘要

数据选择

# 选择列
df['name']            # 单列 (Series)
df[['name', 'age']]  # 多列 (DataFrame)

# 选择行
df.loc[0]            # 按标签
df.iloc[0]           # 按位置

# 条件筛选
df[df['age'] > 25]
df.query('age > 25')
df[df['name'].isin(['张三', '李四'])]

数据操作

# 添加列
df['new_col'] = df['age'] * 2

# 删除列
df.drop('col', axis=1)
df.drop(columns=['a', 'b'])

# 删除行
df.drop(0)

# 排序
df.sort_values('age')
df.sort_index()

# 去重
df.drop_duplicates()

缺失值处理

df.isnull()           # 检测缺失值
df.isnull().sum()     # 缺失值数量
df.fillna(0)          # 填充缺失值
df.dropna()           # 删除缺失值
df.interpolate()      # 插值填充

分组聚合

# 分组
df.groupby('column')

# 聚合
df.groupby('column')['value'].sum()
df.groupby('column')['value'].agg(['sum', 'mean', 'count'])

# 透视表
pd.pivot_table(df, values='value', index='row', columns='col')

数据合并

# 连接
pd.concat([df1, df2])           # 纵向拼接
pd.concat([df1, df2], axis=1)  # 横向拼接

# 合并
pd.merge(df1, df2, on='key')           # 内连接
pd.merge(df1, df2, on='key', how='left')  # 左连接

数据读写

# CSV
df.to_csv('file.csv', index=False)

# Excel
df.to_excel('file.xlsx', sheet_name='Sheet1')

# JSON
df.to_json('file.json', orient='records')

Matplotlib 速查

基础图表

import matplotlib.pyplot as plt

# 折线图
plt.plot(x, y)

# 散点图
plt.scatter(x, y)

# 柱状图
plt.bar(x, height)

# 直方图
plt.hist(data, bins=30)

# 饼图
plt.pie(sizes, labels=labels)

# 箱线图
plt.boxplot(data)

图表元素

plt.title('标题', fontsize=16)
plt.xlabel('X轴标签')
plt.ylabel('Y轴标签')
plt.legend()           # 图例
plt.grid(True)         # 网格
plt.xlim(0, 10)        # X轴范围
plt.ylim(0, 10)        # Y轴范围
plt.xticks([0, 5, 10], ['零', '五', '十'])  # 刻度

样式和颜色

# 颜色
plt.plot(x, y, color='red')
plt.plot(x, y, c='#FF5733')
plt.plot(x, y, color='0.5')  # 灰度

# 线条样式
plt.plot(x, y, linestyle='--')  # 虚线
plt.plot(x, y, linewidth=2)      # 线宽
plt.plot(x, y, marker='o')      # 标记

# 子图
fig, axes = plt.subplots(2, 2)  # 2x2子图
axes[0, 0].plot(x, y)

保存图表

plt.savefig('figure.png', dpi=300, bbox_inches='tight')
plt.savefig('figure.pdf')

Seaborn 速查

常用图表

import seaborn as sns

# 散点图
sns.scatterplot(data=df, x='x', y='y', hue='category')

# 线图
sns.lineplot(data=df, x='x', y='y', hue='category')

# 直方图
sns.histplot(data=df, x='col', hue='category', kde=True)

# 箱线图
sns.boxplot(data=df, x='category', y='value')

# 小提琴图
sns.violinplot(data=df, x='category', y='value')

# 条形图
sns.barplot(data=df, x='category', y='value')

# 热力图
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

# 配对图
sns.pairplot(df, hue='category')

# 联合分布图
sns.jointplot(data=df, x='x', y='y', kind='scatter')

样式设置

# 设置主题
sns.set_theme(style='whitegrid')  # white, dark, whitegrid, darkgrid, ticks

# 设置调色板
sns.set_palette('husl')  # deep, muted, pastel, bright, dark, colorblind

# 设置上下文
sns.set_context('notebook')  # paper, notebook, talk, poster

多面板图

# relplot：关系图多面板
sns.relplot(data=df, x='x', y='y', col='cat1', row='cat2')

# displot：分布图多面板
sns.displot(data=df, x='col', col='category', kde=True)

# catplot：分类图多面板
sns.catplot(data=df, x='cat', y='value', col='group', kind='box')

# lmplot：回归图多面板
sns.lmplot(data=df, x='x', y='y', col='category')

数据合并速查

concat 拼接

# 纵向拼接
pd.concat([df1, df2])

# 横向拼接
pd.concat([df1, df2], axis=1)

# 忽略索引
pd.concat([df1, df2], ignore_index=True)

# 添加层级标识
pd.concat([df1, df2], keys=['A', 'B'])

# 内连接（只保留共有列）
pd.concat([df1, df2], join='inner')

merge 连接

# 内连接
pd.merge(df1, df2, on='key')

# 左连接
pd.merge(df1, df2, on='key', how='left')

# 右连接
pd.merge(df1, df2, on='key', how='right')

# 外连接
pd.merge(df1, df2, on='key', how='outer')

# 不同列名连接
pd.merge(df1, df2, left_on='key1', right_on='key2')

# 多列连接
pd.merge(df1, df2, on=['key1', 'key2'])

# 基于索引连接
pd.merge(df1, df2, left_index=True, right_index=True)

# 验证连接关系
pd.merge(df1, df2, on='key', validate='one_to_one')

join 方法

# 基于索引连接
df1.join(df2)

# 指定连接方式
df1.join(df2, how='outer')

# 连接多个DataFrame
df1.join([df2, df3])

merge_ordered 和 merge_asof

# 有序合并
pd.merge_ordered(df1, df2, on='key', fill_method='ffill')

# 近似匹配（时间序列对齐）
pd.merge_asof(trades, quotes, on='time', direction='backward')

时间序列速查

创建时间序列

# 生成日期范围
pd.date_range('2024-01-01', periods=10, freq='D')

# 字符串转日期
pd.to_datetime(['2024-01-01', '2024-01-02'])

# 指定格式
pd.to_datetime(dates, format='%Y-%m-%d')

# 处理无效日期
pd.to_datetime(dates, errors='coerce')  # 无效值变NaT

时间属性

# 访问时间属性
df.index.year        # 年
df.index.month       # 月
df.index.day         # 日
df.index.hour        # 小时
df.index.dayofweek   # 周几 (0-6，0=周一)
df.index.day_name()  # 周几名称
df.index.quarter     # 季度

重采样

# 降采样（日->月）
df.resample('ME').mean()
df.resample('ME').agg({'col': ['sum', 'mean']})

# 升采样（日->小时）
df.resample('h').asfreq()
df.resample('h').ffill()     # 前向填充
df.resample('h').interpolate()  # 插值

# 频率别名
# D-天, h-小时, min-分钟, s-秒
# W-周, ME-月末, MS-月初, QE-季末, YE-年末
# B-工作日, BME-月末工作日

时间偏移

# 使用 Timedelta
ts + pd.Timedelta(days=3)
ts - pd.Timedelta(hours=2)

# 使用 DateOffset
from pandas.tseries.offsets import BDay, MonthEnd
ts + BDay(1)        # 下一个工作日
ts + MonthEnd(0)    # 移动到月末

# shift 移动数据
df.shift(1)         # 数据下移一行（索引不变，首行变为 NaN）
df.shift(1, freq='D')  # 索引整体后移一天（数据不变）

滚动窗口

# 移动平均
df.rolling(window=5).mean()

# 移动标准差
df.rolling(window=5).std()

# 指定最小观测数
df.rolling(window=5, min_periods=3).mean()

# 基于时间的窗口
df.rolling('5D').mean()

# 指数加权移动平均
df.ewm(span=5).mean()

# 扩展窗口
df.expanding().mean()

时区处理

# 本地化时区
ts.tz_localize('UTC')

# 转换时区
ts.tz_convert('Asia/Shanghai')

# 创建带时区的时间序列
pd.date_range('2024-01-01', periods=5, tz='Asia/Shanghai')

常用技巧

数据类型转换

# 转换为数值
pd.to_numeric(df['col'])
df['col'].astype(int)

# 转换为日期
pd.to_datetime(df['date'])

# 转换为字符串
df['col'].astype(str)

字符串操作

df['col'].str.lower()       # 转小写
df['col'].str.upper()       # 转大写
df['col'].str.strip()       # 去除空白
df['col'].str.replace('a', 'b')  # 替换
df['col'].str.contains('a') # 包含
df['col'].str.split(',')    # 分割

条件逻辑

# if-else
df['new_col'] = df['col'].apply(lambda x: 'A' if x > 10 else 'B')

# np.where
df['new_col'] = np.where(df['col'] > 10, 'A', 'B')

# 多条件
df['new_col'] = np.select(
    [cond1, cond2, cond3],
    [val1, val2, val3],
    default='other'
)

迭代操作

# 遍历DataFrame
for index, row in df.iterrows():
    print(row['name'])

# 遍历列
for col in df.columns:
    print(df[col])

随机操作

# 随机抽样
df.sample(n=10)              # 随机10行
df.sample(frac=0.5)         # 随机50%

# 随机打乱
df.sample(frac=1).reset_index(drop=True)

常用统计

# 描述统计
df.describe()
df['col'].describe()

# 相关性
df.corr()  # 含非数值列时需写 df.corr(numeric_only=True)（pandas 2.0+）

# 分位数
df.quantile(0.5)   # 中位数
df.quantile([0.25, 0.5, 0.75])  # 四分位数

管道操作

(df
    .pipe(clean_data)
    .query('value > 0')  # 按条件筛选行；DataFrame.filter 只按行/列标签筛选，不按值
    .groupby('column')
    .agg({'value': 'sum'})
)

Cheat Sheet 速查表

NumPy 速查

功能	命令
创建数组	`np.array([1,2,3])`
零数组	`np.zeros(5)`
序列	`np.arange(10)`
形状	`arr.shape`
索引	`arr[0]`
切片	`arr[1:4]`
布尔索引	`arr[arr>5]`
求和	`arr.sum()`
平均值	`arr.mean()`

Pandas 速查

功能	命令
读取CSV	`pd.read_csv()`
查看数据	`df.head()`
选择列	`df['col']`
条件筛选	`df[df['col']>10]`
分组	`df.groupby('col')`
聚合	`.agg({'col': 'sum'})`
合并	`pd.merge()`
透视表	`pd.pivot_table()`
保存	`df.to_csv()`

Matplotlib 速查

功能	命令
折线图	`plt.plot(x, y)`
散点图	`plt.scatter(x, y)`
柱状图	`plt.bar(x, height)`
直方图	`plt.hist(data)`
标题	`plt.title('标题')`
标签	`plt.xlabel()`
图例	`plt.legend()`
网格	`plt.grid(True)`
保存	`plt.savefig()`

目录​

NumPy 速查​

数组创建​

数组属性​

数组索引​

数组运算​

数组操作​

Pandas 速查​

DataFrame 创建​

数据查看​

数据选择​

数据操作​

缺失值处理​

分组聚合​

数据合并​

数据读写​

Matplotlib 速查​

基础图表​

图表元素​

样式和颜色​

保存图表​

Seaborn 速查​

常用图表​

样式设置​

多面板图​

数据合并速查​

concat 拼接​

merge 连接​

join 方法​

merge_ordered 和 merge_asof​

时间序列速查​

创建时间序列​

时间属性​

重采样​

时间偏移​

滚动窗口​

时区处理​

常用技巧​

数据类型转换​

字符串操作​

条件逻辑​

迭代操作​

随机操作​

常用统计​

管道操作​

Cheat Sheet 速查表​

NumPy 速查​

Pandas 速查​

Matplotlib 速查​

参考资源​

目录

NumPy 速查

数组创建

数组属性

数组索引

数组运算

数组操作

Pandas 速查

DataFrame 创建

数据查看

数据选择

数据操作

缺失值处理

分组聚合

数据合并

数据读写

Matplotlib 速查

基础图表

图表元素

样式和颜色

保存图表

Seaborn 速查

常用图表

样式设置

多面板图

数据合并速查

concat 拼接

merge 连接

join 方法

merge_ordered 和 merge_asof

时间序列速查

创建时间序列

时间属性

重采样

时间偏移

滚动窗口

时区处理

常用技巧

数据类型转换

字符串操作

条件逻辑

迭代操作

随机操作

常用统计

管道操作

Cheat Sheet 速查表

NumPy 速查

Pandas 速查

Matplotlib 速查

参考资源