跳到主要内容

机器学习应用

机器学习在量化交易中的应用日益广泛,从因子挖掘到信号预测,从组合优化到风险控制,机器学习技术正在改变量化投资的面貌。本章将介绍机器学习在量化交易中的典型应用场景和实现方法。

机器学习与量化交易

为什么使用机器学习?

传统量化方法依赖人工构建的因子和规则,存在以下局限:

信息利用不充分:人工构建的因子只能捕捉有限的市场规律,大量数据中的潜在模式被忽略。

非线性关系:金融市场中存在大量非线性关系,传统线性模型难以有效捕捉。

高维数据处理:随着数据维度的增加,传统方法面临维度灾难,而机器学习擅长处理高维数据。

自适应能力:市场环境不断变化,机器学习模型可以通过再训练适应新的市场特征。

机器学习的挑战

在量化交易中使用机器学习也面临独特挑战:

过拟合风险:金融数据信噪比低,机器学习模型容易过度拟合历史噪声。

非平稳性:金融时间序列具有非平稳性,历史规律可能在未来失效。

样本量有限:相比图像、文本等领域,金融数据样本量相对有限。

解释性要求:金融领域对模型解释性有一定要求,黑盒模型可能面临监管和风控挑战。

数据准备

特征工程

特征工程是机器学习成功的关键,好的特征比复杂的模型更重要。

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def create_features(df):
    """
    Build a machine-learning feature matrix from OHLCV data.

    Parameters:
        df: DataFrame with 'Close', 'High', 'Low' and 'Volume' columns

    Returns:
        DataFrame of features aligned to df's index
    """
    close = df['Close']
    volume = df['Volume']
    daily_returns = close.pct_change()

    cols = {}

    # Returns over several horizons
    cols['return_1d'] = close.pct_change(1)
    cols['return_5d'] = close.pct_change(5)
    cols['return_20d'] = close.pct_change(20)

    # Realized volatility of daily returns
    cols['volatility_5d'] = daily_returns.rolling(5).std()
    cols['volatility_20d'] = daily_returns.rolling(20).std()

    # Momentum
    cols['momentum_5d'] = close / close.shift(5) - 1
    cols['momentum_20d'] = close / close.shift(20) - 1

    # Relative deviation from moving averages
    ma5 = close.rolling(5).mean()
    ma20 = close.rolling(20).mean()
    cols['ma5_bias'] = (close - ma5) / ma5
    cols['ma20_bias'] = (close - ma20) / ma20

    # Volume features
    cols['volume_ratio'] = volume / volume.rolling(20).mean()
    cols['volume_change'] = volume.pct_change()

    # Position of the close within the trailing 20-bar high/low range
    low20 = df['Low'].rolling(20).min()
    high20 = df['High'].rolling(20).max()
    cols['high_low_position'] = (close - low20) / (high20 - low20)

    # RSI (14-period, simple rolling means of gains and losses)
    delta = close.diff()
    avg_gain = delta.where(delta > 0, 0).rolling(14).mean()
    avg_loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    cols['RSI'] = 100 - (100 / (1 + avg_gain / avg_loss))

    # MACD line (EMA12 - EMA26) and its 9-period signal line
    macd_line = close.ewm(span=12).mean() - close.ewm(span=26).mean()
    cols['MACD'] = macd_line
    cols['MACD_signal'] = macd_line.ewm(span=9).mean()

    return pd.DataFrame(cols, index=df.index)

def create_labels(df, forward_period=5, threshold=0.02):
    """
    Build classification labels (prediction targets) from future returns.

    Parameters:
        df: DataFrame with a 'Close' column
        forward_period: horizon (in rows) over which the future return is measured
        threshold: minimum absolute return that counts as an up/down move

    Returns:
        Series of labels: up (1), down (-1), flat (0); rows whose future
        return is unknown (the trailing forward_period rows) are labeled 0
    """
    # Forward return over the prediction horizon
    fwd = df['Close'].shift(-forward_period) / df['Close'] - 1

    rose = fwd > threshold
    fell = fwd < -threshold
    return pd.Series(np.where(rose, 1, np.where(fell, -1, 0)), index=df.index)

数据清洗和标准化

def prepare_ml_data(features, labels, train_ratio=0.8):
    """
    Prepare aligned, cleaned and scaled train/test splits for ML.

    Parameters:
        features: feature DataFrame
        labels: label Series
        train_ratio: fraction of the (chronologically first) rows used for training

    Returns:
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler) where
        scaler is the StandardScaler fitted on the training rows only.
    """
    # Align on the common index; dropna removes label rows whose forward
    # return is unknown (the tail of the series)
    common_index = features.index.intersection(labels.dropna().index)
    features = features.loc[common_index]
    labels = labels.loc[common_index]

    # Drop any remaining rows with missing feature or label values
    mask = ~(features.isna().any(axis=1) | labels.isna())
    features = features[mask]
    labels = labels[mask]

    # Chronological split — random splitting would leak future information
    split_idx = int(len(features) * train_ratio)
    X_train = features.iloc[:split_idx]
    X_test = features.iloc[split_idx:]
    y_train = labels.iloc[:split_idx]
    y_test = labels.iloc[split_idx:]

    # Fit the scaler on the training split only, then apply it to both,
    # so no test-set statistics leak into training
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  index=X_train.index,
                                  columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 index=X_test.index,
                                 columns=X_test.columns)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

常用机器学习模型

随机森林

随机森林是量化交易中最常用的机器学习模型之一,具有抗过拟合、可解释性强的特点。

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10):
    """
    Fit a random-forest classifier.

    Parameters:
        X_train: training features
        y_train: training labels
        n_estimators: number of trees in the forest
        max_depth: maximum depth of each tree

    Returns:
        fitted RandomForestClassifier
    """
    # Conservative split/leaf minimums damp overfitting on noisy data;
    # fixed random_state keeps runs reproducible
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    return forest

def evaluate_model(model, X_test, y_test):
    """
    Evaluate a fitted classifier on held-out data and print a report.

    Parameters:
        model: fitted model exposing predict / predict_proba
        X_test: test features
        y_test: test labels

    Returns:
        dict with 'predictions', 'probabilities' and 'accuracy'
    """
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)

    print("分类报告:")
    print(classification_report(y_test, predictions))

    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, predictions))

    results = {
        'predictions': predictions,
        'probabilities': probabilities,
        'accuracy': (predictions == y_test).mean(),
    }
    return results

# Feature importance analysis
def plot_feature_importance(model, feature_names, top_n=20):
    """
    Plot the top_n most important features as a horizontal bar chart.

    Parameters:
        model: fitted tree-based model exposing feature_importances_
        feature_names: feature names aligned with the model's input columns
        top_n: number of top features to display
    """
    import matplotlib.pyplot as plt

    ranked = pd.Series(model.feature_importances_, index=feature_names)
    # Ascending sort + tail puts the most important feature at the top of the barh chart
    top = ranked.sort_values(ascending=True).tail(top_n)

    plt.figure(figsize=(10, 8))
    top.plot(kind='barh')
    plt.title('Feature Importance')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

梯度提升树

梯度提升树(XGBoost、LightGBM)通常比随机森林有更好的性能。

from lightgbm import LGBMClassifier

def train_lightgbm(X_train, y_train, params=None):
    """
    Fit a LightGBM classifier.

    Parameters:
        X_train: training features
        y_train: training labels
        params: optional dict of LGBMClassifier keyword arguments; when
            None, a regularized default configuration is used

    Returns:
        fitted LGBMClassifier
    """
    default_params = {
        'n_estimators': 200,
        'max_depth': 6,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'min_child_samples': 20,
        'subsample': 0.8,          # row subsampling per tree
        'colsample_bytree': 0.8,   # feature subsampling per tree
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
    }
    effective = default_params if params is None else params
    model = LGBMClassifier(**effective)
    model.fit(X_train, y_train)
    return model

支持向量机

from sklearn.svm import SVC

def train_svm(X_train, y_train, kernel='rbf', C=1.0):
    """
    Fit a support-vector classifier.

    Parameters:
        X_train: training features
        y_train: training labels
        kernel: kernel function name
        C: regularization parameter

    Returns:
        fitted SVC
    """
    # probability=True enables predict_proba (adds training cost)
    classifier = SVC(kernel=kernel, C=C, probability=True, random_state=42)
    classifier.fit(X_train, y_train)
    return classifier

时间序列模型

LSTM

LSTM适合处理时间序列数据,能够捕捉长期依赖关系。

import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head for sequence classification."""

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super(LSTMModel, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True -> inputs are shaped (batch, seq_len, input_size);
        # dropout applies between stacked LSTM layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits of shape (batch, output_size) for x of shape (batch, seq_len, input_size)."""
        sequence_output, _ = self.lstm(x)
        # Classify from the hidden state of the final time step only
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

def create_sequences(data, seq_length):
    """
    Slice an array into overlapping fixed-length windows.

    Parameters:
        data: array-like of shape (n_samples, n_features)
        seq_length: window length

    Returns:
        np.ndarray of shape (n_samples - seq_length, seq_length, n_features)
    """
    windows = [data[start:start + seq_length]
               for start in range(len(data) - seq_length)]
    return np.array(windows)

def train_lstm_model(X_train, y_train, seq_length=20, epochs=100, batch_size=32):
    """
    Train an LSTM classifier with full-batch gradient descent.

    Parameters:
        X_train: training feature DataFrame
        y_train: training label Series (values in {-1, 0, 1})
        seq_length: length of each input sequence
        epochs: number of training epochs
        batch_size: currently unused — training runs on the full batch;
            kept for interface compatibility

    Returns:
        trained LSTMModel
    """
    # Build overlapping windows; window i (rows i .. i+seq_length-1) is
    # paired with the label at row i+seq_length
    X_seq = create_sequences(X_train.values, seq_length)
    y_seq = y_train.values[seq_length:]

    X_tensor = torch.FloatTensor(X_seq)
    # Shift labels {-1, 0, 1} -> {0, 1, 2} for CrossEntropyLoss
    y_tensor = torch.LongTensor(y_seq + 1)

    # Network hyperparameters
    n_features = X_train.shape[1]
    hidden_units = 64
    stacked_layers = 2
    n_classes = 3  # up / flat / down

    net = LSTMModel(n_features, hidden_units, stacked_layers, n_classes)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

    # Full-batch training loop
    net.train()
    for epoch in range(epochs):
        logits = net(X_tensor)
        loss = criterion(logits, y_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    return net

模型评估与选择

交叉验证

对于时间序列数据,需要使用时间序列交叉验证。

from sklearn.model_selection import TimeSeriesSplit

def time_series_cv(model_class, X, y, n_splits=5, **model_params):
    """
    Walk-forward (expanding-window) cross-validation for time series.

    Parameters:
        model_class: estimator class exposing fit/predict
        X: feature DataFrame
        y: label Series
        n_splits: number of folds
        model_params: keyword arguments forwarded to model_class

    Returns:
        DataFrame with one row of metrics per fold
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_records = []

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X)):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

        # Fit a fresh estimator per fold so state never leaks between folds
        estimator = model_class(**model_params)
        estimator.fit(X_tr, y_tr)

        accuracy = (estimator.predict(X_te) == y_te).mean()
        fold_records.append({
            'fold': fold,
            'accuracy': accuracy,
            'train_size': len(train_idx),
            'test_size': len(test_idx),
        })
        print(f"Fold {fold}: Accuracy = {accuracy:.4f}")

    return pd.DataFrame(fold_records)

回测集成

将机器学习预测集成到回测框架中。

import backtrader as bt

class MLStrategy(bt.Strategy):
    """Backtrader strategy that trades on an external ML model's predictions."""

    params = (
        ('model', None),
        ('scaler', None),
        ('feature_func', None),
        ('seq_length', 20),
    )

    def __init__(self):
        # Rolling buffer of raw bars used to rebuild features each step
        self.data_buffer = []
        self.order = None

    def next(self):
        # Record the current bar
        self.data_buffer.append({
            'Close': self.data.close[0],
            'High': self.data.high[0],
            'Low': self.data.low[0],
            'Volume': self.data.volume[0],
        })

        # Wait until enough history has accumulated
        if len(self.data_buffer) < self.params.seq_length:
            return

        # Recompute features on the trailing window and keep only the latest row.
        # NOTE(review): with only seq_length rows, long-lookback features
        # (e.g. 20-bar rolling statistics) may come out NaN — confirm that
        # feature_func's windows fit within seq_length.
        window = pd.DataFrame(self.data_buffer[-self.params.seq_length:])
        latest = self.params.feature_func(window).iloc[-1:].values

        scaled = self.params.scaler.transform(latest)
        prediction = self.params.model.predict(scaled)[0]

        # Go long on a predicted rise when flat; exit on a predicted fall
        if not self.position:
            if prediction == 1:
                self.order = self.buy()
        elif prediction == -1:
            self.order = self.sell()

防止过拟合

正则化

from sklearn.linear_model import Lasso, Ridge

def train_regularized_model(X_train, y_train, alpha=0.1, model_type='lasso'):
    """
    Fit an L1- or L2-regularized linear model.

    Parameters:
        X_train: training features
        y_train: training targets
        alpha: regularization strength
        model_type: 'lasso' selects L1 (Lasso); any other value selects Ridge (L2)

    Returns:
        fitted model
    """
    estimator = Lasso(alpha=alpha) if model_type == 'lasso' else Ridge(alpha=alpha)
    estimator.fit(X_train, y_train)
    return estimator

早停

from lightgbm import LGBMClassifier

def train_with_early_stopping(X_train, y_train, X_val, y_val):
    """
    Fit a LightGBM classifier with early stopping on a validation set.

    Parameters:
        X_train: training features
        y_train: training labels
        X_val: validation features
        y_val: validation labels

    Returns:
        fitted LGBMClassifier, stopped at its best iteration
    """
    # Generous n_estimators cap; early stopping selects the effective count
    classifier = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        early_stopping_rounds=50,
        random_state=42,
        verbose=-1,
    )

    classifier.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
    )

    print(f"最佳迭代次数: {classifier.best_iteration_}")

    return classifier

特征选择

from sklearn.feature_selection import SelectKBest, mutual_info_classif

def select_features(X_train, y_train, X_test, k=20):
    """
    Keep the k features with the highest mutual information with the labels.

    Parameters:
        X_train: training feature DataFrame
        y_train: training labels
        X_test: test feature DataFrame
        k: number of features to keep

    Returns:
        (X_train_selected, X_test_selected, selected_feature_names);
        note the selected matrices come back as numpy arrays, not DataFrames
    """
    chooser = SelectKBest(score_func=mutual_info_classif, k=k)
    # Scores are computed on the training split only; the same mask is
    # then applied to the test split
    train_selected = chooser.fit_transform(X_train, y_train)
    test_selected = chooser.transform(X_test)

    kept_mask = chooser.get_support()
    selected_features = X_train.columns[kept_mask].tolist()
    print(f"选择的特征: {selected_features}")

    return train_selected, test_selected, selected_features

小结

机器学习为量化交易提供了强大的工具,但也带来了过拟合等挑战。使用机器学习时需要注意:

  1. 特征工程是关键:好的特征比复杂的模型更重要
  2. 防止过拟合:使用正则化、早停、特征选择等技术
  3. 时间序列特性:使用时间序列交叉验证,不能用随机分割
  4. 模型解释:关注特征重要性,理解模型决策逻辑
  5. 持续监控:模型上线后需要持续监控,及时发现问题

机器学习不是万能的,它只是工具之一。在实际应用中,应该将机器学习与传统量化方法结合,发挥各自优势。下一章将介绍高频交易的基础知识。