机器学习应用
机器学习在量化交易中的应用日益广泛,从因子挖掘到信号预测,从组合优化到风险控制,机器学习技术正在改变量化投资的面貌。本章将介绍机器学习在量化交易中的典型应用场景和实现方法。
机器学习与量化交易
为什么使用机器学习?
传统量化方法依赖人工构建的因子和规则,存在以下局限:
信息利用不充分:人工构建的因子只能捕捉有限的市场规律,大量数据中的潜在模式被忽略。
非线性关系:金融市场中存在大量非线性关系,传统线性模型难以有效捕捉。
高维数据处理:随着数据维度的增加,传统方法面临维度灾难,而机器学习擅长处理高维数据。
自适应能力:市场环境不断变化,机器学习模型可以通过再训练适应新的市场特征。
机器学习的挑战
在量化交易中使用机器学习也面临独特挑战:
过拟合风险:金融数据信噪比低,机器学习模型容易过度拟合历史噪声。
非平稳性:金融时间序列具有非平稳性,历史规律可能在未来失效。
样本量有限:相比图像、文本等领域,金融数据样本量相对有限。
解释性要求:金融领域对模型解释性有一定要求,黑盒模型可能面临监管和风控挑战。
数据准备
特征工程
特征工程是机器学习成功的关键,好的特征比复杂的模型更重要。
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
def create_features(df):
    """Build a machine-learning feature matrix from OHLCV bars.

    Parameters
    ----------
    df : DataFrame with 'Close', 'High', 'Low', 'Volume' columns.

    Returns
    -------
    DataFrame of features aligned to df.index. Leading rows contain NaN
    until the longest rolling window (20 bars) is filled.
    """
    close = df['Close']
    volume = df['Volume']
    daily_ret = close.pct_change()
    feats = pd.DataFrame(index=df.index)

    # Trailing returns over several horizons
    for horizon in (1, 5, 20):
        feats[f'return_{horizon}d'] = close.pct_change(horizon)

    # Rolling volatility of daily returns
    feats['volatility_5d'] = daily_ret.rolling(5).std()
    feats['volatility_20d'] = daily_ret.rolling(20).std()

    # Price momentum
    feats['momentum_5d'] = close / close.shift(5) - 1
    feats['momentum_20d'] = close / close.shift(20) - 1

    # Deviation from moving averages
    ma5 = close.rolling(5).mean()
    ma20 = close.rolling(20).mean()
    feats['ma5_bias'] = (close - ma5) / ma5
    feats['ma20_bias'] = (close - ma20) / ma20

    # Volume activity
    feats['volume_ratio'] = volume / volume.rolling(20).mean()
    feats['volume_change'] = volume.pct_change()

    # Position of the close inside the trailing 20-bar high/low channel
    low20 = df['Low'].rolling(20).min()
    high20 = df['High'].rolling(20).max()
    feats['high_low_position'] = (close - low20) / (high20 - low20)

    # RSI(14): ratio of average gains to average losses
    delta = close.diff()
    avg_gain = delta.where(delta > 0, 0).rolling(14).mean()
    avg_loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    feats['RSI'] = 100 - (100 / (1 + avg_gain / avg_loss))

    # MACD(12, 26) with a 9-period signal line
    feats['MACD'] = close.ewm(span=12).mean() - close.ewm(span=26).mean()
    feats['MACD_signal'] = feats['MACD'].ewm(span=9).mean()
    return feats
def create_labels(df, forward_period=5, threshold=0.02):
    """Build three-class labels from forward returns.

    Parameters
    ----------
    df : DataFrame containing a 'Close' column.
    forward_period : look-ahead horizon in bars.
    threshold : absolute return beyond which a move counts as up/down.

    Returns
    -------
    Series of 1 (up), -1 (down), or 0 (flat / undefined at the tail,
    where the forward return is NaN).
    """
    fwd_ret = df['Close'].shift(-forward_period) / df['Close'] - 1
    # NaN forward returns fail both comparisons and fall through to 0
    labels = pd.Series(
        np.where(fwd_ret > threshold, 1,
                 np.where(fwd_ret < -threshold, -1, 0)),
        index=df.index,
    )
    return labels
数据清洗和标准化
def prepare_ml_data(features, labels, train_ratio=0.8):
    """Align, clean, chronologically split and standardize ML data.

    Fixes over the original: removed the unused `train_test_split`
    import, and the docstring now matches the actual 5-tuple return
    (the fitted scaler was returned but undocumented).

    Parameters
    ----------
    features : feature DataFrame.
    labels : label Series.
    train_ratio : fraction of the (time-ordered) sample used for training.

    Returns
    -------
    (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
    The scaler is fitted on the training set only, to avoid look-ahead bias.
    """
    # Align on the common index, dropping rows whose label is missing
    common_index = features.index.intersection(labels.dropna().index)
    features = features.loc[common_index]
    labels = labels.loc[common_index]
    # Drop any remaining rows with missing features or labels
    mask = ~(features.isna().any(axis=1) | labels.isna())
    features = features[mask]
    labels = labels[mask]
    # Chronological split — random splitting would leak future information
    split_idx = int(len(features) * train_ratio)
    X_train = features.iloc[:split_idx]
    X_test = features.iloc[split_idx:]
    y_train = labels.iloc[:split_idx]
    y_test = labels.iloc[split_idx:]
    # Standardize: fit on the training window only, then apply to test
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  index=X_train.index,
                                  columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 index=X_test.index,
                                 columns=X_test.columns)
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
常用机器学习模型
随机森林
随机森林是量化交易中最常用的机器学习模型之一,具有抗过拟合、可解释性强的特点。
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=10):
    """Fit a random-forest classifier with mild regularization.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.
    n_estimators : number of trees in the forest.
    max_depth : maximum depth of each tree.

    Returns
    -------
    Fitted RandomForestClassifier.
    """
    hyperparams = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': 5,   # avoid splitting tiny nodes (overfit guard)
        'min_samples_leaf': 2,
        'random_state': 42,       # reproducible fits
        'n_jobs': -1,             # use all available cores
    }
    forest = RandomForestClassifier(**hyperparams)
    return forest.fit(X_train, y_train)
def evaluate_model(model, X_test, y_test):
    """Evaluate a fitted classifier on held-out data.

    Prints a classification report and confusion matrix, then returns
    the predictions, class probabilities, and overall accuracy.

    Parameters
    ----------
    model : fitted classifier with predict / predict_proba.
    X_test : test features.
    y_test : true test labels.

    Returns
    -------
    dict with keys 'predictions', 'probabilities', 'accuracy'.
    """
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)
    print("分类报告:")
    print(classification_report(y_test, predictions))
    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, predictions))
    accuracy = (predictions == y_test).mean()
    return {
        'predictions': predictions,
        'probabilities': probabilities,
        'accuracy': accuracy,
    }
# 特征重要性分析
def plot_feature_importance(model, feature_names, top_n=20):
    """Draw a horizontal bar chart of the model's top-N feature importances.

    Parameters
    ----------
    model : fitted tree-based model exposing `feature_importances_`.
    feature_names : labels matching the model's feature order.
    top_n : how many of the most important features to show.
    """
    import matplotlib.pyplot as plt
    ranked = (pd.Series(model.feature_importances_, index=feature_names)
                .sort_values(ascending=True)
                .tail(top_n))
    plt.figure(figsize=(10, 8))
    ranked.plot(kind='barh')
    plt.title('Feature Importance')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()
梯度提升树
梯度提升树(XGBoost、LightGBM)通常比随机森林有更好的性能。
from lightgbm import LGBMClassifier
def train_lightgbm(X_train, y_train, params=None):
    """Fit a LightGBM classifier with sensible defaults.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.
    params : optional dict of hyperparameters; when None, a conservative
        default configuration is used.

    Returns
    -------
    Fitted LGBMClassifier.
    """
    if params is None:
        params = dict(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            num_leaves=31,
            min_child_samples=20,
            subsample=0.8,          # row subsampling per tree
            colsample_bytree=0.8,   # feature subsampling per tree
            random_state=42,
            n_jobs=-1,
            verbose=-1,             # silence training chatter
        )
    clf = LGBMClassifier(**params)
    clf.fit(X_train, y_train)
    return clf
支持向量机
from sklearn.svm import SVC
def train_svm(X_train, y_train, kernel='rbf', C=1.0):
    """Fit a support-vector classifier with probability estimates enabled.

    Parameters
    ----------
    X_train : training features (should be standardized for SVMs).
    y_train : training labels.
    kernel : kernel function name, e.g. 'rbf' or 'linear'.
    C : regularization parameter (larger = less regularization).

    Returns
    -------
    Fitted SVC.
    """
    classifier = SVC(kernel=kernel, C=C, probability=True, random_state=42)
    return classifier.fit(X_train, y_train)
时间序列模型
LSTM
LSTM适合处理时间序列数据,能够捕捉长期依赖关系。
import torch
import torch.nn as nn
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head for sequence classification.

    Expects batch-first input of shape (batch, seq_len, input_size) and
    produces (batch, output_size) logits from the final time step.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Inter-layer dropout only applies when num_layers > 1
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map (batch, seq, input_size) -> (batch, output_size) logits."""
        sequence_output, _ = self.lstm(x)
        # Keep only the last time step's hidden state per sequence
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)
def create_sequences(data, seq_length):
    """Stack overlapping sliding windows of length `seq_length`.

    Parameters
    ----------
    data : array-like, sliceable along axis 0.
    seq_length : window length in time steps.

    Returns
    -------
    np.ndarray of shape (len(data) - seq_length, seq_length, ...).
    """
    windows = [data[start:start + seq_length]
               for start in range(len(data) - seq_length)]
    return np.array(windows)
def train_lstm_model(X_train, y_train, seq_length=20, epochs=100, batch_size=32):
    """Train an LSTM classifier on sliding-window feature sequences.

    Fix: the original accepted `batch_size` but silently ignored it and
    trained on the full dataset every step; mini-batch training now
    honors the parameter via TensorDataset/DataLoader.

    Parameters
    ----------
    X_train : feature DataFrame (rows in chronological order).
    y_train : label Series with values in {-1, 0, 1}.
    seq_length : length of each input sequence.
    epochs : number of passes over the training data.
    batch_size : mini-batch size.

    Returns
    -------
    Trained LSTMModel.
    """
    # Build overlapping sequences; labels align to each window's end
    X_seq = create_sequences(X_train.values, seq_length)
    y_seq = y_train.values[seq_length:]
    X_tensor = torch.FloatTensor(X_seq)
    # Shift labels {-1, 0, 1} -> {0, 1, 2} for CrossEntropyLoss
    y_tensor = torch.LongTensor(y_seq + 1)

    # shuffle=False preserves chronological batch order for time series
    dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Model configuration
    model = LSTMModel(
        input_size=X_train.shape[1],
        hidden_size=64,
        num_layers=2,
        output_size=3,  # up / flat / down
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch_x, batch_y in loader:
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        if (epoch + 1) % 10 == 0:
            avg_loss = epoch_loss / max(len(loader), 1)
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')
    return model
模型评估与选择
交叉验证
对于时间序列数据,需要使用时间序列交叉验证。
from sklearn.model_selection import TimeSeriesSplit
def time_series_cv(model_class, X, y, n_splits=5, **model_params):
    """Walk-forward cross-validation for time-series data.

    Uses TimeSeriesSplit so each fold trains strictly on the past and
    tests on the future — random K-fold would leak look-ahead information.

    Parameters
    ----------
    model_class : estimator class implementing fit / predict.
    X : feature DataFrame.
    y : label Series.
    n_splits : number of expanding-window folds.
    model_params : keyword arguments forwarded to `model_class`.

    Returns
    -------
    DataFrame with one row per fold: accuracy plus train/test sizes.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_records = []
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X)):
        fold_model = model_class(**model_params)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        acc = (fold_model.predict(X.iloc[test_idx]) == y.iloc[test_idx]).mean()
        fold_records.append({
            'fold': fold,
            'accuracy': acc,
            'train_size': len(train_idx),
            'test_size': len(test_idx),
        })
        print(f"Fold {fold}: Accuracy = {acc:.4f}")
    return pd.DataFrame(fold_records)
回测集成
将机器学习预测集成到回测框架中。
import backtrader as bt
class MLStrategy(bt.Strategy):
    """Backtrader strategy driven by a pre-trained ML classifier.

    params:
        model: fitted classifier exposing predict().
        scaler: fitted scaler matching the model's training features.
        feature_func: callable building features from an OHLCV DataFrame.
        seq_length: number of bars buffered before predictions start.
    """
    params = (
        ('model', None),
        ('scaler', None),
        ('feature_func', None),
        ('seq_length', 20),
    )

    def __init__(self):
        # Rolling store of raw bars used to rebuild features each step
        self.data_buffer = []
        self.order = None

    def next(self):
        # Append the latest completed bar to the buffer
        self.data_buffer.append({
            'Close': self.data.close[0],
            'High': self.data.high[0],
            'Low': self.data.low[0],
            'Volume': self.data.volume[0],
        })
        # Warm-up period: wait until one full window is available
        if len(self.data_buffer) < self.params.seq_length:
            return
        # Rebuild features over the trailing window; keep only the last row
        window = pd.DataFrame(self.data_buffer[-self.params.seq_length:])
        latest = self.params.feature_func(window).iloc[-1:].values
        scaled = self.params.scaler.transform(latest)
        signal = self.params.model.predict(scaled)[0]
        # Enter long on a predicted up-move; exit on a predicted down-move
        if not self.position:
            if signal == 1:
                self.order = self.buy()
        elif signal == -1:
            self.order = self.sell()
防止过拟合
正则化
from sklearn.linear_model import Lasso, Ridge
def train_regularized_model(X_train, y_train, alpha=0.1, model_type='lasso'):
    """Fit an L1- or L2-regularized linear model.

    Parameters
    ----------
    X_train : training features.
    y_train : training targets.
    alpha : regularization strength; larger values shrink coefficients more.
    model_type : 'lasso' selects L1 (drives coefficients to zero);
        any other value falls back to Ridge (L2).

    Returns
    -------
    Fitted Lasso or Ridge model.
    """
    estimator = Lasso(alpha=alpha) if model_type == 'lasso' else Ridge(alpha=alpha)
    estimator.fit(X_train, y_train)
    return estimator
早停
from lightgbm import LGBMClassifier
def train_with_early_stopping(X_train, y_train, X_val, y_val):
    """Train a LightGBM classifier with validation-based early stopping.

    Boosting halts once 'multi_logloss' on the validation set fails to
    improve for 50 consecutive rounds, capping the effective model size.

    Parameters
    ----------
    X_train, y_train : training data.
    X_val, y_val : held-out validation data used for the stopping criterion.

    Returns
    -------
    Fitted LGBMClassifier; `best_iteration_` holds the chosen round.
    """
    booster = LGBMClassifier(
        n_estimators=1000,   # upper bound; early stopping picks far fewer
        learning_rate=0.05,
        early_stopping_rounds=50,
        random_state=42,
        verbose=-1,
    )
    booster.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
    )
    print(f"最佳迭代次数: {booster.best_iteration_}")
    return booster
特征选择
from sklearn.feature_selection import SelectKBest, mutual_info_classif
def select_features(X_train, y_train, X_test, k=20):
    """Keep the k features with highest mutual information w.r.t. the labels.

    The selector is fitted on the training set only; the test set is
    transformed with the same mask to avoid leakage.

    Parameters
    ----------
    X_train : training feature DataFrame.
    y_train : training labels.
    X_test : test feature DataFrame.
    k : number of features to retain.

    Returns
    -------
    (X_train_selected, X_test_selected, selected_feature_names)
    """
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    kept = [name for name, keep in zip(X_train.columns, selector.get_support())
            if keep]
    print(f"选择的特征: {kept}")
    return X_train_selected, X_test_selected, kept
小结
机器学习为量化交易提供了强大的工具,但也带来了过拟合等挑战。使用机器学习时需要注意:
- 特征工程是关键:好的特征比复杂的模型更重要
- 防止过拟合:使用正则化、早停、特征选择等技术
- 时间序列特性:使用时间序列交叉验证,不能用随机分割
- 模型解释:关注特征重要性,理解模型决策逻辑
- 持续监控:模型上线后需要持续监控,及时发现问题
机器学习不是万能的,它只是工具之一。在实际应用中,应该将机器学习与传统量化方法结合,发挥各自优势。下一章将介绍高频交易的基础知识。