Actor-Critic Methods
Actor-Critic methods combine the strengths of value-based and policy-based approaches. They use two neural networks: the Actor, which selects actions, and the Critic, which evaluates the value of those actions. This architecture enjoys both the flexibility of policy gradients and the low-variance benefit of value functions.
The Actor-Critic Architecture
Core Idea
The core idea of Actor-Critic is to replace the Monte Carlo return $G_t$ in the policy gradient with an estimate from the Critic network:

$$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\nabla_\theta \log \pi_\theta(a|s)\, Q_w(s,a)\right]$$

where:
- $\theta$ are the parameters of the Actor network
- $w$ are the parameters of the Critic network
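As a minimal sketch (where q_value stands for a hypothetical Critic output $Q_w(s,a)$, and dist and action come from the Actor's distribution), the resulting Actor loss weights the log-probability by the Critic's estimate:

# Actor loss weighted by the Critic's estimate — a minimal sketch
log_prob = dist.log_prob(action)                    # log π_θ(a|s) from the Actor
actor_loss = -(log_prob * q_value.detach()).mean()  # detach: gradients flow into the Actor only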
Architecture Sketch
State s
  │
  ▼
┌──────────────────┐
│  Actor network   │ ──→ action probabilities π(a|s)
│ (policy network) │
└──────────────────┘
  │
  ▼
┌──────────────────┐
│  Critic network  │ ──→ state value V(s) or Q(s,a)
│ (value network)  │
└──────────────────┘
The Advantage Function
In Actor-Critic, the advantage function is commonly used in place of the Q value:

$$A(s, a) = Q(s, a) - V(s)$$

The advantage function measures how much better action $a$ is than the average action in state $s$:
- $A(s, a) > 0$: the action is better than average
- $A(s, a) < 0$: the action is worse than average

Using the advantage function reduces variance further.
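As a quick sketch before the full implementation below, the advantage can be approximated with a single bootstrapped step (the TD error, covered in more detail later); critic here is a hypothetical value network:

# one-step advantage estimate A(s,a) ≈ r + γ·V(s') − V(s) — a minimal sketch
with torch.no_grad():
    value = critic(state)            # V(s)
    next_value = critic(next_state)  # V(s'); masked to 0 when the episode ends
advantage = reward + gamma * next_value * (1 - done) - value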
Implementing Actor-Critic
Basic Actor-Critic
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # shared feature extractor used by both heads
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU()
        )
        # Actor head: outputs a probability distribution over actions
        self.actor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1)
        )
        # Critic head: outputs the scalar state value V(s)
        self.critic = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        shared = self.shared(x)
        action_probs = self.actor(shared)
        state_value = self.critic(shared)
        return action_probs, state_value

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        probs, value = self.forward(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action), value
class ActorCriticAgent:
    def __init__(self, state_dim, action_dim, hidden_dim=128, lr=1e-3, gamma=0.99):
        self.model = ActorCritic(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma

    def update(self, log_probs, values, rewards, dones):
        # compute discounted returns, resetting at episode boundaries
        returns = []
        R = 0
        for r, done in zip(reversed(rewards), reversed(dones)):
            R = r + self.gamma * R * (1 - done)
            returns.insert(0, R)
        returns = torch.tensor(returns, dtype=torch.float32)

        log_probs = torch.stack(log_probs)
        values = torch.cat(values).squeeze()
        # advantage = return - baseline; detach so the actor loss
        # does not backpropagate through the critic
        advantages = returns - values.detach()

        actor_loss = -(log_probs * advantages).mean()
        critic_loss = F.mse_loss(values, returns)
        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def train(self, env, num_episodes=1000):
        rewards_history = []
        for episode in range(num_episodes):
            state, _ = env.reset()
            log_probs, values, rewards, dones = [], [], [], []
            total_reward = 0
            done = False
            while not done:
                action, log_prob, value = self.model.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)
                dones.append(done)
                state = next_state
                total_reward += reward
            loss = self.update(log_probs, values, rewards, dones)
            rewards_history.append(total_reward)
            if (episode + 1) % 50 == 0:
                avg_reward = np.mean(rewards_history[-50:])
                print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.2f}")
        return rewards_history
A2C (Advantage Actor-Critic)
A2C runs multiple parallel environments and applies updates synchronously (it is the synchronous counterpart of A3C); a vectorized usage sketch follows the class:
import torch.multiprocessing as mp

class A2C:
    def __init__(self, state_dim, action_dim, hidden_dim=128, lr=1e-3,
                 gamma=0.99, num_workers=4):
        self.model = ActorCritic(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.num_workers = num_workers

    def collect_experience(self, env, worker_id):
        # collect a short n-step rollout segment (n = 5)
        state, _ = env.reset()
        experiences = []
        total_reward = 0
        for _ in range(5):
            action, log_prob, value = self.model.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            experiences.append((state, action, log_prob, value, reward, done))
            state = next_state
            total_reward += reward
            if done:
                state, _ = env.reset()
        return experiences, total_reward

    def compute_gae(self, rewards, values, dones, next_value, gamma=0.99, lam=0.95):
        # iterate backwards; gae accumulates (gamma * lam)-discounted TD errors
        advantages = []
        gae = 0
        for r, v, done in zip(reversed(rewards), reversed(values), reversed(dones)):
            if done:
                delta = r - v  # terminal step: no bootstrap value
                gae = delta
            else:
                delta = r + gamma * next_value - v
                gae = delta + gamma * lam * gae
            advantages.insert(0, gae)
            next_value = v
        return advantages
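The class above only sketches per-segment collection; the defining trait of A2C is that all environments step in lockstep and feed one batched update. Below is a minimal sketch using gymnasium's synchronous vector API (num_envs and the 5-step segment length are assumptions here, not part of the original class):

import gymnasium as gym

num_envs = 4
envs = gym.vector.SyncVectorEnv(
    [lambda: gym.make("CartPole-v1") for _ in range(num_envs)]
)
model = ActorCritic(envs.single_observation_space.shape[0],
                    envs.single_action_space.n)

states, _ = envs.reset()
for _ in range(5):  # one synchronous rollout segment
    probs, values = model(torch.FloatTensor(states))  # batched forward pass
    dist = torch.distributions.Categorical(probs)
    actions = dist.sample()
    states, rewards, terms, truncs, _ = envs.step(actions.numpy())
    # ...store (log_probs, values, rewards, dones) for one batched update...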
A3C (Asynchronous Advantage Actor-Critic)
A3C is the asynchronous variant: each worker runs in its own process against its own copy of the environment and independently pushes gradient updates to a shared global network (a launcher sketch follows the class):
import gymnasium as gym

class A3CWorker(mp.Process):
    def __init__(self, worker_id, global_model, optimizer, env_name,
                 state_dim, action_dim, gamma=0.99, max_episodes=1000):
        super().__init__()
        self.worker_id = worker_id
        self.global_model = global_model
        self.optimizer = optimizer
        self.env = gym.make(env_name)
        self.local_model = ActorCritic(state_dim, action_dim)
        self.gamma = gamma
        self.max_episodes = max_episodes

    def run(self):
        for episode in range(self.max_episodes):
            # sync local weights with the global network before each episode
            self.local_model.load_state_dict(self.global_model.state_dict())
            state, _ = self.env.reset()
            log_probs, values, rewards = [], [], []
            done = False
            while not done:
                action, log_prob, value = self.local_model.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)
                state = next_state
            self.update_global(log_probs, values, rewards)

    def compute_returns(self, rewards):
        # discounted Monte Carlo returns for a complete episode
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.tensor(returns, dtype=torch.float32)

    def update_global(self, log_probs, values, rewards):
        returns = self.compute_returns(rewards)
        log_probs = torch.stack(log_probs)
        values = torch.cat(values).squeeze()
        advantages = returns - values.detach()

        actor_loss = -(log_probs * advantages).mean()
        critic_loss = F.mse_loss(values, returns)
        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        self.local_model.zero_grad()  # clear local gradients from the previous episode
        loss.backward()
        # copy the local gradients onto the global network, then step
        for global_param, local_param in zip(self.global_model.parameters(),
                                             self.local_model.parameters()):
            global_param._grad = local_param.grad
        self.optimizer.step()
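A launcher is needed to tie the workers together. Below is a minimal sketch; note that the plain Adam here does not share its moment statistics across processes, so real A3C implementations typically use a shared-state optimizer (often called SharedAdam), and depending on the platform's process start method, environment creation may need to move into run():

if __name__ == "__main__":
    env_name = "CartPole-v1"
    probe = gym.make(env_name)
    state_dim = probe.observation_space.shape[0]
    action_dim = probe.action_space.n

    global_model = ActorCritic(state_dim, action_dim)
    global_model.share_memory()  # place parameters in shared memory for all workers
    optimizer = optim.Adam(global_model.parameters(), lr=1e-3)

    workers = [A3CWorker(i, global_model, optimizer, env_name, state_dim, action_dim)
               for i in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()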
Advantage Estimation
TD Error
The simplest advantage estimate is the TD error:

$$\hat{A}_t = \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$$
GAE (Generalized Advantage Estimation)
GAE is a refined advantage estimator that trades off bias against variance:

$$\hat{A}_t^{\mathrm{GAE}(\gamma, \lambda)} = \sum_{l=0}^{\infty} (\gamma \lambda)^l \, \delta_{t+l}$$

where $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$ is the TD error. Setting $\lambda = 0$ recovers the one-step TD error (low variance, high bias), while $\lambda = 1$ recovers the Monte Carlo advantage (high variance, low bias).
def compute_gae(rewards, values, next_value, dones, gamma=0.99, lam=0.95):
    advantages = []
    gae = 0
    for t in reversed(range(len(rewards))):
        if t == len(rewards) - 1:
            next_val = next_value  # bootstrap value V(s_T) for the last step
        else:
            next_val = values[t + 1]
        # TD error; (1 - dones[t]) masks out the bootstrap at episode ends
        delta = rewards[t] + gamma * next_val * (1 - dones[t]) - values[t]
        gae = delta + gamma * lam * (1 - dones[t]) * gae
        advantages.insert(0, gae)
    # returns = advantages + values, used as critic regression targets
    returns = [adv + val for adv, val in zip(advantages, values)]
    return advantages, returns
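As a usage sketch, the bootstrap value for the final state comes from the critic (zero if the trajectory ended), and the resulting advantages are commonly normalized before the policy update (the normalization step is an assumption here, not part of the function above):

# usage sketch: values is a list of V(s_t) floats collected during the rollout
advantages, returns = compute_gae(rewards, values, next_value, dones)
advantages = torch.tensor(advantages, dtype=torch.float32)
# normalizing advantages is a common stabilization trick
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)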
Actor-Critic for Continuous Action Spaces
For continuous action spaces, the Actor outputs the parameters of a Gaussian distribution (a usage sketch follows the class):
class ContinuousActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        # Actor outputs the Gaussian mean; the log-std is a state-independent
        # learnable parameter (a common simplification)
        self.actor_mean = nn.Linear(hidden_dim, action_dim)
        self.actor_log_std = nn.Parameter(torch.zeros(action_dim))
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        features = self.shared(x)
        action_mean = self.actor_mean(features)
        action_std = torch.exp(self.actor_log_std)
        state_value = self.critic(features)
        return action_mean, action_std, state_value

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        mean, std, value = self.forward(state)
        dist = torch.distributions.Normal(mean, std)
        action = dist.sample()
        # sum log-probs over action dimensions (independent Gaussians)
        log_prob = dist.log_prob(action).sum(-1)
        # squeeze only the batch dimension so the action keeps shape (action_dim,)
        return action.squeeze(0).numpy(), log_prob, value
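As a usage sketch on a continuous-control task (Pendulum-v1 is an assumption here), sampled actions are clipped to the environment's bounds before stepping, since the Gaussian is unbounded:

import gymnasium as gym

env = gym.make("Pendulum-v1")
model = ContinuousActorCritic(env.observation_space.shape[0],
                              env.action_space.shape[0])

state, _ = env.reset()
action, log_prob, value = model.select_action(state)
# clip to the valid action range before stepping
action = np.clip(action, env.action_space.low, env.action_space.high)
next_state, reward, terminated, truncated, _ = env.step(action)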
Training Tips
Gradient Clipping
Clip the global gradient norm between loss.backward() and optimizer.step() to guard against destabilizing updates:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
Value and Entropy Coefficients
The critic loss and an entropy bonus are weighted into a single objective:
loss = actor_loss + value_coef * critic_loss - entropy_coef * entropy
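The entropy term is not computed anywhere above; with the categorical policy from select_action, a sketch of how it is obtained:

# entropy bonus encourages exploration by penalizing overly confident policies
dist = torch.distributions.Categorical(probs)
entropy = dist.entropy().mean()
loss = actor_loss + value_coef * critic_loss - entropy_coef * entropy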
Learning Rate Scheduling
def get_lr(optimizer):
    return optimizer.param_groups[0]['lr']

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.95)
scheduler.step()  # call once per update (or per episode)
Complete Example
import gymnasium as gym
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = ActorCriticAgent(state_dim, action_dim, hidden_dim=128, lr=1e-3, gamma=0.99)
rewards = agent.train(env, num_episodes=1000)
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Actor-Critic on CartPole')
plt.show()
Summary
Actor-Critic methods combine the strengths of policy gradients and value functions:
- Architecture: the Actor selects actions, the Critic evaluates them
- Advantage function: $A(s, a) = Q(s, a) - V(s)$ reduces the variance of the gradient estimate
- GAE: an advantage estimator that balances bias and variance
- A2C/A3C: parallelized training for better sample throughput
- Continuous actions: handled with a Gaussian policy

The next chapter covers PPO (Proximal Policy Optimization), currently one of the most popular and stable policy-gradient algorithms.