跳到主要内容

Actor-Critic 方法

Actor-Critic 方法结合了基于价值和基于策略两类方法的优点。它使用两个神经网络:Actor(演员)负责选择动作,Critic(评论家)负责评估动作的价值。这种架构能够同时享受策略梯度的灵活性和价值函数的低方差优势。

Actor-Critic 架构

核心思想

Actor-Critic 的核心思想是将策略梯度中的 $Q(s,a)$ 替换为 Critic 网络的估计值:

$\nabla_\theta J(\theta) = \mathbb{E}[\nabla_\theta \log \pi_\theta(a|s) \, Q_w(s,a)]$

其中:

  • $\theta$ 是 Actor 网络的参数
  • $w$ 是 Critic 网络的参数

架构示意

        状态 s


┌──────────────┐
│ Actor 网络 │ ──→ 动作概率 π(a|s)
│ (策略网络) │
└──────────────┘


┌──────────────┐
│ Critic 网络 │ ──→ 状态价值 V(s) 或 Q(s,a)
│ (价值网络) │
└──────────────┘

优势函数

在 Actor-Critic 中,通常使用优势函数(Advantage Function)替代 Q 值:

$A(s,a) = Q(s,a) - V(s)$

优势函数表示动作 $a$ 相对于平均动作的优势程度:

  • $A(s,a) > 0$:动作比平均好
  • $A(s,a) < 0$:动作比平均差

使用优势函数可以进一步降低方差。

Actor-Critic 算法实现

基础 Actor-Critic

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

class ActorCritic(nn.Module):
    """Shared-trunk network with a softmax policy head (actor) and a
    scalar state-value head (critic) for discrete action spaces."""

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()

        # Trunk shared by both heads.
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
        )

        # Policy head: probabilities over the discrete actions.
        self.actor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        )

        # Value head: scalar estimate of V(s).
        self.critic = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x):
        """Return (action_probs, state_value) for a batch of states."""
        features = self.shared(x)
        return self.actor(features), self.critic(features)

    def select_action(self, state):
        """Sample an action for a single unbatched state.

        Returns (action_index, log_prob_tensor, value_tensor); the two
        tensors keep their autograd graphs for the later loss computation.
        """
        batched = torch.FloatTensor(state).unsqueeze(0)
        probs, value = self.forward(batched)
        dist = torch.distributions.Categorical(probs)
        sampled = dist.sample()
        return sampled.item(), dist.log_prob(sampled), value

class ActorCriticAgent:
    """Actor-critic agent trained once per complete episode.

    The actor and critic share a trunk (see ActorCritic) and are optimized
    jointly with a single Adam optimizer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128, lr=1e-3, gamma=0.99):
        self.model = ActorCritic(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma  # discount factor

    def update(self, log_probs, values, rewards, dones):
        """Run one gradient step on a full episode.

        Args:
            log_probs: list of 0-dim tensors, log pi(a_t|s_t) (with graph).
            values: list of (1, 1) tensors, critic estimates V(s_t) (with graph).
            rewards: list of scalar rewards r_t.
            dones: list of episode-termination flags.

        Returns:
            The combined loss as a Python float.
        """
        # Discounted returns, computed backwards; a done flag zeroes the
        # bootstrap so returns never leak across an episode boundary.
        returns = []
        R = 0
        for r, done in zip(reversed(rewards), reversed(dones)):
            R = r + self.gamma * R * (1 - done)
            returns.insert(0, R)

        returns = torch.tensor(returns, dtype=torch.float32)
        log_probs = torch.stack(log_probs)
        # FIX: squeeze(-1), not squeeze() — a bare squeeze() collapses a
        # single-step episode's (1, 1) values to a 0-dim tensor, which then
        # broadcasts against the (1,)-shaped `returns` in the MSE below.
        values = torch.cat(values).squeeze(-1)

        # Advantages are treated as constants for the policy-gradient term.
        advantages = returns - values.detach()

        actor_loss = -(log_probs * advantages).mean()
        critic_loss = F.mse_loss(values, returns)

        # 0.5 down-weights the value loss relative to the policy loss.
        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def train(self, env, num_episodes=1000):
        """Train on a Gymnasium-style env; returns per-episode reward history."""
        rewards_history = []

        for episode in range(num_episodes):
            state, _ = env.reset()
            log_probs = []
            values = []
            rewards = []
            dones = []
            total_reward = 0

            done = False
            while not done:
                action, log_prob, value = self.model.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)
                dones.append(done)

                state = next_state
                total_reward += reward

            # One update per finished episode (Monte-Carlo actor-critic).
            self.update(log_probs, values, rewards, dones)
            rewards_history.append(total_reward)

            if (episode + 1) % 50 == 0:
                avg_reward = np.mean(rewards_history[-50:])
                print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.2f}")

        return rewards_history

A2C(Advantage Actor-Critic)

A2C 是 Actor-Critic 的同步版本,使用多个并行环境收集经验:

import torch.multiprocessing as mp

class A2C:
    """Synchronous advantage actor-critic: short rollouts gathered by
    several workers feed a single shared model."""

    def __init__(self, state_dim, action_dim, hidden_dim=128, lr=1e-3,
                 gamma=0.99, num_workers=4):
        self.model = ActorCritic(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.num_workers = num_workers

    def collect_experience(self, env, worker_id):
        """Roll the shared policy forward 5 steps in `env`.

        Returns (experiences, total_reward), where each experience is the
        tuple (state, action, log_prob, value, reward, done). The env is
        reset in place whenever an episode finishes mid-rollout.
        """
        obs, _ = env.reset()
        rollout = []
        accumulated_reward = 0

        for _ in range(5):
            action, log_prob, value = self.model.select_action(obs)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            finished = terminated or truncated

            rollout.append((obs, action, log_prob, value, reward, finished))
            obs = next_obs
            accumulated_reward += reward

            if finished:
                obs, _ = env.reset()

        return rollout, accumulated_reward

    def compute_gae(self, rewards, values, dones, next_value, gamma=0.99, lam=0.95):
        """Generalized advantage estimation over one rollout.

        Walks the rollout backwards; a done flag resets the running GAE
        accumulator so advantages never bootstrap across episode boundaries.
        """
        gae = 0
        backwards = []

        for t in range(len(rewards) - 1, -1, -1):
            r, v, done = rewards[t], values[t], dones[t]
            if done:
                gae = r - v
            else:
                gae = (r + gamma * next_value - v) + gamma * lam * gae
            backwards.append(gae)
            next_value = v

        backwards.reverse()
        return backwards

A3C(Asynchronous Advantage Actor-Critic)

A3C 是 A2C 的异步版本,每个 worker 独立更新全局网络:

class A3CWorker(mp.Process):
    """Asynchronous A3C worker process.

    Keeps a local copy of the global model, runs full episodes with it,
    then pushes its gradients onto the shared global model.

    NOTE(review): relies on module-level `gym` (imported elsewhere in this
    file as `import gymnasium as gym`) being importable before instantiation.
    """

    def __init__(self, worker_id, global_model, optimizer, env_name,
                 state_dim, action_dim, gamma=0.99, max_episodes=1000):
        super().__init__()
        self.worker_id = worker_id
        self.global_model = global_model  # shared-memory ActorCritic
        self.optimizer = optimizer        # optimizer over the GLOBAL parameters
        self.env = gym.make(env_name)
        self.local_model = ActorCritic(state_dim, action_dim)
        self.gamma = gamma
        self.max_episodes = max_episodes

    def run(self):
        """Process entry point: sync from global, collect an episode, update."""
        for episode in range(self.max_episodes):
            self.local_model.load_state_dict(self.global_model.state_dict())

            state, _ = self.env.reset()
            log_probs = []
            values = []
            rewards = []

            done = False
            while not done:
                action, log_prob, value = self.local_model.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(reward)
                state = next_state

            self.update_global(log_probs, values, rewards)

    def compute_returns(self, rewards):
        """Discounted Monte-Carlo returns for one COMPLETE episode.

        FIX: this method was called by update_global but never defined.
        No done-masking is needed because run() only collects full episodes.
        """
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        return torch.tensor(returns, dtype=torch.float32)

    def update_global(self, log_probs, values, rewards):
        """Compute the actor-critic loss locally and step the global model."""
        returns = self.compute_returns(rewards)

        log_probs = torch.stack(log_probs)
        values = torch.cat(values).squeeze()

        # Detach so the critic target does not flow into the actor term.
        advantages = returns - values.detach()

        actor_loss = -(log_probs * advantages).mean()
        critic_loss = F.mse_loss(values, returns)
        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()

        # Copy local gradients onto the global parameters before stepping.
        for global_param, local_param in zip(self.global_model.parameters(),
                                             self.local_model.parameters()):
            global_param._grad = local_param.grad

        self.optimizer.step()

优势函数估计

TD 误差

最简单的优势估计是 TD 误差:

$A(s,a) = r + \gamma V(s') - V(s)$

GAE(Generalized Advantage Estimation)

GAE 是一种更优的优势估计方法,平衡了偏差和方差:

$A^{GAE}(\gamma, \lambda) = \sum_{l=0}^{\infty} (\gamma \lambda)^l \delta_{t+l}$

其中 $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$ 是 TD 误差。

def compute_gae(rewards, values, next_value, dones, gamma=0.99, lam=0.95):
    """Generalized advantage estimation for one trajectory segment.

    Args:
        rewards, values, dones: per-step lists of the same length.
        next_value: critic estimate bootstrapping the final step.
        gamma: discount factor; lam: GAE lambda (bias/variance trade-off).

    Returns:
        (advantages, returns), both lists aligned with the inputs, where
        returns[t] = advantages[t] + values[t].
    """
    n = len(rewards)
    advantages = [0.0] * n
    gae = 0

    # Backward recursion: delta_t + gamma*lam*delta_{t+1} + ...
    # The (1 - done) mask cuts bootstrapping at episode boundaries.
    for t in range(n - 1, -1, -1):
        bootstrap = next_value if t == n - 1 else values[t + 1]
        delta = rewards[t] + gamma * bootstrap * (1 - dones[t]) - values[t]
        gae = delta + gamma * lam * (1 - dones[t]) * gae
        advantages[t] = gae

    returns = [adv + val for adv, val in zip(advantages, values)]
    return advantages, returns

连续动作空间的 Actor-Critic

对于连续动作空间,Actor 输出高斯分布的参数:

class ContinuousActorCritic(nn.Module):
    """Actor-critic for continuous actions: the actor head outputs the mean
    of a diagonal Gaussian with a state-independent learnable log-std."""

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()

        # Two-layer trunk shared by policy and value heads.
        self.shared = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )

        # Gaussian policy: per-action mean plus a shared learnable log-std.
        self.actor_mean = nn.Linear(hidden_dim, action_dim)
        self.actor_log_std = nn.Parameter(torch.zeros(action_dim))

        # Scalar state-value head.
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return (action_mean, action_std, state_value) for a batch."""
        trunk_out = self.shared(x)
        mean = self.actor_mean(trunk_out)
        std = self.actor_log_std.exp()
        value = self.critic(trunk_out)
        return mean, std, value

    def select_action(self, state):
        """Sample an action for a single unbatched state.

        Returns (action ndarray, summed log-prob tensor, value tensor).
        """
        mean, std, value = self.forward(torch.FloatTensor(state).unsqueeze(0))
        dist = torch.distributions.Normal(mean, std)
        sampled = dist.sample()
        log_prob = dist.log_prob(sampled).sum(-1)
        return sampled.squeeze().numpy(), log_prob, value

训练技巧

梯度裁剪

# Clip the global gradient norm to stabilize training; call this between
# loss.backward() and optimizer.step().
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)

价值函数系数

# Combined objective: policy loss plus a weighted value loss, minus an
# entropy bonus that encourages exploration.
loss = actor_loss + value_coef * critic_loss - entropy_coef * entropy

学习率调整

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group."""
    first_group = optimizer.param_groups[0]
    return first_group['lr']

# Multiply the learning rate by 0.95 every 100 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.95)

完整示例

import gymnasium as gym

# Build the environment and read its state/action dimensions.
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Train the basic actor-critic agent defined earlier in this document.
agent = ActorCriticAgent(state_dim, action_dim, hidden_dim=128, lr=1e-3, gamma=0.99)
rewards = agent.train(env, num_episodes=1000)

# Plot the per-episode reward curve.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Actor-Critic on CartPole')
plt.show()

小结

Actor-Critic 方法结合了策略梯度和价值函数的优点:

  • 架构:Actor 选择动作,Critic 评估价值
  • 优势函数:$A(s,a) = Q(s,a) - V(s)$,降低方差
  • GAE:平衡偏差和方差的优势估计
  • A2C/A3C:并行化训练提高效率
  • 连续动作:使用高斯策略处理连续动作空间

下一章将介绍 PPO(近端策略优化),它是目前最流行、最稳定的策略梯度算法之一。