Gymnasium 环境接口
Gymnasium 是 OpenAI Gym 的维护分支,提供了标准化的强化学习环境接口。它包含了丰富的预定义环境,并支持自定义环境的开发。掌握 Gymnasium 是进行强化学习实践的基础。
Gymnasium 简介
什么是 Gymnasium?
Gymnasium 是一个用于开发和比较强化学习算法的工具包。它提供了:
- 统一的环境接口
- 丰富的预定义环境
- 环境包装器系统
- 自定义环境开发支持
安装
pip install gymnasium
# 安装特定环境
pip install gymnasium[atari]
pip install gymnasium[box2d]
pip install gymnasium[mujoco]
基本使用
创建环境
import gymnasium as gym
env = gym.make('CartPole-v1')
环境交互循环
import gymnasium as gym
env = gym.make('CartPole-v1', render_mode='human')
observation, info = env.reset(seed=42)
for _ in range(1000):
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)
if terminated or truncated:
observation, info = env.reset()
env.close()
核心方法
| 方法 | 说明 |
|---|---|
| reset() | 重置环境,返回初始观测 |
| step(action) | 执行动作,返回 (obs, reward, terminated, truncated, info) |
| render() | 渲染环境画面 |
| close() | 关闭环境 |
环境空间
观测空间(Observation Space)
import gymnasium as gym
env = gym.make('CartPole-v1')
print(env.observation_space)
# Box(-4.8, 4.8, (4,), float32)
print(env.observation_space.shape)
# (4,)
print(env.observation_space.low)
# [-4.8 -inf -0.41887903 -inf]
print(env.observation_space.high)
# [4.8 inf 0.41887903 inf]
动作空间(Action Space)
# 离散动作空间
env = gym.make('CartPole-v1')
print(env.action_space)
# Discrete(2)
print(env.action_space.n)
# 2
# 连续动作空间
env = gym.make('Pendulum-v1')
print(env.action_space)
# Box(-2.0, 2.0, (1,), float32)
print(env.action_space.shape)
# (1,)
空间类型
| 空间类型 | 说明 | 示例 |
|---|---|---|
| Box | 连续空间,n维数组 | 图像、位置坐标 |
| Discrete | 离散空间,整数 | 动作选择 |
| MultiDiscrete | 多维离散空间 | 多个离散动作 |
| MultiBinary | 多维二进制空间 | 多个开关 |
| Dict | 字典空间 | 多种观测组合 |
| Tuple | 元组空间 | 多个空间组合 |
经典环境
控制任务
# 倒立摆
env = gym.make('CartPole-v1')
# 月球着陆器
env = gym.make('LunarLander-v2')
# 钟摆
env = gym.make('Pendulum-v1')
# 山地车
env = gym.make('MountainCar-v0')
# 双足机器人
env = gym.make('BipedalWalker-v3')
Atari 游戏
import gymnasium as gym
# Breakout
env = gym.make('ALE/Breakout-v5')
# Pong
env = gym.make('ALE/Pong-v5')
# Space Invaders
env = gym.make('ALE/SpaceInvaders-v5')
# 使用 Atari 预处理
from gymnasium.wrappers import AtariPreprocessing
# The ALE v5 environments already skip frames internally (frameskip=4), and
# AtariPreprocessing applies its own frame skip (frame_skip=4 by default), so
# the base environment must be created with frameskip=1 — otherwise the
# wrapper raises an error about double frame-skipping.
env = gym.make('ALE/Breakout-v5', frameskip=1)
env = AtariPreprocessing(env)
MuJoCo 环境
# 半猎豹
env = gym.make('HalfCheetah-v4')
# 人形机器人
env = gym.make('Humanoid-v4')
# 蚂蚁
env = gym.make('Ant-v4')
# 机械臂
env = gym.make('Reacher-v4')
环境包装器
包装器是修改环境行为的强大工具。
常用包装器
import gymnasium as gym
from gymnasium.wrappers import (
TimeLimit,
RecordVideo,
NormalizeObservation,
NormalizeReward,
FlattenObservation,
TransformObservation,
TransformReward
)
env = gym.make('CartPole-v1')
# 限制最大步数
env = TimeLimit(env, max_episode_steps=500)
# 记录视频
env = RecordVideo(env, video_folder='./videos')
# 归一化观测
env = NormalizeObservation(env)
# 归一化奖励
env = NormalizeReward(env)
自定义包装器
class CustomWrapper(gym.Wrapper):
    """Example wrapper: passes reset through unchanged and doubles every reward."""

    def __init__(self, env):
        super().__init__(env)

    def reset(self, **kwargs):
        # Delegate directly to the wrapped environment.
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        # Scale the reward by two before handing it back to the caller.
        return obs, 2 * reward, terminated, truncated, info
env = gym.make('CartPole-v1')
env = CustomWrapper(env)
观测包装器
class ScaleObservation(gym.ObservationWrapper):
    """Observation wrapper that multiplies each observation by a constant factor."""

    def __init__(self, env, scale=1.0):
        super().__init__(env)
        self.scale = scale
        # Rescale the declared bounds so the advertised space matches the
        # transformed observations.
        # NOTE(review): assumes scale >= 0 so low <= high still holds — confirm.
        space = env.observation_space
        self.observation_space = gym.spaces.Box(
            low=space.low * scale,
            high=space.high * scale,
            dtype=space.dtype
        )

    def observation(self, obs):
        # Apply the same factor used for the space bounds.
        return obs * self.scale
奖励包装器
class ClipReward(gym.RewardWrapper):
    """Reward wrapper that clips every reward into [min_reward, max_reward]."""

    def __init__(self, env, min_reward=-1, max_reward=1):
        super().__init__(env)
        # Bounds are kept as public attributes so callers can inspect them.
        self.max_reward = max_reward
        self.min_reward = min_reward

    def reward(self, reward):
        lo, hi = self.min_reward, self.max_reward
        return np.clip(reward, lo, hi)
自定义环境
创建自定义环境
import gymnasium as gym
import numpy as np
class CustomEnv(gym.Env):
    """Minimal Gymnasium environment: random 4-d observations, binary actions.

    Action 0 yields reward +1.0 and action 1 yields -1.0; episodes never end
    on their own (terminated and truncated are always False).
    """

    metadata = {'render_modes': ['human', 'rgb_array'], 'render_fps': 30}

    def __init__(self, render_mode=None):
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(2)
        self.render_mode = render_mode
        # Placeholders for render resources (unused in this minimal example).
        self.window = None
        self.clock = None

    def reset(self, seed=None, options=None):
        # Seed the bundled RNG via the base class, then draw a fresh state.
        super().reset(seed=seed)
        self.state = self.np_random.random(4).astype(np.float32)
        return self.state, {}

    def step(self, action):
        # The reward depends only on the chosen action, not on the state.
        reward = 1.0 if action == 0 else -1.0
        self.state = self.np_random.random(4).astype(np.float32)
        return self.state, reward, False, False, {}

    def render(self):
        if self.render_mode == 'rgb_array':
            # Placeholder frame; a real environment would draw its state here.
            return np.zeros((100, 100, 3), dtype=np.uint8)

    def close(self):
        pass
注册自定义环境
from gymnasium.envs.registration import register
register(
id='CustomEnv-v0',
entry_point='custom_env:CustomEnv',
max_episode_steps=1000,
)
env = gym.make('CustomEnv-v0')
向量化环境
向量化环境可以并行运行多个环境实例。
使用向量化环境
import gymnasium as gym
from gymnasium.vector import SyncVectorEnv, AsyncVectorEnv
# 同步向量化环境
envs = SyncVectorEnv([
lambda: gym.make('CartPole-v1'),
lambda: gym.make('CartPole-v1'),
lambda: gym.make('CartPole-v1'),
])
# 异步向量化环境(多进程)
envs = AsyncVectorEnv([
lambda: gym.make('CartPole-v1') for _ in range(4)
])
observations, infos = envs.reset()
actions = envs.action_space.sample()
observations, rewards, terminateds, truncateds, infos = envs.step(actions)
envs.close()
使用 gym.make_vec
# Gymnasium has no `make_vec_env` (that helper lives in Stable Baselines3);
# the built-in vector-environment constructor is `gym.make_vec`.
import gymnasium as gym

envs = gym.make_vec('CartPole-v1', num_envs=4)
环境检查
Gymnasium 提供了环境检查工具:
from gymnasium.utils.env_checker import check_env
env = CustomEnv()
check_env(env, warn=True)
实用工具
环境信息
import gymnasium as gym
env = gym.make('CartPole-v1')
print(f"观测空间: {env.observation_space}")
print(f"动作空间: {env.action_space}")
print(f"最大步数: {env.spec.max_episode_steps}")
print(f"奖励阈值: {env.spec.reward_threshold}")
随机动作
env = gym.make('CartPole-v1')
obs, info = env.reset()
for _ in range(100):
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)
if terminated or truncated:
obs, info = env.reset()
环境规范
from gymnasium.wrappers import TimeLimit
env = gym.make('CartPole-v1')
env = TimeLimit(env, max_episode_steps=500)
完整示例
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
def train_random_agent(env_name='CartPole-v1', num_episodes=100):
    """Run a uniformly random policy and return the list of per-episode returns."""
    env = gym.make(env_name)
    episode_returns = []
    for _ in range(num_episodes):
        obs, info = env.reset()
        ep_return = 0
        while True:
            # Sample a random action and accumulate its reward.
            obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
            ep_return += reward
            if terminated or truncated:
                break
        episode_returns.append(ep_return)
    env.close()
    return episode_returns
rewards = train_random_agent()
plt.figure(figsize=(10, 5))
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Random Agent on CartPole')
plt.show()
print(f"平均奖励: {np.mean(rewards):.2f}")
print(f"最大奖励: {np.max(rewards):.2f}")
小结
Gymnasium 是强化学习实践的基础工具:
- 统一接口:reset、step、render、close
- 空间定义:Box、Discrete、MultiDiscrete 等
- 环境包装器:修改环境行为的强大工具
- 自定义环境:继承 gym.Env 创建自己的环境
- 向量化环境:并行运行多个环境实例
下一章将介绍 Stable Baselines3,它提供了可靠的强化学习算法实现。