"""并行环境 DQN 训练脚本 - 使用 AsyncVectorEnv 加速数据收集. 每个训练迭代并行采集 N 个环境的转移,批量 GPU 推理,显著提升 FPS。 适合在 AutoDL 等多核服务器+GPU 环境下运行。 """ import sys import os sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) import argparse import time import numpy as np import torch import torch.nn.functional as F from collections import deque from src.network import QNetwork, DuelingQNetwork from src.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer from src.utils import make_env, get_device # ── 环境工厂函数(供 AsyncVectorEnv 子进程使用)── def _make_env_fn(env_id): """环境工厂 - 必须在模块级别以便 multiprocessing pickle.""" # AsyncVectorEnv 子进程需要独立注册 ALE try: import ale_py import gymnasium as gym gym.register_envs(ale_py) except ImportError: pass def _make(): return make_env(env_id, gray_scale=True, resize=True, frame_stack=4) return _make # ── 并行训练器 ── class ParallelTrainer: """并行环境 DQN 训练器. 使用 AsyncVectorEnv 并行运行 N 个环境, 同时收集转移 + 批量推理,大幅提升训练速度。 """ def __init__( self, agent, envs, eval_env, num_envs, save_dir="models", eval_freq=10000, save_freq=50000, num_eval_episodes=10, warmup_steps=10000, n_steps_per_env=1, ): self.agent = agent self.envs = envs self.eval_env = eval_env self.num_envs = num_envs self.save_dir = save_dir self.eval_freq = eval_freq self.save_freq = save_freq self.num_eval_episodes = num_eval_episodes self.warmup_steps = warmup_steps self.n_steps_per_env = n_steps_per_env self.episode_rewards = deque(maxlen=100) self.eval_rewards = [] self.best_eval_reward = -float("inf") def train(self, total_steps): """主并行训练循环. Args: total_steps: 总环境交互步数 """ num_envs = self.num_envs device = self.agent.device envs = self.envs print(f"开始并行训练,总步数: {total_steps:,}") print(f"并行环境数: {num_envs}") print(f"预热步数: {self.warmup_steps:,}") print("=" * 60) # 重置所有环境 states, _ = envs.reset() episode_rewards = np.zeros(num_envs, dtype=np.float32) episode_lengths = np.zeros(num_envs, dtype=np.int32) episode_count = 0 start_time = time.time() step = 0 while step < total_steps: # ── 动作选择 ── if step < self.warmup_steps: actions = np.array([envs.single_action_space.sample() for _ in range(num_envs)]) else: actions = self._batch_select_actions(states) # ── 环境步进(N 个环境并行)── next_states, rewards, terminateds, truncateds, _ = envs.step(actions) dones = np.logical_or(terminateds, truncateds) # ── 存储转移 ── for i in range(num_envs): self.agent.replay_buffer.add( states[i], actions[i], rewards[i], next_states[i], dones[i] ) # ── 统计 ── episode_rewards += rewards episode_lengths += 1 # 处理结束的 episode for i in range(num_envs): if dones[i]: self.episode_rewards.append(episode_rewards[i]) episode_count += 1 episode_rewards[i] = 0 episode_lengths[i] = 0 step += num_envs states = next_states # ── 训练(环境每步一个 mini-batch)── if step >= self.warmup_steps: self.agent.train_step() # ── 进度打印 ── if episode_count > 0 and episode_count % 10 == 0: avg_reward = np.mean(self.episode_rewards) if self.episode_rewards else 0 elapsed = time.time() - start_time fps = step / elapsed current_lr = self.agent.optimizer.param_groups[0]["lr"] print( f"Step: {step:>10,} | " f"Ep: {episode_count:>5} | " f"AvgReward: {avg_reward:>7.1f} | " f"Epsilon: {self.agent.epsilon:.3f} | " f"LR: {current_lr:.2e} | " f"FPS: {fps:.0f}" ) # ── 定期评估 ── if step % self.eval_freq == 0 and step > 0: eval_reward = self.evaluate() self.eval_rewards.append((step, eval_reward)) print(f"\n[Eval] Step: {step:>10,} | AvgReward: {eval_reward:.1f}\n" + "-" * 60) if eval_reward > self.best_eval_reward: self.best_eval_reward = eval_reward self.agent.save(f"{self.save_dir}/dqn_best.pt") # ── 定期保存 ── if step % 
self.save_freq == 0: self.agent.save(f"{self.save_dir}/dqn_step_{step}.pt") # 训练结束 total_time = time.time() - start_time print("\n" + "=" * 60) print(f"训练完成!总时间: {total_time:.1f} 秒") print(f"平均 FPS: {total_steps / total_time:.0f}") print(f"最佳评估回报: {self.best_eval_reward:.1f}") self.agent.save(f"{self.save_dir}/dqn_final.pt") def _batch_select_actions(self, states): """批量选择动作(使用 GPU 批量推理).""" epsilon = self.agent.epsilon num_envs = len(states) # 随机探索 random_mask = np.random.random(num_envs) < epsilon actions = np.zeros(num_envs, dtype=np.int64) # 对非随机的环境做批量推理 non_random = ~random_mask if non_random.any(): state_tensor = ( torch.from_numpy(states[non_random]).float().to(self.agent.device) ) with torch.no_grad(): q_values = self.agent.q_network(state_tensor) actions[non_random] = q_values.argmax(dim=1).cpu().numpy() # 随机的环境 if random_mask.any(): actions[random_mask] = np.random.randint( 0, self.agent.num_actions, size=random_mask.sum() ) return actions def evaluate(self): """评估智能体.""" rewards = [] for _ in range(self.num_eval_episodes): state, _ = self.eval_env.reset() episode_reward = 0 done = False while not done: action = self.agent.select_action(state, evaluate=True) state, reward, terminated, truncated, _ = self.eval_env.step(action) done = terminated or truncated episode_reward += reward rewards.append(episode_reward) return np.mean(rewards) # ── 主入口 ── def main(): parser = argparse.ArgumentParser(description="Parallel DQN for Space Invaders") # 并行参数 parser.add_argument("--n-envs", type=int, default=8, help="并行环境数") # 训练参数 parser.add_argument("--env", type=str, default="ALE/SpaceInvaders-v5") parser.add_argument("--steps", type=int, default=10_000_000, help="总训练步数") parser.add_argument("--lr", type=float, default=5e-5, help="学习率") parser.add_argument("--gamma", type=float, default=0.99, help="折扣因子") parser.add_argument("--batch-size", type=int, default=64, help="批次大小") parser.add_argument("--buffer-size", type=int, default=500_000, help="回放缓冲区大小") # ε-greedy parser.add_argument("--epsilon-start", type=float, default=1.0) parser.add_argument("--epsilon-end", type=float, default=0.01) parser.add_argument("--epsilon-decay", type=int, default=2_000_000) # 网络 parser.add_argument("--target-update", type=int, default=1000) parser.add_argument("--double-dqn", action="store_true", default=True) parser.add_argument("--dueling", action="store_true", default=True) # 学习率 parser.add_argument("--lr-decay-steps", type=int, default=5_000_000) parser.add_argument("--lr-decay-factor", type=float, default=0.5) parser.add_argument("--warmup-steps", type=int, default=10_000) # 评估 parser.add_argument("--eval-freq", type=int, default=50000) parser.add_argument("--eval-episodes", type=int, default=10) parser.add_argument("--save-freq", type=int, default=100000) # 优先回放 parser.add_argument("--prioritized", action="store_true", default=True) # 其他 parser.add_argument("--seed", type=int, default=42) parser.add_argument("--save-dir", type=str, default="models") parser.add_argument("--log-dir", type=str, default="logs") args = parser.parse_args() # 随机种子 torch.manual_seed(args.seed) np.random.seed(args.seed) # 设备 device = get_device() # 创建并行训练环境 print(f"创建 {args.n_envs} 个并行训练环境...") try: from gymnasium.vector import AsyncVectorEnv env_fns = [_make_env_fn(args.env) for _ in range(args.n_envs)] envs = AsyncVectorEnv(env_fns, shared_memory=True) except ImportError: print("AsyncVectorEnv 不可用,回退到 SyncVectorEnv") from gymnasium.vector import SyncVectorEnv env_fns = [_make_env_fn(args.env) for _ in range(args.n_envs)] envs = 
SyncVectorEnv(env_fns) # 创建评估环境(单环境) eval_env = make_env(args.env, gray_scale=True, resize=True, frame_stack=4) num_actions = envs.single_action_space.n print(f"动作空间: {num_actions}") print(f"实际环境数: {envs.num_envs}") state_shape = (4, 84, 84) # 创建网络 if args.dueling: print("使用 Dueling Double DQN") q_network = DuelingQNetwork(state_shape, num_actions).to(device) target_network = DuelingQNetwork(state_shape, num_actions).to(device) else: print("使用标准 DQN") q_network = QNetwork(state_shape, num_actions).to(device) target_network = QNetwork(state_shape, num_actions).to(device) target_network.load_state_dict(q_network.state_dict()) target_network.eval() print(f"网络参数量: {sum(p.numel() for p in q_network.parameters()):,}") # 回放缓冲区 if args.prioritized: print("使用优先经验回放") replay_buffer = PrioritizedReplayBuffer(args.buffer_size, state_shape, device) else: print("使用标准经验回放") replay_buffer = ReplayBuffer(args.buffer_size, state_shape, device) # 创建 Agent from src.agent import DQNAgent agent = DQNAgent( q_network=q_network, target_network=target_network, replay_buffer=replay_buffer, device=device, num_actions=num_actions, gamma=args.gamma, lr=args.lr, epsilon_start=args.epsilon_start, epsilon_end=args.epsilon_end, epsilon_decay_steps=args.epsilon_decay, target_update_freq=args.target_update, batch_size=args.batch_size, double_dqn=args.double_dqn, lr_decay_steps=args.lr_decay_steps, lr_decay_factor=args.lr_decay_factor, warmup_steps=args.warmup_steps, ) # 创建训练器 trainer = ParallelTrainer( agent=agent, envs=envs, eval_env=eval_env, num_envs=args.n_envs, save_dir=args.save_dir, eval_freq=args.eval_freq, save_freq=args.save_freq, num_eval_episodes=args.eval_episodes, warmup_steps=args.warmup_steps, ) # 打印配置 print("\n训练配置:") print(f" 并行环境数: {args.n_envs}") print(f" 总步数: {args.steps:,}") print(f" 学习率: {args.lr} (Warmup: {args.warmup_steps:,} 步)") print(f" ε衰减: {args.epsilon_start} -> {args.epsilon_end} ({args.epsilon_decay:,} 步)") print(f" 批次大小: {args.batch_size}") print(f" 缓冲区大小: {args.buffer_size:,}") print(f" Double DQN: {args.double_dqn}") print(f" Dueling: {args.dueling}") print("=" * 60) trainer.train(args.steps) if __name__ == "__main__": main()
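

# Example invocation (a sketch: the filename "train_parallel.py" is an assumption,
# substitute the actual name of this script). All flags used below are defined in
# main() above; defaults match the configuration printed at startup:
#
#   python train_parallel.py --n-envs 8 --steps 10000000 --eval-freq 50000 --save-dir models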