chore: 更新项目文档、依赖和训练脚本

- 更新 requirements.txt，添加 opencv-python-headless 并补充 uv 安装说明
- 修复 CSV 文件中的换行符格式（CRLF 转 LF）
- 更新 TASK_PROGRESS.md，记录并行训练实现和 WSL 支持
- 优化 train_improved.py 代码格式，移除多余空行和注释
- 更新课程作业要求文档的字符编码
- 添加新的 TensorBoard 日志文件和训练模型
2026-05-01 09:26:23 +08:00
parent 6b929e9790
commit d6860f1f15
16 changed files with 25712 additions and 25680 deletions
@@ -26,6 +26,9 @@
 | ✅ 环境预处理 | 灰度化 + Resize(84×84) + 帧堆叠(4帧) Wrapper | [src/utils.py](src/utils.py) |
 | ✅ 评估脚本 | 渲染测试 + 多回合平均分数评估 | [src/evaluate.py](src/evaluate.py) |
 | ✅ 训练入口 | 主训练循环、TensorBoard 记录、模型保存 | [train.py](train.py) |
+| ✅ 并行训练 | 多环境并行采集 + WSL 支持 | [train_parallel.py](train_parallel.py) |
+| ✅ WSL 脚本 | 环境配置 + 启动脚本 | [setup_wsl.sh](setup_wsl.sh)、[run_wsl.sh](run_wsl.sh)、[start_wsl_training.bat](start_wsl_training.bat) |
+| ✅ 测试脚本 | 快速验证并行环境和网络 | [test_parallel.py](test_parallel.py) |
 **核心算法实现要点**：
 - 策略网络：3 层 CNN + FC(512) → μ, σ（高斯策略，tanh 激活）
@@ -60,9 +63,15 @@
 │   ├── trainer.py          # PPO 更新逻辑
 │   ├── utils.py           # 环境预处理 wrappers
 │   └── evaluate.py         # 评估脚本
-├── train.py                 # 主训练入口
+├── train.py                 # 单线程训练入口
+├── train_parallel.py        # 多环境并行训练（推荐）
+├── setup_wsl.sh             # WSL 环境配置
+├── run_wsl.sh               # WSL 训练启动脚本
+├── start_wsl_training.bat   # Windows 一键启动 WSL 训练
+├── test_parallel.py         # 并行训练测试
 ├── requirements.txt
 ├── README.md
+├── WSL_README.md            # WSL 训练指南
 └── TASK_PROGRESS.md         # 本文档
 ```
@@ -70,26 +79,38 @@
 ## 四、超参数配置
-| 参数 | 值 |
+| 参数 | train.py (单线程) | train_parallel.py (并行) |
-|------|-----|
+|------|-------------------|--------------------------|
-| Learning rate | 3e-4 |
+| Learning rate | 3e-4 | 3e-4 |
-| Gamma | 0.99 |
+| Gamma | 0.99 | 0.99 |
-| GAE lambda | 0.95 |
+| GAE lambda | 0.95 | 0.98 |
-| Clip epsilon | 0.2 |
+| Clip epsilon | 0.2 | 0.1 |
-| PPO epochs | 4 |
+| PPO epochs | 4 | 10 |
-| Mini-batch size | 64 |
+| Mini-batch size | 64 | 128 |
-| Rollout steps | 2048 |
+| Rollout steps | 2048 | 2048 |
-| Entropy coefficient | 0.01 |
+| Entropy coefficient | 0.01 | 0.005 |
-| Value coefficient | 0.5 |
+| Value coefficient | 0.5 | 0.75 |
-| Max gradient norm | 0.5 |
+| Max gradient norm | 0.5 | 0.5 |
-| State shape | (84, 84, 4) |
+| 总步数 | 500,000 | 2,000,000 |
-| Action dim | 3（连续：steer, gas, brake） |
+| 环境数 | 1 | 4 |
 | 预计时长 | ~8h | ~5h (4x) |
 ---
 ## 五、下一步行动
-### 立即执行
+### 方案 A：WSL 并行训练（推荐）
 ```bash
 # Windows 下双击 start_wsl_training.bat
 # 或手动：
 wsl
 cd "/mnt/d/Code/doing_exercises/programs/外教作业外快/强化学习个人项目报告"
 chmod +x setup_wsl.sh run_wsl.sh
 ./setup_wsl.sh   # 首次运行
 ./run_wsl.sh     # 开始训练
 ```
 ### 方案 B：Windows 单线程训练
 ```bash
 # 1. 安装依赖
 uv pip install --system -r requirements.txt
@@ -3,3 +3,8 @@ gymnasium[box2d]
 numpy
 matplotlib
 tensorboard
 opencv-python-headless
 # uv 安装方式（可选）:
 # curl -LsSf https://astral.sh/uv/install.sh | sh
 # uv pip install -r requirements.txt
@@ -1,4 +1,5 @@
-"""Improved training script with reward shaping and better hyperparameters."""
+"""Improved training script for CarRacing-v3 PPO with reward shaping."""
 import os
 import time
 import argparse
@@ -12,8 +13,6 @@ import cv2
 class RewardShapingWrapper(gym.Wrapper):
    """Add reward shaping for better learning."""
    def __init__(self, env):
        super().__init__(env)
        self.steps_on_track = 0
@@ -29,17 +28,17 @@ class RewardShapingWrapper(gym.Wrapper):
        shaped_reward = reward
-        if info.get('speed', 0) > 0.1:
+        if info.get("speed", 0) > 0.1:
-            shaped_reward += info['speed'] * 0.1
+            shaped_reward += info["speed"] * 0.1
-        if not info.get('offtrack', False):
+        if not info.get("offtrack", False):
            shaped_reward += 0.1
            self.steps_on_track += 1
        else:
            shaped_reward -= 0.5
            self.steps_on_track = 0
-        if info.get('lap_complete', False):
+        if info.get("lap_complete", False):
            shaped_reward += 100
        return obs, shaped_reward, terminated, truncated, info
@@ -70,9 +69,7 @@ class FrameStackWrapper(gym.ObservationWrapper):
        self.frames = deque(maxlen=num_stack)
        obs_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
-            low=0, high=255,
+            low=0, high=255, shape=(num_stack, *obs_shape[-2:]), dtype=np.uint8
-            shape=(num_stack, *obs_shape[-2:]),
-            dtype=np.uint8
        )
    def reset(self, **kwargs):
@@ -179,11 +176,7 @@ class Critic(nn.Module):
        out_h = (out_h - 3) // 1 + 1
        feat_size = 64 * out_h * out_h
-        self.fc = nn.Sequential(
+        self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.LeakyReLU(0.2), nn.Linear(512, 1))
-            nn.Linear(feat_size, 512),
-            nn.LeakyReLU(0.2),
-            nn.Linear(512, 1)
-        )
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
@@ -235,17 +228,17 @@ class RolloutBuffer:
            last_gae = delta + gamma * gae_lambda * (1 - self.dones[t]) * last_gae
            advantages[t] = last_gae
-        returns = advantages + self.values[:self.size]
+        returns = advantages + self.values[: self.size]
        return returns, advantages
    def get(self):
        return (
-            self.states[:self.size],
+            self.states[: self.size],
-            self.actions[:self.size],
+            self.actions[: self.size],
-            self.rewards[:self.size],
+            self.rewards[: self.size],
-            self.dones[:self.size],
+            self.dones[: self.size],
-            self.values[:self.size],
+            self.values[: self.size],
-            self.log_probs[:self.size],
+            self.log_probs[: self.size],
        )
    def reset(self):
@@ -286,8 +279,6 @@ class PPOTrainer:
        self.actor_optim = torch.optim.Adam(actor.parameters(), lr=lr, eps=1e-5)
        self.critic_optim = torch.optim.Adam(critic.parameters(), lr=lr, eps=1e-5)
        self.total_updates = 0
    def update(self, last_value):
        states, actions, rewards, dones, values, log_probs_old = self.buffer.get()
@@ -344,8 +335,6 @@ class PPOTrainer:
                total_entropy += entropy.mean().item()
                count += 1
        self.total_updates += 1
        avg_actor = total_actor_loss / count
        avg_critic = total_critic_loss / count
        avg_entropy = total_entropy / count
@@ -388,7 +377,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
    return obs
-def train_improved(
+def train(
    total_steps=2000000,
    rollout_steps=2048,
    eval_interval=10,
@@ -434,12 +423,12 @@ def train_improved(
    print(f"Training on {device}")
    print(f"Log directory: {log_dir}")
-    print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, LR decay, More epochs")
+    print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, More epochs")
    episode = 0
    total_timesteps = 0
    episode_rewards = []
-    best_eval = -float('inf')
+    best_eval = -float("inf")
    while total_timesteps < total_steps:
        obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
@@ -457,7 +446,7 @@ def train_improved(
        total_timesteps += rollout_steps
        episode += 1
-        ep_reward = buffer.rewards[:buffer.size].sum()
+        ep_reward = buffer.rewards[: buffer.size].sum()
        episode_rewards.append(ep_reward)
        recent_rewards = episode_rewards[-10:] if len(episode_rewards) >= 10 else episode_rewards
@@ -466,7 +455,9 @@ def train_improved(
        writer.add_scalar("Reward/Episode", ep_reward, total_timesteps)
        writer.add_scalar("Reward/AvgLast10", avg_reward, total_timesteps)
-        print(f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}")
+        print(
+            f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}"
+        )
        if episode % eval_interval == 0:
            eval_returns = []
@@ -478,7 +469,13 @@ def train_improved(
                while not done:
                    with torch.no_grad():
-                        eval_obs_t = torch.from_numpy(eval_obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
+                        eval_obs_t = (
+                            torch.from_numpy(eval_obs)
+                            .float()
+                            .unsqueeze(0)
+                            .permute(0, 3, 1, 2)
+                            .to(device)
+                        )
                        mu, std = actor(eval_obs_t)
                        action = torch.clamp(mu, -1, 1).squeeze(0).cpu().numpy()
                    eval_obs, reward, terminated, truncated, _ = eval_env.step(action)
@@ -495,33 +492,42 @@ def train_improved(
            if mean_eval > best_eval:
                best_eval = mean_eval
                os.makedirs("models", exist_ok=True)
-                torch.save({
+                torch.save(
                    {
                        "actor": actor.state_dict(),
                        "critic": critic.state_dict(),
                        "episode": episode,
                        "timesteps": total_timesteps,
                        "best_eval": best_eval,
-                }, os.path.join("models", "ppo_improved_best.pt"))
+                    },
                    os.path.join("models", "ppo_improved_best.pt"),
                )
                print(f"  New best model saved! eval={best_eval:.2f}")
        if episode % save_interval == 0:
            os.makedirs("models", exist_ok=True)
-            torch.save({
+            torch.save(
                {
                    "actor": actor.state_dict(),
                    "critic": critic.state_dict(),
                    "episode": episode,
                    "timesteps": total_timesteps,
-            }, os.path.join("models", f"ppo_improved_ep{episode}.pt"))
+                },
                os.path.join("models", f"ppo_improved_ep{episode}.pt"),
            )
            print(f"  Saved model at episode {episode}")
    os.makedirs("models", exist_ok=True)
-    torch.save({
+    torch.save(
        {
            "actor": actor.state_dict(),
            "critic": critic.state_dict(),
            "episode": episode,
            "timesteps": total_timesteps,
            "best_eval": best_eval,
-    }, os.path.join("models", "ppo_improved_final.pt"))
+        },
        os.path.join("models", "ppo_improved_final.pt"),
    )
    writer.close()
    env.close()
@@ -536,4 +542,4 @@ if __name__ == "__main__":
    args = parser.parse_args()
    device = get_device()
-    train_improved(total_steps=args.steps, rollout_steps=args.rollout, device=device)
+    train(total_steps=args.steps, rollout_steps=args.rollout, device=device)