chore: update project docs, dependencies, and training scripts

- Update requirements.txt: add opencv-python-headless and document the optional uv install path
- Normalize line endings in CSV files (CRLF → LF)
- Update TASK_PROGRESS.md to record the parallel-training implementation and WSL support
- Tidy train_improved.py formatting: remove redundant blank lines and comments
- Fix the character encoding of the course assignment requirements document
- Add new TensorBoard log files and trained models
2026-05-01 09:26:23 +08:00
parent 6b929e9790
commit d6860f1f15
16 changed files with 25712 additions and 25680 deletions
TASK_PROGRESS.md (+37 -16)
@@ -26,6 +26,9 @@
 | ✅ Environment preprocessing | Grayscale + resize (84×84) + frame-stack (4 frames) wrappers | [src/utils.py](src/utils.py) |
 | ✅ Evaluation script | Rendered test runs + multi-episode average-score evaluation | [src/evaluate.py](src/evaluate.py) |
 | ✅ Training entry point | Main training loop, TensorBoard logging, model saving | [train.py](train.py) |
+| ✅ Parallel training | Multi-env parallel rollout collection + WSL support | [train_parallel.py](train_parallel.py) |
+| ✅ WSL scripts | Environment setup + launch scripts | [setup_wsl.sh](setup_wsl.sh), [run_wsl.sh](run_wsl.sh), [start_wsl_training.bat](start_wsl_training.bat) |
+| ✅ Test script | Quick sanity check of the parallel envs and networks | [test_parallel.py](test_parallel.py) |

 **Core algorithm implementation notes**
 - Policy network: 3-layer CNN + FC(512) → μ, σ (Gaussian policy, tanh activation)
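
For orientation, here is a minimal sketch of the policy head that bullet describes (3-layer CNN + FC(512) → μ, σ with a tanh-squashed mean). The layer sizes, kernel shapes, and the `Actor` name are illustrative assumptions, not the repo's exact code:

```python
import torch
import torch.nn as nn

class Actor(nn.Module):
    """Sketch: 3-layer CNN -> FC(512) -> Gaussian (mu, sigma) over 3 actions."""

    def __init__(self, in_channels=4, action_dim=3):
        super().__init__()
        # Conv stack over 4 stacked 84x84 grayscale frames (kernel sizes assumed).
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.LeakyReLU(0.2),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.LeakyReLU(0.2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.LeakyReLU(0.2),
            nn.Flatten(),
        )
        self.fc = nn.Sequential(nn.Linear(64 * 7 * 7, 512), nn.LeakyReLU(0.2))
        self.mu_head = nn.Linear(512, action_dim)             # Gaussian mean
        self.log_std = nn.Parameter(torch.zeros(action_dim))  # state-independent log-std

    def forward(self, x):
        h = self.fc(self.conv(x / 255.0))  # normalize uint8 frames to [0, 1]
        mu = torch.tanh(self.mu_head(h))   # tanh keeps action means in [-1, 1]
        return mu, self.log_std.exp()
```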
@@ -60,9 +63,15 @@
 │   ├── trainer.py            # PPO update logic
 │   ├── utils.py              # environment preprocessing wrappers
 │   └── evaluate.py           # evaluation script
-├── train.py                  # training entry point
+├── train.py                  # single-threaded training entry point
+├── train_parallel.py         # multi-env parallel training (recommended)
+├── setup_wsl.sh              # WSL environment setup
+├── run_wsl.sh                # WSL training launch script
+├── start_wsl_training.bat    # one-click WSL training launcher (Windows)
+├── test_parallel.py          # parallel-training smoke test
 ├── requirements.txt
 ├── README.md
+├── WSL_README.md             # WSL training guide
 └── TASK_PROGRESS.md          # this document
 ```
@@ -70,26 +79,38 @@
 ## 4. Hyperparameter Configuration

-| Parameter | Value |
-|------|-----|
-| Learning rate | 3e-4 |
-| Gamma | 0.99 |
-| GAE lambda | 0.95 |
-| Clip epsilon | 0.2 |
-| PPO epochs | 4 |
-| Mini-batch size | 64 |
-| Rollout steps | 2048 |
-| Entropy coefficient | 0.01 |
-| Value coefficient | 0.5 |
-| Max gradient norm | 0.5 |
-| State shape | (84, 84, 4) |
-| Action dim | 3 (continuous: steer, gas, brake) |
+| Parameter | train.py (single-threaded) | train_parallel.py (parallel) |
+|------|-------------------|--------------------------|
+| Learning rate | 3e-4 | 3e-4 |
+| Gamma | 0.99 | 0.99 |
+| GAE lambda | 0.95 | 0.98 |
+| Clip epsilon | 0.2 | 0.1 |
+| PPO epochs | 4 | 10 |
+| Mini-batch size | 64 | 128 |
+| Rollout steps | 2048 | 2048 |
+| Entropy coefficient | 0.01 | 0.005 |
+| Value coefficient | 0.5 | 0.75 |
+| Max gradient norm | 0.5 | 0.5 |
+| Total steps | 500,000 | 2,000,000 |
+| Num envs | 1 | 4 |
+| Est. duration | ~8 h | ~5 h (4×) |
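
To make the two columns concrete, the same settings can be written down as config dicts. This is a hypothetical layout (the repo may instead hard-code these values or expose them via argparse):

```python
# Mirrors the table above; the dict names and keys are illustrative.
SINGLE = dict(
    lr=3e-4, gamma=0.99, gae_lambda=0.95, clip_eps=0.2, ppo_epochs=4,
    minibatch_size=64, rollout_steps=2048, ent_coef=0.01, vf_coef=0.5,
    max_grad_norm=0.5, total_steps=500_000, num_envs=1,
)
# The parallel run reuses most settings, trading a tighter clip and a smaller
# entropy bonus for more epochs, larger minibatches, and 4 environments.
PARALLEL = dict(
    SINGLE, gae_lambda=0.98, clip_eps=0.1, ppo_epochs=10, minibatch_size=128,
    ent_coef=0.005, vf_coef=0.75, total_steps=2_000_000, num_envs=4,
)
```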
 ---

 ## 5. Next Steps

-### Immediate actions
+### Option A: WSL parallel training (recommended)
+
+```bash
+# On Windows, double-click start_wsl_training.bat
+# or manually:
+wsl
+cd "/mnt/d/Code/doing_exercises/programs/外教作业外快/强化学习个人项目报告"
+chmod +x setup_wsl.sh run_wsl.sh
+./setup_wsl.sh   # first run only
+./run_wsl.sh     # start training
+```
+
+### Option B: Windows single-threaded training
 ```bash
 # 1. Install dependencies
 uv pip install --system -r requirements.txt
requirements.txt
@@ -3,3 +3,8 @@ gymnasium[box2d]
 numpy
 matplotlib
 tensorboard
+opencv-python-headless
+
+# Optional: install with uv instead of pip:
+# curl -LsSf https://astral.sh/uv/install.sh | sh
+# uv pip install -r requirements.txt
train_improved.py
@@ -1,4 +1,5 @@
-"""Improved training script with reward shaping and better hyperparameters."""
+"""Improved training script for CarRacing-v3 PPO with reward shaping."""
+
 import os
 import time
 import argparse
@@ -12,8 +13,6 @@ import cv2
 class RewardShapingWrapper(gym.Wrapper):
     """Add reward shaping for better learning."""
-
-

     def __init__(self, env):
         super().__init__(env)
         self.steps_on_track = 0
@@ -29,17 +28,17 @@ class RewardShapingWrapper(gym.Wrapper):
         shaped_reward = reward
-        if info.get('speed', 0) > 0.1:
-            shaped_reward += info['speed'] * 0.1
+        if info.get("speed", 0) > 0.1:
+            shaped_reward += info["speed"] * 0.1

-        if not info.get('offtrack', False):
+        if not info.get("offtrack", False):
             shaped_reward += 0.1
             self.steps_on_track += 1
         else:
             shaped_reward -= 0.5
             self.steps_on_track = 0

-        if info.get('lap_complete', False):
+        if info.get("lap_complete", False):
             shaped_reward += 100

         return obs, shaped_reward, terminated, truncated, info
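
The wrapper above layers three shaping terms on top of the raw env reward. A hedged usage sketch follows; note that the `speed`, `offtrack`, and `lap_complete` keys are read with defaults, so shaping degrades gracefully if the underlying env's `info` dict lacks them:

```python
import gymnasium as gym

# Hypothetical composition; the repo's actual wrapper order may differ.
env = gym.make("CarRacing-v3", continuous=True)
env = RewardShapingWrapper(env)  # +speed bonus, +0.1/step on track, -0.5 off track, +100 per lap

obs, info = env.reset(seed=0)
obs, shaped_reward, terminated, truncated, info = env.step(env.action_space.sample())
```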
@@ -70,9 +69,7 @@ class FrameStackWrapper(gym.ObservationWrapper):
         self.frames = deque(maxlen=num_stack)
         obs_shape = env.observation_space.shape
         self.observation_space = gym.spaces.Box(
-            low=0, high=255,
-            shape=(num_stack, *obs_shape[-2:]),
-            dtype=np.uint8
+            low=0, high=255, shape=(num_stack, *obs_shape[-2:]), dtype=np.uint8
         )

     def reset(self, **kwargs):
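
The Box space is only reflowed onto one line; behavior is unchanged: observations come out channel-first as `(num_stack, H, W)` uint8. A quick shape check, assuming an upstream grayscale + resize wrapper already yields 84×84 frames and that the constructor takes a `num_stack` argument (`preprocessed_env` is a hypothetical placeholder for that wrapped env):

```python
import numpy as np

env = FrameStackWrapper(preprocessed_env, num_stack=4)  # preprocessed_env: hypothetical 84x84 grayscale env
obs, info = env.reset()
assert obs.shape == (4, 84, 84) and obs.dtype == np.uint8
```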
@@ -179,11 +176,7 @@ class Critic(nn.Module):
         out_h = (out_h - 3) // 1 + 1
         feat_size = 64 * out_h * out_h
-        self.fc = nn.Sequential(
-            nn.Linear(feat_size, 512),
-            nn.LeakyReLU(0.2),
-            nn.Linear(512, 1)
-        )
+        self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.LeakyReLU(0.2), nn.Linear(512, 1))

         for m in self.modules():
             if isinstance(m, (nn.Conv2d, nn.Linear)):
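
The hunk cuts off inside the weight-init loop. Given the "He init" note in the training banner further down, the loop body presumably resembles the following; this is an inferred sketch, not the diff's actual continuation:

```python
# Hypothetical completion of the truncated init loop.
for m in self.modules():
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        # He (Kaiming) init matched to the LeakyReLU(0.2) slope used above.
        nn.init.kaiming_normal_(m.weight, a=0.2, nonlinearity="leaky_relu")
        if m.bias is not None:
            nn.init.zeros_(m.bias)
```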
@@ -286,8 +279,6 @@ class PPOTrainer:
         self.actor_optim = torch.optim.Adam(actor.parameters(), lr=lr, eps=1e-5)
         self.critic_optim = torch.optim.Adam(critic.parameters(), lr=lr, eps=1e-5)
         self.total_updates = 0
-
-

     def update(self, last_value):
         states, actions, rewards, dones, values, log_probs_old = self.buffer.get()
@@ -344,8 +335,6 @@
                 total_entropy += entropy.mean().item()
                 count += 1
-
-
         self.total_updates += 1
         avg_actor = total_actor_loss / count
         avg_critic = total_critic_loss / count
         avg_entropy = total_entropy / count
@@ -388,7 +377,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
     return obs


-def train_improved(
+def train(
     total_steps=2000000,
     rollout_steps=2048,
     eval_interval=10,
@@ -434,12 +423,12 @@ def train_improved(
     print(f"Training on {device}")
     print(f"Log directory: {log_dir}")
-    print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, LR decay, More epochs")
+    print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, More epochs")

     episode = 0
     total_timesteps = 0
     episode_rewards = []
-    best_eval = -float('inf')
+    best_eval = -float("inf")

     while total_timesteps < total_steps:
         obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
@@ -466,7 +455,9 @@
             writer.add_scalar("Reward/Episode", ep_reward, total_timesteps)
             writer.add_scalar("Reward/AvgLast10", avg_reward, total_timesteps)
-            print(f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}")
+            print(
+                f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}"
+            )

         if episode % eval_interval == 0:
             eval_returns = []
@@ -478,7 +469,13 @@
                 while not done:
                     with torch.no_grad():
-                        eval_obs_t = torch.from_numpy(eval_obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
+                        eval_obs_t = (
+                            torch.from_numpy(eval_obs)
+                            .float()
+                            .unsqueeze(0)
+                            .permute(0, 3, 1, 2)
+                            .to(device)
+                        )
                         mu, std = actor(eval_obs_t)
                         action = torch.clamp(mu, -1, 1).squeeze(0).cpu().numpy()
                     eval_obs, reward, terminated, truncated, _ = eval_env.step(action)
@@ -495,33 +492,42 @@ def train_improved(
             if mean_eval > best_eval:
                 best_eval = mean_eval
                 os.makedirs("models", exist_ok=True)
-                torch.save({
+                torch.save(
+                    {
                         "actor": actor.state_dict(),
                         "critic": critic.state_dict(),
                         "episode": episode,
                         "timesteps": total_timesteps,
                         "best_eval": best_eval,
-                }, os.path.join("models", "ppo_improved_best.pt"))
+                    },
+                    os.path.join("models", "ppo_improved_best.pt"),
+                )
                 print(f" New best model saved! eval={best_eval:.2f}")

         if episode % save_interval == 0:
             os.makedirs("models", exist_ok=True)
-            torch.save({
+            torch.save(
+                {
                     "actor": actor.state_dict(),
                     "critic": critic.state_dict(),
                     "episode": episode,
                     "timesteps": total_timesteps,
-            }, os.path.join("models", f"ppo_improved_ep{episode}.pt"))
+                },
+                os.path.join("models", f"ppo_improved_ep{episode}.pt"),
+            )
             print(f" Saved model at episode {episode}")

     os.makedirs("models", exist_ok=True)
-    torch.save({
+    torch.save(
+        {
             "actor": actor.state_dict(),
             "critic": critic.state_dict(),
             "episode": episode,
             "timesteps": total_timesteps,
             "best_eval": best_eval,
-    }, os.path.join("models", "ppo_improved_final.pt"))
+        },
+        os.path.join("models", "ppo_improved_final.pt"),
+    )

     writer.close()
     env.close()
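
Since every checkpoint stores the same dict layout, reloading for evaluation is symmetric. A minimal sketch using the keys saved above (assumes `actor`, `critic`, and `device` are already constructed as in the script):

```python
import os
import torch

ckpt = torch.load(os.path.join("models", "ppo_improved_best.pt"), map_location=device)
actor.load_state_dict(ckpt["actor"])    # weights saved under "actor"
critic.load_state_dict(ckpt["critic"])  # and "critic"
print(f"Loaded episode {ckpt['episode']} ({ckpt['timesteps']} steps), best_eval={ckpt['best_eval']:.2f}")
actor.eval()
```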
@@ -536,4 +542,4 @@ if __name__ == "__main__":
     args = parser.parse_args()

     device = get_device()
-    train_improved(total_steps=args.steps, rollout_steps=args.rollout, device=device)
+    train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
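
Given the `args.steps` / `args.rollout` accesses, the script is presumably launched as `python train_improved.py --steps 2000000 --rollout 2048`; the flag names are inferred from the argparse destinations, with values matching the function defaults and the parallel column of the hyperparameter table.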