chore: update project docs, dependencies, and training scripts

- Update requirements.txt: add opencv-python-headless and add uv installation notes
- Fix line endings in CSV files (CRLF → LF)
- Update TASK_PROGRESS.md to record the parallel-training implementation and WSL support
- Tidy up train_improved.py formatting, removing redundant blank lines and comments
- Fix the character encoding of the course assignment requirements document
- Add new TensorBoard log files and trained models
+37
-16
@@ -26,6 +26,9 @@
| ✅ Environment preprocessing | Grayscale + resize (84×84) + frame-stack (4 frames) wrappers | [src/utils.py](src/utils.py) |
| ✅ Evaluation script | Rendered test runs + multi-episode average-score evaluation | [src/evaluate.py](src/evaluate.py) |
| ✅ Training entry point | Main training loop, TensorBoard logging, model saving | [train.py](train.py) |
| ✅ Parallel training | Parallel rollout collection across multiple environments + WSL support | [train_parallel.py](train_parallel.py) |
| ✅ WSL scripts | Environment setup + launch scripts | [setup_wsl.sh](setup_wsl.sh), [run_wsl.sh](run_wsl.sh), [start_wsl_training.bat](start_wsl_training.bat) |
| ✅ Test script | Quick sanity check of the parallel environments and networks | [test_parallel.py](test_parallel.py) |

**Core algorithm implementation notes**:

- Policy network: 3-layer CNN + FC(512) → μ, σ (Gaussian policy, tanh activation); see the sketch below
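For reference, a minimal PyTorch sketch of the Gaussian policy head described in the bullet above (the class name, hidden-size handling, and the state-independent log-std parameter are illustrative assumptions, not code taken from this repository):

```python
import torch
import torch.nn as nn


class GaussianPolicyHead(nn.Module):
    """Illustrative sketch: CNN features -> FC(512) -> (mu, sigma)."""

    def __init__(self, feat_size: int, action_dim: int = 3):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.ReLU())
        self.mu_head = nn.Linear(512, action_dim)              # action mean
        self.log_std = nn.Parameter(torch.zeros(action_dim))   # state-independent std (assumption)

    def forward(self, features: torch.Tensor):
        h = self.fc(features)
        mu = torch.tanh(self.mu_head(h))          # tanh keeps actions in [-1, 1]
        std = self.log_std.exp().expand_as(mu)
        return mu, std
```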
@@ -60,9 +63,15 @@
│   ├── trainer.py              # PPO update logic
│   ├── utils.py                # environment preprocessing wrappers
│   └── evaluate.py             # evaluation script
├── train.py                    # main training entry point
├── train.py                    # single-process training entry point
├── train_parallel.py           # multi-environment parallel training (recommended)
├── setup_wsl.sh                # WSL environment setup
├── run_wsl.sh                  # WSL training launch script
├── start_wsl_training.bat      # one-click WSL training launcher for Windows
├── test_parallel.py            # parallel-training test script
├── requirements.txt
├── README.md
├── WSL_README.md               # WSL training guide
└── TASK_PROGRESS.md            # this document
```
@@ -70,26 +79,38 @@
## 4. Hyperparameter Configuration

| Parameter | Value |
|------|-----|
| Learning rate | 3e-4 |
| Gamma | 0.99 |
| GAE lambda | 0.95 |
| Clip epsilon | 0.2 |
| PPO epochs | 4 |
| Mini-batch size | 64 |
| Rollout steps | 2048 |
| Entropy coefficient | 0.01 |
| Value coefficient | 0.5 |
| Max gradient norm | 0.5 |
| State shape | (84, 84, 4) |
| Action dim | 3 (continuous: steer, gas, brake) |
| Parameter | train.py (single-process) | train_parallel.py (parallel) |
|------|-------------------|--------------------------|
| Learning rate | 3e-4 | 3e-4 |
| Gamma | 0.99 | 0.99 |
| GAE lambda | 0.95 | 0.98 |
| Clip epsilon | 0.2 | 0.1 |
| PPO epochs | 4 | 10 |
| Mini-batch size | 64 | 128 |
| Rollout steps | 2048 | 2048 |
| Entropy coefficient | 0.01 | 0.005 |
| Value coefficient | 0.5 | 0.75 |
| Max gradient norm | 0.5 | 0.5 |
| Total steps | 500,000 | 2,000,000 |
| Number of environments | 1 | 4 |
| Estimated duration | ~8 h | ~5 h (4×) |
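To make the table concrete, here is a minimal sketch of where the clip epsilon, value coefficient, entropy coefficient, and max gradient norm enter a single PPO update step (function and variable names are illustrative assumptions and do not mirror trainer.py or train_improved.py):

```python
import torch
import torch.nn.functional as F


def ppo_step(log_probs_new, log_probs_old, advantages, values_pred, returns, entropy,
             clip_eps=0.2, value_coef=0.5, entropy_coef=0.01):
    """Illustrative clipped-surrogate PPO loss for one mini-batch."""
    ratio = (log_probs_new - log_probs_old).exp()
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    value_loss = F.mse_loss(values_pred, returns)
    # Combined loss; gradients would then be clipped with
    # torch.nn.utils.clip_grad_norm_(params, max_norm=0.5) before the optimizer step.
    return policy_loss + value_coef * value_loss - entropy_coef * entropy.mean()
```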
---
## 5. Next Steps

### Immediate actions
### Option A: Parallel training on WSL (recommended)
```bash
# On Windows: double-click start_wsl_training.bat
# Or manually:
wsl
cd "/mnt/d/Code/doing_exercises/programs/外教作业外快/强化学习个人项目报告"
chmod +x setup_wsl.sh run_wsl.sh
./setup_wsl.sh   # first run only
./run_wsl.sh     # start training
```

### Option B: Single-process training on Windows
```bash
# 1. Install dependencies
uv pip install --system -r requirements.txt
```
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
@@ -3,3 +3,8 @@ gymnasium[box2d]
numpy
matplotlib
tensorboard
opencv-python-headless

# uv installation (optional):
# curl -LsSf https://astral.sh/uv/install.sh | sh
# uv pip install -r requirements.txt
@@ -1,4 +1,5 @@
"""Improved training script with reward shaping and better hyperparameters."""
"""Improved training script for CarRacing-v3 PPO with reward shaping."""

import os
import time
import argparse
@@ -12,8 +13,6 @@ import cv2
class RewardShapingWrapper(gym.Wrapper):
    """Add reward shaping for better learning."""

    def __init__(self, env):
        super().__init__(env)
        self.steps_on_track = 0
@@ -29,17 +28,17 @@ class RewardShapingWrapper(gym.Wrapper):
        shaped_reward = reward

        if info.get('speed', 0) > 0.1:
            shaped_reward += info['speed'] * 0.1
        if info.get("speed", 0) > 0.1:
            shaped_reward += info["speed"] * 0.1

        if not info.get('offtrack', False):
        if not info.get("offtrack", False):
            shaped_reward += 0.1
            self.steps_on_track += 1
        else:
            shaped_reward -= 0.5
            self.steps_on_track = 0

        if info.get('lap_complete', False):
        if info.get("lap_complete", False):
            shaped_reward += 100

        return obs, shaped_reward, terminated, truncated, info
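A hypothetical usage sketch for the wrapper above (the environment id, keyword arguments, and wrapper order are assumptions for illustration; the actual wiring lives in the training script):

```python
import gymnasium as gym

# Assumed wiring: the shaping wrapper sits directly on the base environment;
# the grayscale/resize/frame-stack preprocessing wrappers would be added on top.
env = gym.make("CarRacing-v3", continuous=True)
env = RewardShapingWrapper(env)  # class defined in the diff above

obs, info = env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
```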
@@ -70,9 +69,7 @@ class FrameStackWrapper(gym.ObservationWrapper):
        self.frames = deque(maxlen=num_stack)
        obs_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255,
            shape=(num_stack, *obs_shape[-2:]),
            dtype=np.uint8
            low=0, high=255, shape=(num_stack, *obs_shape[-2:]), dtype=np.uint8
        )

    def reset(self, **kwargs):
@@ -179,11 +176,7 @@ class Critic(nn.Module):
        out_h = (out_h - 3) // 1 + 1
        feat_size = 64 * out_h * out_h

        self.fc = nn.Sequential(
            nn.Linear(feat_size, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 1)
        )
        self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.LeakyReLU(0.2), nn.Linear(512, 1))

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
@@ -286,8 +279,6 @@ class PPOTrainer:
        self.actor_optim = torch.optim.Adam(actor.parameters(), lr=lr, eps=1e-5)
        self.critic_optim = torch.optim.Adam(critic.parameters(), lr=lr, eps=1e-5)

        self.total_updates = 0

    def update(self, last_value):
        states, actions, rewards, dones, values, log_probs_old = self.buffer.get()
@@ -344,8 +335,6 @@ class PPOTrainer:
                total_entropy += entropy.mean().item()
                count += 1

        self.total_updates += 1

        avg_actor = total_actor_loss / count
        avg_critic = total_critic_loss / count
        avg_entropy = total_entropy / count
@@ -388,7 +377,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
    return obs


def train_improved(
def train(
    total_steps=2000000,
    rollout_steps=2048,
    eval_interval=10,
@@ -434,12 +423,12 @@ def train_improved(
    print(f"Training on {device}")
    print(f"Log directory: {log_dir}")
    print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, LR decay, More epochs")
    print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, More epochs")

    episode = 0
    total_timesteps = 0
    episode_rewards = []
    best_eval = -float('inf')
    best_eval = -float("inf")

    while total_timesteps < total_steps:
        obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
@@ -466,7 +455,9 @@ def train_improved(
            writer.add_scalar("Reward/Episode", ep_reward, total_timesteps)
            writer.add_scalar("Reward/AvgLast10", avg_reward, total_timesteps)

            print(f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}")
            print(
                f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}"
            )

            if episode % eval_interval == 0:
                eval_returns = []
@@ -478,7 +469,13 @@ def train_improved(
                    while not done:
                        with torch.no_grad():
                            eval_obs_t = torch.from_numpy(eval_obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
                            eval_obs_t = (
                                torch.from_numpy(eval_obs)
                                .float()
                                .unsqueeze(0)
                                .permute(0, 3, 1, 2)
                                .to(device)
                            )
                            mu, std = actor(eval_obs_t)
                        action = torch.clamp(mu, -1, 1).squeeze(0).cpu().numpy()
                        eval_obs, reward, terminated, truncated, _ = eval_env.step(action)
@@ -495,33 +492,42 @@ def train_improved(
                if mean_eval > best_eval:
                    best_eval = mean_eval
                    os.makedirs("models", exist_ok=True)
                    torch.save({
                    torch.save(
                        {
                            "actor": actor.state_dict(),
                            "critic": critic.state_dict(),
                            "episode": episode,
                            "timesteps": total_timesteps,
                            "best_eval": best_eval,
                        }, os.path.join("models", "ppo_improved_best.pt"))
                        },
                        os.path.join("models", "ppo_improved_best.pt"),
                    )
                    print(f" New best model saved! eval={best_eval:.2f}")

            if episode % save_interval == 0:
                os.makedirs("models", exist_ok=True)
                torch.save({
                torch.save(
                    {
                        "actor": actor.state_dict(),
                        "critic": critic.state_dict(),
                        "episode": episode,
                        "timesteps": total_timesteps,
                    }, os.path.join("models", f"ppo_improved_ep{episode}.pt"))
                    },
                    os.path.join("models", f"ppo_improved_ep{episode}.pt"),
                )
                print(f" Saved model at episode {episode}")

    os.makedirs("models", exist_ok=True)
    torch.save({
    torch.save(
        {
            "actor": actor.state_dict(),
            "critic": critic.state_dict(),
            "episode": episode,
            "timesteps": total_timesteps,
            "best_eval": best_eval,
        }, os.path.join("models", "ppo_improved_final.pt"))
        },
        os.path.join("models", "ppo_improved_final.pt"),
    )

    writer.close()
    env.close()
@@ -536,4 +542,4 @@ if __name__ == "__main__":
    args = parser.parse_args()

    device = get_device()
    train_improved(total_steps=args.steps, rollout_steps=args.rollout, device=device)
    train(total_steps=args.steps, rollout_steps=args.rollout, device=device)