chore: 更新项目文档、依赖和训练脚本
- 更新 requirements.txt,添加 opencv-python-headless 并补充 uv 安装说明
- 修复 CSV 文件中的换行符格式(CRLF 转 LF)
- 更新 TASK_PROGRESS.md,记录并行训练实现和 WSL 支持
- 优化 train_improved.py 代码格式,移除多余空行和注释
- 更新课程作业要求文档的字符编码
- 添加新的 TensorBoard 日志文件和训练模型
This commit is contained in:
+37
-16
@@ -26,6 +26,9 @@
|
|||||||
| ✅ 环境预处理 | 灰度化 + Resize(84×84) + 帧堆叠(4帧) Wrapper | [src/utils.py](src/utils.py) |
|
| ✅ 环境预处理 | 灰度化 + Resize(84×84) + 帧堆叠(4帧) Wrapper | [src/utils.py](src/utils.py) |
|
||||||
| ✅ 评估脚本 | 渲染测试 + 多回合平均分数评估 | [src/evaluate.py](src/evaluate.py) |
|
| ✅ 评估脚本 | 渲染测试 + 多回合平均分数评估 | [src/evaluate.py](src/evaluate.py) |
|
||||||
| ✅ 训练入口 | 主训练循环、TensorBoard 记录、模型保存 | [train.py](train.py) |
|
| ✅ 训练入口 | 主训练循环、TensorBoard 记录、模型保存 | [train.py](train.py) |
|
||||||
|
| ✅ 并行训练 | 多环境并行采集 + WSL 支持 | [train_parallel.py](train_parallel.py) |
|
||||||
|
| ✅ WSL 脚本 | 环境配置 + 启动脚本 | [setup_wsl.sh](setup_wsl.sh)、[run_wsl.sh](run_wsl.sh)、[start_wsl_training.bat](start_wsl_training.bat) |
|
||||||
|
| ✅ 测试脚本 | 快速验证并行环境和网络 | [test_parallel.py](test_parallel.py) |
|
||||||
|
|
||||||
**核心算法实现要点**:
|
**核心算法实现要点**:
|
||||||
- 策略网络:3 层 CNN + FC(512) → μ, σ(高斯策略,tanh 激活)
|
- 策略网络:3 层 CNN + FC(512) → μ, σ(高斯策略,tanh 激活)
|
||||||
@@ -60,9 +63,15 @@
|
|||||||
│ ├── trainer.py # PPO 更新逻辑
|
│ ├── trainer.py # PPO 更新逻辑
|
||||||
│ ├── utils.py # 环境预处理 wrappers
|
│ ├── utils.py # 环境预处理 wrappers
|
||||||
│ └── evaluate.py # 评估脚本
|
│ └── evaluate.py # 评估脚本
|
||||||
├── train.py # 主训练入口
|
├── train.py # 单线程训练入口
|
||||||
|
├── train_parallel.py # 多环境并行训练(推荐)
|
||||||
|
├── setup_wsl.sh # WSL 环境配置
|
||||||
|
├── run_wsl.sh # WSL 训练启动脚本
|
||||||
|
├── start_wsl_training.bat # Windows 一键启动 WSL 训练
|
||||||
|
├── test_parallel.py # 并行训练测试
|
||||||
├── requirements.txt
|
├── requirements.txt
|
||||||
├── README.md
|
├── README.md
|
||||||
|
├── WSL_README.md # WSL 训练指南
|
||||||
└── TASK_PROGRESS.md # 本文档
|
└── TASK_PROGRESS.md # 本文档
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -70,26 +79,38 @@
|
|||||||
|
|
||||||
## 四、超参数配置
|
## 四、超参数配置
|
||||||
|
|
||||||
| 参数 | 值 |
|
| 参数 | train.py (单线程) | train_parallel.py (并行) |
|
||||||
|------|-----|
|
|------|-------------------|--------------------------|
|
||||||
| Learning rate | 3e-4 |
|
| Learning rate | 3e-4 | 3e-4 |
|
||||||
| Gamma | 0.99 |
|
| Gamma | 0.99 | 0.99 |
|
||||||
| GAE lambda | 0.95 |
|
| GAE lambda | 0.95 | 0.98 |
|
||||||
| Clip epsilon | 0.2 |
|
| Clip epsilon | 0.2 | 0.1 |
|
||||||
| PPO epochs | 4 |
|
| PPO epochs | 4 | 10 |
|
||||||
| Mini-batch size | 64 |
|
| Mini-batch size | 64 | 128 |
|
||||||
| Rollout steps | 2048 |
|
| Rollout steps | 2048 | 2048 |
|
||||||
| Entropy coefficient | 0.01 |
|
| Entropy coefficient | 0.01 | 0.005 |
|
||||||
| Value coefficient | 0.5 |
|
| Value coefficient | 0.5 | 0.75 |
|
||||||
| Max gradient norm | 0.5 |
|
| Max gradient norm | 0.5 | 0.5 |
|
||||||
| State shape | (84, 84, 4) |
|
| 总步数 | 500,000 | 2,000,000 |
|
||||||
| Action dim | 3(连续:steer, gas, brake) |
|
| 环境数 | 1 | 4 |
|
||||||
|
| 预计时长 | ~8h | ~5h (4x) |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 五、下一步行动
|
## 五、下一步行动
|
||||||
|
|
||||||
### 立即执行
|
### 方案 A:WSL 并行训练(推荐)
|
||||||
|
```bash
|
||||||
|
# Windows 下双击 start_wsl_training.bat
|
||||||
|
# 或手动:
|
||||||
|
wsl
|
||||||
|
cd "/mnt/d/Code/doing_exercises/programs/外教作业外快/强化学习个人项目报告"
|
||||||
|
chmod +x setup_wsl.sh run_wsl.sh
|
||||||
|
./setup_wsl.sh # 首次运行
|
||||||
|
./run_wsl.sh # 开始训练
|
||||||
|
```
|
||||||
|
|
||||||
|
### 方案 B:Windows 单线程训练
|
||||||
```bash
|
```bash
|
||||||
# 1. 安装依赖
|
# 1. 安装依赖
|
||||||
uv pip install --system -r requirements.txt
|
uv pip install --system -r requirements.txt
|
||||||
|
|||||||
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
@@ -3,3 +3,8 @@ gymnasium[box2d]
|
|||||||
numpy
|
numpy
|
||||||
matplotlib
|
matplotlib
|
||||||
tensorboard
|
tensorboard
|
||||||
|
opencv-python-headless
|
||||||
|
|
||||||
|
# uv 安装方式(可选):
|
||||||
|
# curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
# uv pip install -r requirements.txt
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
"""Improved training script with reward shaping and better hyperparameters."""
|
"""Improved training script for CarRacing-v3 PPO with reward shaping."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import argparse
|
import argparse
|
||||||
@@ -12,8 +13,6 @@ import cv2
|
|||||||
|
|
||||||
|
|
||||||
class RewardShapingWrapper(gym.Wrapper):
|
class RewardShapingWrapper(gym.Wrapper):
|
||||||
"""Add reward shaping for better learning."""
|
|
||||||
|
|
||||||
def __init__(self, env):
|
def __init__(self, env):
|
||||||
super().__init__(env)
|
super().__init__(env)
|
||||||
self.steps_on_track = 0
|
self.steps_on_track = 0
|
||||||
@@ -29,17 +28,17 @@ class RewardShapingWrapper(gym.Wrapper):
|
|||||||
|
|
||||||
shaped_reward = reward
|
shaped_reward = reward
|
||||||
|
|
||||||
if info.get('speed', 0) > 0.1:
|
if info.get("speed", 0) > 0.1:
|
||||||
shaped_reward += info['speed'] * 0.1
|
shaped_reward += info["speed"] * 0.1
|
||||||
|
|
||||||
if not info.get('offtrack', False):
|
if not info.get("offtrack", False):
|
||||||
shaped_reward += 0.1
|
shaped_reward += 0.1
|
||||||
self.steps_on_track += 1
|
self.steps_on_track += 1
|
||||||
else:
|
else:
|
||||||
shaped_reward -= 0.5
|
shaped_reward -= 0.5
|
||||||
self.steps_on_track = 0
|
self.steps_on_track = 0
|
||||||
|
|
||||||
if info.get('lap_complete', False):
|
if info.get("lap_complete", False):
|
||||||
shaped_reward += 100
|
shaped_reward += 100
|
||||||
|
|
||||||
return obs, shaped_reward, terminated, truncated, info
|
return obs, shaped_reward, terminated, truncated, info
|
||||||
@@ -70,9 +69,7 @@ class FrameStackWrapper(gym.ObservationWrapper):
|
|||||||
self.frames = deque(maxlen=num_stack)
|
self.frames = deque(maxlen=num_stack)
|
||||||
obs_shape = env.observation_space.shape
|
obs_shape = env.observation_space.shape
|
||||||
self.observation_space = gym.spaces.Box(
|
self.observation_space = gym.spaces.Box(
|
||||||
low=0, high=255,
|
low=0, high=255, shape=(num_stack, *obs_shape[-2:]), dtype=np.uint8
|
||||||
shape=(num_stack, *obs_shape[-2:]),
|
|
||||||
dtype=np.uint8
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def reset(self, **kwargs):
|
def reset(self, **kwargs):
|
||||||
@@ -179,11 +176,7 @@ class Critic(nn.Module):
|
|||||||
out_h = (out_h - 3) // 1 + 1
|
out_h = (out_h - 3) // 1 + 1
|
||||||
feat_size = 64 * out_h * out_h
|
feat_size = 64 * out_h * out_h
|
||||||
|
|
||||||
self.fc = nn.Sequential(
|
self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.LeakyReLU(0.2), nn.Linear(512, 1))
|
||||||
nn.Linear(feat_size, 512),
|
|
||||||
nn.LeakyReLU(0.2),
|
|
||||||
nn.Linear(512, 1)
|
|
||||||
)
|
|
||||||
|
|
||||||
for m in self.modules():
|
for m in self.modules():
|
||||||
if isinstance(m, (nn.Conv2d, nn.Linear)):
|
if isinstance(m, (nn.Conv2d, nn.Linear)):
|
||||||
@@ -235,17 +228,17 @@ class RolloutBuffer:
|
|||||||
last_gae = delta + gamma * gae_lambda * (1 - self.dones[t]) * last_gae
|
last_gae = delta + gamma * gae_lambda * (1 - self.dones[t]) * last_gae
|
||||||
advantages[t] = last_gae
|
advantages[t] = last_gae
|
||||||
|
|
||||||
returns = advantages + self.values[:self.size]
|
returns = advantages + self.values[: self.size]
|
||||||
return returns, advantages
|
return returns, advantages
|
||||||
|
|
||||||
def get(self):
|
def get(self):
|
||||||
return (
|
return (
|
||||||
self.states[:self.size],
|
self.states[: self.size],
|
||||||
self.actions[:self.size],
|
self.actions[: self.size],
|
||||||
self.rewards[:self.size],
|
self.rewards[: self.size],
|
||||||
self.dones[:self.size],
|
self.dones[: self.size],
|
||||||
self.values[:self.size],
|
self.values[: self.size],
|
||||||
self.log_probs[:self.size],
|
self.log_probs[: self.size],
|
||||||
)
|
)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
@@ -286,8 +279,6 @@ class PPOTrainer:
|
|||||||
self.actor_optim = torch.optim.Adam(actor.parameters(), lr=lr, eps=1e-5)
|
self.actor_optim = torch.optim.Adam(actor.parameters(), lr=lr, eps=1e-5)
|
||||||
self.critic_optim = torch.optim.Adam(critic.parameters(), lr=lr, eps=1e-5)
|
self.critic_optim = torch.optim.Adam(critic.parameters(), lr=lr, eps=1e-5)
|
||||||
|
|
||||||
self.total_updates = 0
|
|
||||||
|
|
||||||
def update(self, last_value):
|
def update(self, last_value):
|
||||||
states, actions, rewards, dones, values, log_probs_old = self.buffer.get()
|
states, actions, rewards, dones, values, log_probs_old = self.buffer.get()
|
||||||
|
|
||||||
@@ -344,8 +335,6 @@ class PPOTrainer:
|
|||||||
total_entropy += entropy.mean().item()
|
total_entropy += entropy.mean().item()
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
self.total_updates += 1
|
|
||||||
|
|
||||||
avg_actor = total_actor_loss / count
|
avg_actor = total_actor_loss / count
|
||||||
avg_critic = total_critic_loss / count
|
avg_critic = total_critic_loss / count
|
||||||
avg_entropy = total_entropy / count
|
avg_entropy = total_entropy / count
|
||||||
@@ -388,7 +377,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
|
|||||||
return obs
|
return obs
|
||||||
|
|
||||||
|
|
||||||
def train_improved(
|
def train(
|
||||||
total_steps=2000000,
|
total_steps=2000000,
|
||||||
rollout_steps=2048,
|
rollout_steps=2048,
|
||||||
eval_interval=10,
|
eval_interval=10,
|
||||||
@@ -434,12 +423,12 @@ def train_improved(
|
|||||||
|
|
||||||
print(f"Training on {device}")
|
print(f"Training on {device}")
|
||||||
print(f"Log directory: {log_dir}")
|
print(f"Log directory: {log_dir}")
|
||||||
print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, LR decay, More epochs")
|
print("Improvements: LeakyReLU, BatchNorm, He init, Reward shaping, More epochs")
|
||||||
|
|
||||||
episode = 0
|
episode = 0
|
||||||
total_timesteps = 0
|
total_timesteps = 0
|
||||||
episode_rewards = []
|
episode_rewards = []
|
||||||
best_eval = -float('inf')
|
best_eval = -float("inf")
|
||||||
|
|
||||||
while total_timesteps < total_steps:
|
while total_timesteps < total_steps:
|
||||||
obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
|
obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
|
||||||
@@ -457,7 +446,7 @@ def train_improved(
|
|||||||
total_timesteps += rollout_steps
|
total_timesteps += rollout_steps
|
||||||
episode += 1
|
episode += 1
|
||||||
|
|
||||||
ep_reward = buffer.rewards[:buffer.size].sum()
|
ep_reward = buffer.rewards[: buffer.size].sum()
|
||||||
episode_rewards.append(ep_reward)
|
episode_rewards.append(ep_reward)
|
||||||
|
|
||||||
recent_rewards = episode_rewards[-10:] if len(episode_rewards) >= 10 else episode_rewards
|
recent_rewards = episode_rewards[-10:] if len(episode_rewards) >= 10 else episode_rewards
|
||||||
@@ -466,7 +455,9 @@ def train_improved(
|
|||||||
writer.add_scalar("Reward/Episode", ep_reward, total_timesteps)
|
writer.add_scalar("Reward/Episode", ep_reward, total_timesteps)
|
||||||
writer.add_scalar("Reward/AvgLast10", avg_reward, total_timesteps)
|
writer.add_scalar("Reward/AvgLast10", avg_reward, total_timesteps)
|
||||||
|
|
||||||
print(f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}")
|
print(
|
||||||
|
f"Episode {episode}, steps {total_timesteps}, ep_reward={ep_reward:.1f}, avg_10={avg_reward:.1f}"
|
||||||
|
)
|
||||||
|
|
||||||
if episode % eval_interval == 0:
|
if episode % eval_interval == 0:
|
||||||
eval_returns = []
|
eval_returns = []
|
||||||
@@ -478,7 +469,13 @@ def train_improved(
|
|||||||
|
|
||||||
while not done:
|
while not done:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
eval_obs_t = torch.from_numpy(eval_obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
|
eval_obs_t = (
|
||||||
|
torch.from_numpy(eval_obs)
|
||||||
|
.float()
|
||||||
|
.unsqueeze(0)
|
||||||
|
.permute(0, 3, 1, 2)
|
||||||
|
.to(device)
|
||||||
|
)
|
||||||
mu, std = actor(eval_obs_t)
|
mu, std = actor(eval_obs_t)
|
||||||
action = torch.clamp(mu, -1, 1).squeeze(0).cpu().numpy()
|
action = torch.clamp(mu, -1, 1).squeeze(0).cpu().numpy()
|
||||||
eval_obs, reward, terminated, truncated, _ = eval_env.step(action)
|
eval_obs, reward, terminated, truncated, _ = eval_env.step(action)
|
||||||
@@ -495,33 +492,42 @@ def train_improved(
|
|||||||
if mean_eval > best_eval:
|
if mean_eval > best_eval:
|
||||||
best_eval = mean_eval
|
best_eval = mean_eval
|
||||||
os.makedirs("models", exist_ok=True)
|
os.makedirs("models", exist_ok=True)
|
||||||
torch.save({
|
torch.save(
|
||||||
|
{
|
||||||
"actor": actor.state_dict(),
|
"actor": actor.state_dict(),
|
||||||
"critic": critic.state_dict(),
|
"critic": critic.state_dict(),
|
||||||
"episode": episode,
|
"episode": episode,
|
||||||
"timesteps": total_timesteps,
|
"timesteps": total_timesteps,
|
||||||
"best_eval": best_eval,
|
"best_eval": best_eval,
|
||||||
}, os.path.join("models", "ppo_improved_best.pt"))
|
},
|
||||||
|
os.path.join("models", "ppo_improved_best.pt"),
|
||||||
|
)
|
||||||
print(f" New best model saved! eval={best_eval:.2f}")
|
print(f" New best model saved! eval={best_eval:.2f}")
|
||||||
|
|
||||||
if episode % save_interval == 0:
|
if episode % save_interval == 0:
|
||||||
os.makedirs("models", exist_ok=True)
|
os.makedirs("models", exist_ok=True)
|
||||||
torch.save({
|
torch.save(
|
||||||
|
{
|
||||||
"actor": actor.state_dict(),
|
"actor": actor.state_dict(),
|
||||||
"critic": critic.state_dict(),
|
"critic": critic.state_dict(),
|
||||||
"episode": episode,
|
"episode": episode,
|
||||||
"timesteps": total_timesteps,
|
"timesteps": total_timesteps,
|
||||||
}, os.path.join("models", f"ppo_improved_ep{episode}.pt"))
|
},
|
||||||
|
os.path.join("models", f"ppo_improved_ep{episode}.pt"),
|
||||||
|
)
|
||||||
print(f" Saved model at episode {episode}")
|
print(f" Saved model at episode {episode}")
|
||||||
|
|
||||||
os.makedirs("models", exist_ok=True)
|
os.makedirs("models", exist_ok=True)
|
||||||
torch.save({
|
torch.save(
|
||||||
|
{
|
||||||
"actor": actor.state_dict(),
|
"actor": actor.state_dict(),
|
||||||
"critic": critic.state_dict(),
|
"critic": critic.state_dict(),
|
||||||
"episode": episode,
|
"episode": episode,
|
||||||
"timesteps": total_timesteps,
|
"timesteps": total_timesteps,
|
||||||
"best_eval": best_eval,
|
"best_eval": best_eval,
|
||||||
}, os.path.join("models", "ppo_improved_final.pt"))
|
},
|
||||||
|
os.path.join("models", "ppo_improved_final.pt"),
|
||||||
|
)
|
||||||
|
|
||||||
writer.close()
|
writer.close()
|
||||||
env.close()
|
env.close()
|
||||||
@@ -536,4 +542,4 @@ if __name__ == "__main__":
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
device = get_device()
|
device = get_device()
|
||||||
train_improved(total_steps=args.steps, rollout_steps=args.rollout, device=device)
|
train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
|
||||||
|
|||||||
Reference in New Issue
Block a user