fix(ppo): correct log-prob dimensions and state tensor format
- Fix the log_probs dimension in the rollout buffer: (buffer_size, action_dim) -> (buffer_size,)
- Fix the state tensor format during training: convert (N, H, W, C) to (N, C, H, W)
- Update collect_rollout to return the final observation and correct the log_prob computation
- Add the project config file and a training-curve plotting script
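The layout fix follows PyTorch's convention: Conv2d expects channels-first (N, C, H, W) input, while the environment produces channels-last (H, W, C) frames. A minimal sketch of the conversion (the batch size 32 is illustrative; the (84, 84, 4) frame shape matches the networks changed below):

    import torch

    # States arrive channels-last: (N, H, W, C) = (32, 84, 84, 4).
    states = torch.rand(32, 84, 84, 4)

    # Conv2d expects channels-first: (N, C, H, W).
    states_nchw = states.permute(0, 3, 1, 2)
    assert states_nchw.shape == (32, 4, 84, 84)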
@@ -0,0 +1,107 @@
"""Generate training plots from TensorBoard logs."""
import os

import numpy as np
from tensorboard.backend.event_processing import event_accumulator
import matplotlib.pyplot as plt


def extract_metrics(log_dir):
    """Extract metrics from TensorBoard log directory."""
    ea = event_accumulator.EventAccumulator(log_dir)
    ea.Reload()

    metrics = {}
    for tag in ea.Tags()['scalars']:
        events = ea.Scalars(tag)
        steps = [e.step for e in events]
        values = [e.value for e in events]
        metrics[tag] = {'steps': steps, 'values': values}

    return metrics


def smooth(data, weight=0.6):
    """Exponential moving average for smoothing."""
    smoothed = []
    last = data[0]
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed


def plot_training_curves(metrics, save_path):
    """Plot training curves."""
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    episodes = metrics.get('Reward/Episode', {}).get('steps', [])
    ep_rewards = metrics.get('Reward/Episode', {}).get('values', [])
    avg_rewards = metrics.get('Reward/AvgLast10', {}).get('values', [])

    if episodes and ep_rewards:
        axes[0, 0].plot(episodes, ep_rewards, alpha=0.3, label='Episode Reward')
        if avg_rewards:
            axes[0, 0].plot(episodes, smooth(avg_rewards), 'r-', linewidth=2, label='Smoothed (EMA)')
        axes[0, 0].set_xlabel('Training Steps')
        axes[0, 0].set_ylabel('Episode Reward')
        axes[0, 0].set_title('Training Episode Reward')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)

    eval_steps = metrics.get('Eval/MeanReturn', {}).get('steps', [])
    eval_returns = metrics.get('Eval/MeanReturn', {}).get('values', [])

    if eval_steps and eval_returns:
        axes[0, 1].plot(eval_steps, eval_returns, 'g-', linewidth=2, marker='o', markersize=4)
        axes[0, 1].set_xlabel('Episode')
        axes[0, 1].set_ylabel('Mean Evaluation Return')
        axes[0, 1].set_title('Evaluation Performance')
        axes[0, 1].grid(True, alpha=0.3)

    actor_loss_steps = metrics.get('Loss/Actor', {}).get('steps', [])
    actor_losses = metrics.get('Loss/Actor', {}).get('values', [])

    if actor_loss_steps and actor_losses:
        axes[1, 0].plot(actor_loss_steps, smooth(actor_losses), 'b-', linewidth=1.5)
        axes[1, 0].set_xlabel('Training Steps')
        axes[1, 0].set_ylabel('Actor Loss')
        axes[1, 0].set_title('Actor Loss Over Training')
        axes[1, 0].grid(True, alpha=0.3)

    critic_loss_steps = metrics.get('Loss/Critic', {}).get('steps', [])
    critic_losses = metrics.get('Loss/Critic', {}).get('values', [])

    if critic_loss_steps and critic_losses:
        axes[1, 1].plot(critic_loss_steps, smooth(critic_losses), 'purple', linewidth=1.5)
        axes[1, 1].set_xlabel('Training Steps')
        axes[1, 1].set_ylabel('Critic Loss')
        axes[1, 1].set_title('Critic Loss Over Training')
        axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Plots saved to {save_path}")


def main():
    log_base = 'logs/tensorboard'

    runs = sorted([d for d in os.listdir(log_base) if os.path.isdir(os.path.join(log_base, d))])

    if not runs:
        print("No runs found!")
        return

    latest_run = os.path.join(log_base, runs[-1])
    print(f"Analyzing run: {runs[-1]}")

    metrics = extract_metrics(latest_run)

    plot_training_curves(metrics, 'training_curves.png')

    print("\nExtracted metrics:")
    for tag, data in metrics.items():
        if data['values']:
            values = np.array(data['values'])
            print(f"  {tag}: min={values.min():.2f}, max={values.max():.2f}, final={values[-1]:.2f}")


if __name__ == '__main__':
    main()
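For reference, smooth() is the standard exponential moving average s_t = w * s_(t-1) + (1 - w) * x_t, seeded with the first data point. A quick worked example (weight=0.5 chosen for easy arithmetic):

    >>> smooth([0, 1, 1, 1], weight=0.5)
    [0.0, 0.5, 0.75, 0.875]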
11 binary files changed (not shown).
@@ -0,0 +1,39 @@
[project]
name = "ppo-carracing"
version = "0.1.0"
description = "PPO (Proximal Policy Optimization) for CarRacing-v3 environment"
requires-python = ">=3.10"
dependencies = [
    "torch>=2.0.0",
    "gymnasium[box2d]>=0.29.0",
    "numpy>=1.24.0",
    "matplotlib>=3.7.0",
    "tensorboard>=2.14.0",
    "opencv-python>=4.8.0",
]

[project.optional-dependencies]
dev = ["pytest>=7.4.0", "black>=23.0.0", "ruff>=0.1.0"]

[project.scripts]
ppo-train = "train:main"
ppo-evaluate = "src.evaluate:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.ruff]
line-length = 100
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "I", "N", "W"]
ignore = ["E501"]

[tool.black]
line-length = 100
target-version = ["py310"]

[tool.hatch.build.targets.wheel]
packages = ["src"]
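With hatchling as the build backend and packages = ["src"], an editable install (e.g. pip install -e ".[dev]" to pull in the dev extras) should expose the ppo-train and ppo-evaluate console scripts; ppo-train points at train:main, the entry point this commit adds to the training script below.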
@@ -1,4 +1,8 @@
 """Evaluation script for trained PPO agent."""
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import torch
 import numpy as np
 import gymnasium as gym
@@ -1,4 +1,5 @@
 """Neural network architectures for Actor and Critic."""
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -9,7 +10,11 @@ class Actor(nn.Module):
 
     def __init__(self, state_shape=(84, 84, 4), action_dim=3):
         super().__init__()
-        c, h, w = state_shape[2], state_shape[0], state_shape[1]  # channels, height, width
+        c, h, w = (
+            state_shape[2],
+            state_shape[0],
+            state_shape[1],
+        )  # channels, height, width

         self.conv = nn.Sequential(
             nn.Conv2d(c, 32, kernel_size=8, stride=4),
@@ -20,8 +25,10 @@ class Actor(nn.Module):
             nn.ReLU(),
         )

-        # Calculate feature map size: 84x84 -> 20x20 after conv layers
-        feat_size = 64 * 20 * 20
+        out_h = (h - 8) // 4 + 1
+        out_h = (out_h - 4) // 2 + 1
+        out_h = (out_h - 3) // 1 + 1
+        feat_size = 64 * out_h * out_h

         self.fc = nn.Sequential(
             nn.Linear(feat_size, 512),
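The old hardcoded feat_size assumed a 20x20 feature map, but that is only the output of the first conv layer. Applying the conv output formula out = (in - kernel) // stride + 1 through the full stack (kernel/stride pairs 8/4, 4/2, 3/1, as the three formulas above imply) gives, for an 84x84 input:

    (84 - 8) // 4 + 1   # -> 20 after the first conv
    (20 - 4) // 2 + 1   # -> 9  after the second conv
    (9 - 3) // 1 + 1    # -> 7  after the third conv
    64 * 7 * 7          # -> 3136, vs. the old 64 * 20 * 20 = 25600

so the old value would have produced a shape mismatch at the first Linear layer for 84x84 inputs, and computing it also keeps the networks correct for other square input sizes.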
@@ -62,17 +69,16 @@ class Critic(nn.Module):
             nn.ReLU(),
         )

-        feat_size = 64 * 20 * 20
+        out_h = (h - 8) // 4 + 1
+        out_h = (out_h - 4) // 2 + 1
+        out_h = (out_h - 3) // 1 + 1
+        feat_size = 64 * out_h * out_h

-        self.fc = nn.Sequential(
-            nn.Linear(feat_size, 512),
-            nn.ReLU(),
-            nn.Linear(512, 1)
-        )
+        self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.ReLU(), nn.Linear(512, 1))

     def forward(self, x):
         """Forward pass returning V(s)."""
         x = x / 255.0
         x = self.conv(x)
         x = x.view(x.size(0), -1)
         return self.fc(x)
@@ -15,7 +15,7 @@ class RolloutBuffer:
         self.rewards = np.zeros(buffer_size, dtype=np.float32)
         self.dones = np.zeros(buffer_size, dtype=np.bool_)
         self.values = np.zeros(buffer_size, dtype=np.float32)
-        self.log_probs = np.zeros((buffer_size, action_dim), dtype=np.float32)
+        self.log_probs = np.zeros(buffer_size, dtype=np.float32)

     def add(self, state, action, reward, done, value, log_prob):
         """Add a transition to the buffer."""
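Storing one float per transition is the right shape here: for a factorized Gaussian policy the joint log-density is the sum of the per-dimension terms, so log pi(a|s) is a single scalar even with a 3-dimensional action. A small sketch of the shape logic (action_dim=3 as in the networks above):

    import torch

    dist = torch.distributions.Normal(torch.zeros(3), torch.ones(3))
    a = dist.sample()                    # shape (3,)
    log_pi = dist.log_prob(a).sum(-1)    # per-dimension log-probs summed -> scalar
    assert log_pi.dim() == 0

The old (buffer_size, action_dim) layout stored the unsummed per-dimension values, which no longer matches what the trainer compares against.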
@@ -56,8 +56,8 @@ class PPOTrainer:
         # Normalize advantages
         advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

-        # Convert to tensors
-        states_t = torch.from_numpy(states).float().to(self.device)
+        # Convert to tensors (states: N, H, W, C -> N, C, H, W)
+        states_t = torch.from_numpy(states).float().permute(0, 3, 1, 2).to(self.device)
         actions_t = torch.from_numpy(actions).float().to(self.device)
         log_probs_old_t = torch.from_numpy(log_probs_old).float().to(self.device)
         returns_t = torch.from_numpy(returns).float().to(self.device)
@@ -75,16 +75,13 @@ class PPOTrainer:
             for batch in loader:
                 s, a, log_pi_old, ret, adv = batch

-                # Get current policy distribution
                 mu, std = self.actor(s)
                 dist = torch.distributions.Normal(mu, std)
-                log_pi = dist.log_prob(a).sum(dim=-1, keepdim=True)
-                entropy = dist.entropy().sum(dim=-1, keepdim=True)
+                log_pi = dist.log_prob(a).sum(dim=-1)
+                entropy = dist.entropy().sum(dim=-1)

-                # Probability ratio
                 ratio = torch.exp(log_pi - log_pi_old)

-                # Clipped surrogate objective
                 surr1 = ratio * adv
                 surr2 = torch.clamp(ratio, 1 - self.clip_eps, 1 + self.clip_eps) * adv
                 actor_loss = -torch.min(surr1, surr2).mean()
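For reference, this loop implements the standard PPO clipped surrogate (Schulman et al., 2017):

    r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}
                = \exp\bigl(\log \pi_\theta(a_t \mid s_t) - \log \pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)\bigr)

    L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Bigl[\min\bigl(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta),\, 1 - \epsilon,\, 1 + \epsilon)\,\hat{A}_t\bigr)\Bigr]

with actor_loss = -L^CLIP minimized by gradient descent. Dropping keepdim=True matters together with the buffer fix: log_pi now has shape (B,) and matches log_pi_old, whereas a (B, 1) log_pi against a flat (B,) log_pi_old would broadcast the ratio to (B, B) and silently corrupt the surrogate.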
@@ -30,7 +30,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
         value = critic(obs_t).squeeze(0).item()

         action_np = action.squeeze(0).cpu().numpy()
-        log_prob_np = log_prob.squeeze(0).cpu().numpy()
+        log_prob_np = log_prob.squeeze(0).cpu().numpy().sum()

         next_obs, reward, terminated, truncated, _ = env.step(action_np)
         done = terminated or truncated
@@ -46,6 +46,8 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
             obs, _ = env.reset()
             obs = np.transpose(obs, (1, 2, 0))

+    return obs
+

 def train(
     total_steps=500000,
@@ -102,10 +104,8 @@ def train(
     recent_rewards = []

     while total_timesteps < total_steps:
-        # Collect rollout
-        collect_rollout(actor, critic, env, buffer, device, rollout_steps)
+        obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)

-        # Get last value for GAE
         with torch.no_grad():
             obs_t = torch.from_numpy(obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
             last_value = critic(obs_t).squeeze(0).item()
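Returning the final observation from collect_rollout is what makes this bootstrap well-defined: GAE needs the value of the state the rollout stopped in,

    \delta_t = r_t + \gamma\,(1 - d_t)\,V(s_{t+1}) - V(s_t), \qquad
    \hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l\, \delta_{t+l}

where V(s_T) for the last residual comes from last_value. Previously the return value was discarded, so the obs used here did not correspond to the rollout's final frame.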
@@ -190,3 +190,13 @@ if __name__ == "__main__":
 
     device = get_device()
     train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--steps", type=int, default=500000, help="Total training steps")
+    parser.add_argument("--rollout", type=int, default=2048, help="Rollout buffer size")
+    args = parser.parse_args()
+
+    device = get_device()
+    train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
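The new main() wrapper gives the ppo-train console script from pyproject.toml a callable target (train:main), so the same arguments work via either entry point, e.g. (assuming the training module is the top-level train.py that train:main refers to):

    python train.py --steps 500000 --rollout 2048
    ppo-train --steps 500000 --rollout 2048   # after an editable install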
New image file added (127 KiB; binary not shown).