fix(ppo): correct log-prob dimensions and state tensor format

Fix the shape of log_probs in the rollout buffer, from (buffer_size, action_dim) to (buffer_size,)
Convert the state tensor layout at training time from (N, H, W, C) to (N, C, H, W)
Update collect_rollout to return the final observation and fix the log_prob computation
Add a project configuration file and a training-curve plotting script
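The two shape fixes are related: with a diagonal Gaussian policy, Normal.log_prob(action) returns one value per action dimension, so the joint log-probability of a transition is the sum over that axis and should be stored as a single scalar per step; likewise, the environment emits image observations as (H, W, C) while nn.Conv2d expects (N, C, H, W). A minimal sketch of both conventions (shapes assumed from the CarRacing setup: 84x84x4 stacked frames, 3-dimensional actions):

import torch

# Diagonal Gaussian policy output for a batch of 5 states, 3 action dims
mu, std = torch.zeros(5, 3), torch.ones(5, 3)
dist = torch.distributions.Normal(mu, std)
action = dist.sample()                        # (5, 3)

# Joint log-prob per transition: sum over action dims -> one scalar per step,
# matching a log_probs buffer of shape (buffer_size,)
log_prob = dist.log_prob(action).sum(dim=-1)  # (5,)

# Observations arrive as (N, H, W, C); Conv2d wants (N, C, H, W)
obs_nhwc = torch.zeros(5, 84, 84, 4)
obs_nchw = obs_nhwc.permute(0, 3, 1, 2)       # (5, 4, 84, 84)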
2026-04-30 20:30:40 +08:00
parent d353133b31
commit b32490ae03
19 changed files with 185 additions and 22 deletions
@@ -0,0 +1,107 @@
"""Generate training plots from TensorBoard logs."""
import os
import numpy as np
from tensorboard.backend.event_processing import event_accumulator
import matplotlib.pyplot as plt
def extract_metrics(log_dir):
"""Extract metrics from TensorBoard log directory."""
ea = event_accumulator.EventAccumulator(log_dir)
ea.Reload()
metrics = {}
for tag in ea.Tags()['scalars']:
events = ea.Scalars(tag)
steps = [e.step for e in events]
values = [e.value for e in events]
metrics[tag] = {'steps': steps, 'values': values}
return metrics
def smooth(data, weight=0.6):
"""Exponential moving average for smoothing."""
smoothed = []
last = data[0]
for point in data:
smoothed_val = last * weight + (1 - weight) * point
smoothed.append(smoothed_val)
last = smoothed_val
return smoothed
def plot_training_curves(metrics, save_path):
"""Plot training curves."""
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
episodes = metrics.get('Reward/Episode', {}).get('steps', [])
ep_rewards = metrics.get('Reward/Episode', {}).get('values', [])
avg_rewards = metrics.get('Reward/AvgLast10', {}).get('values', [])
if episodes and ep_rewards:
axes[0, 0].plot(episodes, ep_rewards, alpha=0.3, label='Episode Reward')
if avg_rewards:
axes[0, 0].plot(episodes, smooth(avg_rewards), 'r-', linewidth=2, label='Smoothed (EMA)')
axes[0, 0].set_xlabel('Training Steps')
axes[0, 0].set_ylabel('Episode Reward')
axes[0, 0].set_title('Training Episode Reward')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
eval_steps = metrics.get('Eval/MeanReturn', {}).get('steps', [])
eval_returns = metrics.get('Eval/MeanReturn', {}).get('values', [])
if eval_steps and eval_returns:
axes[0, 1].plot(eval_steps, eval_returns, 'g-', linewidth=2, marker='o', markersize=4)
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Mean Evaluation Return')
axes[0, 1].set_title('Evaluation Performance')
axes[0, 1].grid(True, alpha=0.3)
actor_loss_steps = metrics.get('Loss/Actor', {}).get('steps', [])
actor_losses = metrics.get('Loss/Actor', {}).get('values', [])
if actor_loss_steps and actor_losses:
axes[1, 0].plot(actor_loss_steps, smooth(actor_losses), 'b-', linewidth=1.5)
axes[1, 0].set_xlabel('Training Steps')
axes[1, 0].set_ylabel('Actor Loss')
axes[1, 0].set_title('Actor Loss Over Training')
axes[1, 0].grid(True, alpha=0.3)
critic_loss_steps = metrics.get('Loss/Critic', {}).get('steps', [])
critic_losses = metrics.get('Loss/Critic', {}).get('values', [])
if critic_loss_steps and critic_losses:
axes[1, 1].plot(critic_loss_steps, smooth(critic_losses), 'purple', linewidth=1.5)
axes[1, 1].set_xlabel('Training Steps')
axes[1, 1].set_ylabel('Critic Loss')
axes[1, 1].set_title('Critic Loss Over Training')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(save_path, dpi=150, bbox_inches='tight')
plt.close()
print(f"Plots saved to {save_path}")
def main():
log_base = 'logs/tensorboard'
runs = sorted([d for d in os.listdir(log_base) if os.path.isdir(os.path.join(log_base, d))])
if not runs:
print("No runs found!")
return
latest_run = os.path.join(log_base, runs[-1])
print(f"Analyzing run: {runs[-1]}")
metrics = extract_metrics(latest_run)
plot_training_curves(metrics, 'training_curves.png')
print("\nExtracted metrics:")
for tag, data in metrics.items():
if data['values']:
values = np.array(data['values'])
print(f" {tag}: min={values.min():.2f}, max={values.max():.2f}, final={values[-1]:.2f}")
if __name__ == '__main__':
main()
@@ -0,0 +1,39 @@
[project]
name = "ppo-carracing"
version = "0.1.0"
description = "PPO (Proximal Policy Optimization) for CarRacing-v3 environment"
requires-python = ">=3.10"
dependencies = [
"torch>=2.0.0",
"gymnasium[box2d]>=0.29.0",
"numpy>=1.24.0",
"matplotlib>=3.7.0",
"tensorboard>=2.14.0",
"opencv-python>=4.8.0",
]
[project.optional-dependencies]
dev = ["pytest>=7.4.0", "black>=23.0.0", "ruff>=0.1.0"]
[project.scripts]
ppo-train = "train:main"
ppo-evaluate = "src.evaluate:main"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.ruff]
line-length = 100
target-version = "py310"
[tool.ruff.lint]
select = ["E", "F", "I", "N", "W"]
ignore = ["E501"]
[tool.black]
line-length = 100
target-version = ["py310"]
[tool.hatch.build.targets.wheel]
packages = ["src"]
@@ -1,4 +1,8 @@
"""Evaluation script for trained PPO agent."""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import torch
import numpy as np
import gymnasium as gym
@@ -1,4 +1,5 @@
"""Neural network architectures for Actor and Critic."""
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -9,7 +10,11 @@ class Actor(nn.Module):
def __init__(self, state_shape=(84, 84, 4), action_dim=3):
super().__init__()
c, h, w = state_shape[2], state_shape[0], state_shape[1] # channels, height, width
c, h, w = (
state_shape[2],
state_shape[0],
state_shape[1],
) # channels, height, width
self.conv = nn.Sequential(
nn.Conv2d(c, 32, kernel_size=8, stride=4),
@@ -20,8 +25,10 @@ class Actor(nn.Module):
nn.ReLU(),
)
# Calculate feature map size: 84x84 -> 20x20 after conv layers
feat_size = 64 * 20 * 20
out_h = (h - 8) // 4 + 1
out_h = (out_h - 4) // 2 + 1
out_h = (out_h - 3) // 1 + 1
feat_size = 64 * out_h * out_h
self.fc = nn.Sequential(
nn.Linear(feat_size, 512),
@@ -62,13 +69,12 @@ class Critic(nn.Module):
nn.ReLU(),
)
feat_size = 64 * 20 * 20
out_h = (h - 8) // 4 + 1
out_h = (out_h - 4) // 2 + 1
out_h = (out_h - 3) // 1 + 1
feat_size = 64 * out_h * out_h
self.fc = nn.Sequential(
nn.Linear(feat_size, 512),
nn.ReLU(),
nn.Linear(512, 1)
)
self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.ReLU(), nn.Linear(512, 1))
def forward(self, x):
"""Forward pass returning V(s)."""
@@ -15,7 +15,7 @@ class RolloutBuffer:
self.rewards = np.zeros(buffer_size, dtype=np.float32)
self.dones = np.zeros(buffer_size, dtype=np.bool_)
self.values = np.zeros(buffer_size, dtype=np.float32)
self.log_probs = np.zeros((buffer_size, action_dim), dtype=np.float32)
self.log_probs = np.zeros(buffer_size, dtype=np.float32)
def add(self, state, action, reward, done, value, log_prob):
"""Add a transition to the buffer."""
@@ -56,8 +56,8 @@ class PPOTrainer:
# Normalize advantages
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
# Convert to tensors
states_t = torch.from_numpy(states).float().to(self.device)
# Convert to tensors (states: N, H, W, C -> N, C, H, W)
states_t = torch.from_numpy(states).float().permute(0, 3, 1, 2).to(self.device)
actions_t = torch.from_numpy(actions).float().to(self.device)
log_probs_old_t = torch.from_numpy(log_probs_old).float().to(self.device)
returns_t = torch.from_numpy(returns).float().to(self.device)
@@ -75,16 +75,13 @@ class PPOTrainer:
for batch in loader:
s, a, log_pi_old, ret, adv = batch
# Get current policy distribution
mu, std = self.actor(s)
dist = torch.distributions.Normal(mu, std)
log_pi = dist.log_prob(a).sum(dim=-1, keepdim=True)
entropy = dist.entropy().sum(dim=-1, keepdim=True)
log_pi = dist.log_prob(a).sum(dim=-1)
entropy = dist.entropy().sum(dim=-1)
# Probability ratio
ratio = torch.exp(log_pi - log_pi_old)
# Clipped surrogate objective
surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1 - self.clip_eps, 1 + self.clip_eps) * adv
actor_loss = -torch.min(surr1, surr2).mean()
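One reason dropping keepdim=True matters: with the buffer now storing scalar log-probs of shape (batch,), a (batch, 1)-shaped log_pi would broadcast the subtraction into a (batch, batch) matrix and silently corrupt the ratio. A small shape check with dummy values (names mirror the loop above, but the numbers are illustrative):

import torch

batch, action_dim, clip_eps = 64, 3, 0.2
a, adv = torch.randn(batch, action_dim), torch.randn(batch)
log_pi_old = torch.randn(batch)                      # one scalar per transition

dist = torch.distributions.Normal(torch.zeros(batch, action_dim), torch.ones(batch, action_dim))
log_pi = dist.log_prob(a).sum(dim=-1)                # (batch,)

ratio = torch.exp(log_pi - log_pi_old)               # (batch,) -- correct
bad = torch.exp(log_pi.unsqueeze(-1) - log_pi_old)   # (batch, batch) -- what keepdim=True would give

surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * adv
actor_loss = -torch.min(surr1, surr2).mean()
assert ratio.shape == (batch,) and bad.shape == (batch, batch)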
@@ -30,7 +30,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
value = critic(obs_t).squeeze(0).item()
action_np = action.squeeze(0).cpu().numpy()
log_prob_np = log_prob.squeeze(0).cpu().numpy()
log_prob_np = log_prob.squeeze(0).cpu().numpy().sum()
next_obs, reward, terminated, truncated, _ = env.step(action_np)
done = terminated or truncated
@@ -46,6 +46,8 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
obs, _ = env.reset()
obs = np.transpose(obs, (1, 2, 0))
return obs
def train(
total_steps=500000,
@@ -102,10 +104,8 @@ def train(
recent_rewards = []
while total_timesteps < total_steps:
# Collect rollout
collect_rollout(actor, critic, env, buffer, device, rollout_steps)
obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
# Get last value for GAE
with torch.no_grad():
obs_t = torch.from_numpy(obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
last_value = critic(obs_t).squeeze(0).item()
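Returning the final observation from collect_rollout is what makes this bootstrap possible: GAE needs V(s_T) for the state where the rollout stopped, so the last advantage is not computed as if the episode had ended. A generic sketch of the recursion this last_value feeds (standard GAE, not necessarily the exact implementation in this repo; gamma and lam are illustrative defaults):

import numpy as np

def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation; last_value bootstraps a truncated rollout."""
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else values[t + 1]
        next_non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
        gae = delta + gamma * lam * next_non_terminal * gae
        advantages[t] = gae
    returns = advantages + values
    return advantages, returns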
@@ -190,3 +190,13 @@ if __name__ == "__main__":
device = get_device()
train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--steps", type=int, default=500000, help="Total training steps")
parser.add_argument("--rollout", type=int, default=2048, help="Rollout buffer size")
args = parser.parse_args()
device = get_device()
train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
Binary file not shown (new image, 127 KiB).