diff --git a/强化学习个人项目报告/generate_plots.py b/强化学习个人项目报告/generate_plots.py
new file mode 100644
index 0000000..a35d106
--- /dev/null
+++ b/强化学习个人项目报告/generate_plots.py
@@ -0,0 +1,107 @@
+"""Generate training plots from TensorBoard logs."""
+import os
+import numpy as np
+from tensorboard.backend.event_processing import event_accumulator
+import matplotlib.pyplot as plt
+
+def extract_metrics(log_dir):
+    """Extract metrics from TensorBoard log directory."""
+    ea = event_accumulator.EventAccumulator(log_dir)
+    ea.Reload()
+
+    metrics = {}
+    for tag in ea.Tags()['scalars']:
+        events = ea.Scalars(tag)
+        steps = [e.step for e in events]
+        values = [e.value for e in events]
+        metrics[tag] = {'steps': steps, 'values': values}
+
+    return metrics
+
+def smooth(data, weight=0.6):
+    """Exponential moving average for smoothing."""
+    smoothed = []
+    last = data[0]
+    for point in data:
+        smoothed_val = last * weight + (1 - weight) * point
+        smoothed.append(smoothed_val)
+        last = smoothed_val
+    return smoothed
+
+def plot_training_curves(metrics, save_path):
+    """Plot training curves."""
+    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+
+    episodes = metrics.get('Reward/Episode', {}).get('steps', [])
+    ep_rewards = metrics.get('Reward/Episode', {}).get('values', [])
+    avg_rewards = metrics.get('Reward/AvgLast10', {}).get('values', [])
+
+    if episodes and ep_rewards:
+        axes[0, 0].plot(episodes, ep_rewards, alpha=0.3, label='Episode Reward')
+        if avg_rewards:
+            axes[0, 0].plot(episodes, smooth(avg_rewards), 'r-', linewidth=2, label='Smoothed (EMA)')
+        axes[0, 0].set_xlabel('Training Steps')
+        axes[0, 0].set_ylabel('Episode Reward')
+        axes[0, 0].set_title('Training Episode Reward')
+        axes[0, 0].legend()
+        axes[0, 0].grid(True, alpha=0.3)
+
+    eval_steps = metrics.get('Eval/MeanReturn', {}).get('steps', [])
+    eval_returns = metrics.get('Eval/MeanReturn', {}).get('values', [])
+
+    if eval_steps and eval_returns:
+        axes[0, 1].plot(eval_steps, eval_returns, 'g-', linewidth=2, marker='o', markersize=4)
+        axes[0, 1].set_xlabel('Episode')
+        axes[0, 1].set_ylabel('Mean Evaluation Return')
+        axes[0, 1].set_title('Evaluation Performance')
+        axes[0, 1].grid(True, alpha=0.3)
+
+    actor_loss_steps = metrics.get('Loss/Actor', {}).get('steps', [])
+    actor_losses = metrics.get('Loss/Actor', {}).get('values', [])
+
+    if actor_loss_steps and actor_losses:
+        axes[1, 0].plot(actor_loss_steps, smooth(actor_losses), 'b-', linewidth=1.5)
+        axes[1, 0].set_xlabel('Training Steps')
+        axes[1, 0].set_ylabel('Actor Loss')
+        axes[1, 0].set_title('Actor Loss Over Training')
+        axes[1, 0].grid(True, alpha=0.3)
+
+    critic_loss_steps = metrics.get('Loss/Critic', {}).get('steps', [])
+    critic_losses = metrics.get('Loss/Critic', {}).get('values', [])
+
+    if critic_loss_steps and critic_losses:
+        axes[1, 1].plot(critic_loss_steps, smooth(critic_losses), 'purple', linewidth=1.5)
+        axes[1, 1].set_xlabel('Training Steps')
+        axes[1, 1].set_ylabel('Critic Loss')
+        axes[1, 1].set_title('Critic Loss Over Training')
+        axes[1, 1].grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f"Plots saved to {save_path}")
+
+def main():
+    log_base = 'logs/tensorboard'
+
+    runs = sorted([d for d in os.listdir(log_base) if os.path.isdir(os.path.join(log_base, d))])
+
+    if not runs:
+        print("No runs found!")
+        return
+
+    latest_run = os.path.join(log_base, runs[-1])
+    print(f"Analyzing run: {runs[-1]}")
+
+    metrics = extract_metrics(latest_run)
+
+    plot_training_curves(metrics, 'training_curves.png')
+
+    print("\nExtracted metrics:")
+    for tag, data in metrics.items():
+        if data['values']:
+            values = np.array(data['values'])
+            print(f"  {tag}: min={values.min():.2f}, max={values.max():.2f}, final={values[-1]:.2f}")
+
+if __name__ == '__main__':
+    main()
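Note on smooth() above: it is the TensorBoard-style exponential moving average, s_t = weight * s_{t-1} + (1 - weight) * x_t, seeded with the first data point so the first output equals data[0]. A minimal sketch of the recurrence, using made-up numbers rather than project data:

    data = [0.0, 10.0, 10.0, 10.0]
    weight = 0.6
    smoothed, last = [], data[0]
    for point in data:
        last = last * weight + (1 - weight) * point
        smoothed.append(last)
    # smoothed == [0.0, 4.0, 6.4, 7.84]: each step closes 40% of the gap to the raw value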
diff --git a/强化学习个人项目报告/logs/tensorboard/run_1777539820/events.out.tfevents.1777539820.LHY.40076.0 b/强化学习个人项目报告/logs/tensorboard/run_1777539820/events.out.tfevents.1777539820.LHY.40076.0
new file mode 100644
index 0000000..2a0eacd
Binary files /dev/null and b/强化学习个人项目报告/logs/tensorboard/run_1777539820/events.out.tfevents.1777539820.LHY.40076.0 differ
diff --git a/强化学习个人项目报告/logs/tensorboard/run_1777539879/events.out.tfevents.1777539879.LHY.42380.0 b/强化学习个人项目报告/logs/tensorboard/run_1777539879/events.out.tfevents.1777539879.LHY.42380.0
new file mode 100644
index 0000000..9fd14f8
Binary files /dev/null and b/强化学习个人项目报告/logs/tensorboard/run_1777539879/events.out.tfevents.1777539879.LHY.42380.0 differ
diff --git a/强化学习个人项目报告/logs/tensorboard/run_1777539949/events.out.tfevents.1777539949.LHY.7860.0 b/强化学习个人项目报告/logs/tensorboard/run_1777539949/events.out.tfevents.1777539949.LHY.7860.0
new file mode 100644
index 0000000..dd0d63f
Binary files /dev/null and b/强化学习个人项目报告/logs/tensorboard/run_1777539949/events.out.tfevents.1777539949.LHY.7860.0 differ
diff --git a/强化学习个人项目报告/logs/tensorboard/run_1777540009/events.out.tfevents.1777540009.LHY.11952.0 b/强化学习个人项目报告/logs/tensorboard/run_1777540009/events.out.tfevents.1777540009.LHY.11952.0
new file mode 100644
index 0000000..517bc8a
Binary files /dev/null and b/强化学习个人项目报告/logs/tensorboard/run_1777540009/events.out.tfevents.1777540009.LHY.11952.0 differ
diff --git a/强化学习个人项目报告/logs/tensorboard/run_1777540095/events.out.tfevents.1777540095.LHY.5796.0 b/强化学习个人项目报告/logs/tensorboard/run_1777540095/events.out.tfevents.1777540095.LHY.5796.0
new file mode 100644
index 0000000..942bc6c
Binary files /dev/null and b/强化学习个人项目报告/logs/tensorboard/run_1777540095/events.out.tfevents.1777540095.LHY.5796.0 differ
diff --git a/强化学习个人项目报告/logs/tensorboard/run_1777540333/events.out.tfevents.1777540333.LHY.43256.0 b/强化学习个人项目报告/logs/tensorboard/run_1777540333/events.out.tfevents.1777540333.LHY.43256.0
new file mode 100644
index 0000000..89d1f72
Binary files /dev/null and b/强化学习个人项目报告/logs/tensorboard/run_1777540333/events.out.tfevents.1777540333.LHY.43256.0 differ
diff --git a/强化学习个人项目报告/models/ppo_carracing_ep100.pt b/强化学习个人项目报告/models/ppo_carracing_ep100.pt
new file mode 100644
index 0000000..70626cc
Binary files /dev/null and b/强化学习个人项目报告/models/ppo_carracing_ep100.pt differ
diff --git a/强化学习个人项目报告/models/ppo_carracing_ep150.pt b/强化学习个人项目报告/models/ppo_carracing_ep150.pt
new file mode 100644
index 0000000..9bda300
Binary files /dev/null and b/强化学习个人项目报告/models/ppo_carracing_ep150.pt differ
diff --git a/强化学习个人项目报告/models/ppo_carracing_ep200.pt b/强化学习个人项目报告/models/ppo_carracing_ep200.pt
new file mode 100644
index 0000000..054d0ad
Binary files /dev/null and b/强化学习个人项目报告/models/ppo_carracing_ep200.pt differ
diff --git a/强化学习个人项目报告/models/ppo_carracing_ep50.pt b/强化学习个人项目报告/models/ppo_carracing_ep50.pt
new file mode 100644
index 0000000..3012ed8
Binary files /dev/null and b/强化学习个人项目报告/models/ppo_carracing_ep50.pt differ
diff --git a/强化学习个人项目报告/models/ppo_carracing_final.pt b/强化学习个人项目报告/models/ppo_carracing_final.pt
new file mode 100644
index 0000000..e957c49
Binary files /dev/null and b/强化学习个人项目报告/models/ppo_carracing_final.pt differ
diff --git a/强化学习个人项目报告/pyproject.toml b/强化学习个人项目报告/pyproject.toml
new file mode 100644
index 0000000..891e9de
--- /dev/null
+++ b/强化学习个人项目报告/pyproject.toml
@@ -0,0 +1,39 @@
+[project]
+name = "ppo-carracing"
+version = "0.1.0"
+description = "PPO (Proximal Policy Optimization) for CarRacing-v3 environment"
+requires-python = ">=3.10"
+dependencies = [
+    "torch>=2.0.0",
+    "gymnasium[box2d]>=0.29.0",
+    "numpy>=1.24.0",
+    "matplotlib>=3.7.0",
+    "tensorboard>=2.14.0",
+    "opencv-python>=4.8.0",
+]
+
+[project.optional-dependencies]
+dev = ["pytest>=7.4.0", "black>=23.0.0", "ruff>=0.1.0"]
+
+[project.scripts]
+ppo-train = "train:main"
+ppo-evaluate = "src.evaluate:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "N", "W"]
+ignore = ["E501"]
+
+[tool.black]
+line-length = 100
+target-version = ["py310"]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src"]
diff --git a/强化学习个人项目报告/src/evaluate.py b/强化学习个人项目报告/src/evaluate.py
index 1f41d10..27f6561 100644
--- a/强化学习个人项目报告/src/evaluate.py
+++ b/强化学习个人项目报告/src/evaluate.py
@@ -1,4 +1,8 @@
 """Evaluation script for trained PPO agent."""
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import torch
 import numpy as np
 import gymnasium as gym
diff --git a/强化学习个人项目报告/src/network.py b/强化学习个人项目报告/src/network.py
index d89a0ac..b332c12 100644
--- a/强化学习个人项目报告/src/network.py
+++ b/强化学习个人项目报告/src/network.py
@@ -1,4 +1,5 @@
 """Neural network architectures for Actor and Critic."""
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -9,7 +10,11 @@ class Actor(nn.Module):
 
     def __init__(self, state_shape=(84, 84, 4), action_dim=3):
         super().__init__()
-        c, h, w = state_shape[2], state_shape[0], state_shape[1]  # channels, height, width
+        c, h, w = (
+            state_shape[2],
+            state_shape[0],
+            state_shape[1],
+        )  # channels, height, width
 
         self.conv = nn.Sequential(
             nn.Conv2d(c, 32, kernel_size=8, stride=4),
@@ -20,8 +25,10 @@ class Actor(nn.Module):
             nn.ReLU(),
         )
 
-        # Calculate feature map size: 84x84 -> 20x20 after conv layers
-        feat_size = 64 * 20 * 20
+        out_h = (h - 8) // 4 + 1
+        out_h = (out_h - 4) // 2 + 1
+        out_h = (out_h - 3) // 1 + 1
+        feat_size = 64 * out_h * out_h
 
         self.fc = nn.Sequential(
             nn.Linear(feat_size, 512),
@@ -62,17 +69,16 @@ class Critic(nn.Module):
             nn.ReLU(),
         )
 
-        feat_size = 64 * 20 * 20
+        out_h = (h - 8) // 4 + 1
+        out_h = (out_h - 4) // 2 + 1
+        out_h = (out_h - 3) // 1 + 1
+        feat_size = 64 * out_h * out_h
 
-        self.fc = nn.Sequential(
-            nn.Linear(feat_size, 512),
-            nn.ReLU(),
-            nn.Linear(512, 1)
-        )
+        self.fc = nn.Sequential(nn.Linear(feat_size, 512), nn.ReLU(), nn.Linear(512, 1))
 
     def forward(self, x):
         """Forward pass returning V(s)."""
         x = x / 255.0
         x = self.conv(x)
         x = x.view(x.size(0), -1)
-        return self.fc(x)
\ No newline at end of file
+        return self.fc(x)
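The network.py change above replaces a hardcoded feature size with a computed one. The old feat_size = 64 * 20 * 20 was wrong for an 84x84 input: 20x20 is the spatial size after the first conv layer only, while the full stack ends at 7x7. A standalone check of the arithmetic the patch inlines (conv_out is a hypothetical helper, not part of the repo):

    def conv_out(size, kernel, stride):
        # Output side length of a square, unpadded convolution.
        return (size - kernel) // stride + 1

    h = conv_out(84, kernel=8, stride=4)  # 84 -> 20
    h = conv_out(h, kernel=4, stride=2)   # 20 -> 9
    h = conv_out(h, kernel=3, stride=1)   # 9 -> 7
    feat_size = 64 * h * h                # 3136, vs. 25600 from the old 64 * 20 * 20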
diff --git a/强化学习个人项目报告/src/replay_buffer.py b/强化学习个人项目报告/src/replay_buffer.py
index 8080a3e..b6a0004 100644
--- a/强化学习个人项目报告/src/replay_buffer.py
+++ b/强化学习个人项目报告/src/replay_buffer.py
@@ -15,7 +15,7 @@ class RolloutBuffer:
         self.rewards = np.zeros(buffer_size, dtype=np.float32)
         self.dones = np.zeros(buffer_size, dtype=np.bool_)
         self.values = np.zeros(buffer_size, dtype=np.float32)
-        self.log_probs = np.zeros((buffer_size, action_dim), dtype=np.float32)
+        self.log_probs = np.zeros(buffer_size, dtype=np.float32)
 
     def add(self, state, action, reward, done, value, log_prob):
         """Add a transition to the buffer."""
diff --git a/强化学习个人项目报告/src/trainer.py b/强化学习个人项目报告/src/trainer.py
index 880d3c2..19bb18c 100644
--- a/强化学习个人项目报告/src/trainer.py
+++ b/强化学习个人项目报告/src/trainer.py
@@ -56,8 +56,8 @@ class PPOTrainer:
         # Normalize advantages
         advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
 
-        # Convert to tensors
-        states_t = torch.from_numpy(states).float().to(self.device)
+        # Convert to tensors (states: N, H, W, C -> N, C, H, W)
+        states_t = torch.from_numpy(states).float().permute(0, 3, 1, 2).to(self.device)
         actions_t = torch.from_numpy(actions).float().to(self.device)
         log_probs_old_t = torch.from_numpy(log_probs_old).float().to(self.device)
         returns_t = torch.from_numpy(returns).float().to(self.device)
@@ -75,16 +75,13 @@ class PPOTrainer:
             for batch in loader:
                 s, a, log_pi_old, ret, adv = batch
 
-                # Get current policy distribution
                 mu, std = self.actor(s)
                 dist = torch.distributions.Normal(mu, std)
-                log_pi = dist.log_prob(a).sum(dim=-1, keepdim=True)
-                entropy = dist.entropy().sum(dim=-1, keepdim=True)
+                log_pi = dist.log_prob(a).sum(dim=-1)
+                entropy = dist.entropy().sum(dim=-1)
 
-                # Probability ratio
                 ratio = torch.exp(log_pi - log_pi_old)
 
-                # Clipped surrogate objective
                 surr1 = ratio * adv
                 surr2 = torch.clamp(ratio, 1 - self.clip_eps, 1 + self.clip_eps) * adv
                 actor_loss = -torch.min(surr1, surr2).mean()
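The replay_buffer.py and trainer.py changes are one fix: for a diagonal Gaussian policy, the per-dimension log-probs must be summed into a single scalar per sample before forming the PPO ratio, and the buffer must store that scalar. With keepdim=True and a per-dimension buffer, the subtraction below mixed mismatched shapes and broadcasting silently corrupted the ratio. A minimal shape check, assuming standalone tensors rather than the project's networks:

    import torch

    B, action_dim = 4, 3
    dist = torch.distributions.Normal(torch.zeros(B, action_dim), torch.ones(B, action_dim))
    a = dist.sample()

    log_pi = dist.log_prob(a).sum(dim=-1)   # (B,): joint log-prob over the 3 action dims
    log_pi_old = log_pi.detach()            # stand-in for the buffered values, also (B,)
    ratio = torch.exp(log_pi - log_pi_old)  # (B,), one ratio per sample as PPO expects

    # With keepdim=True, log_pi would be (B, 1); (B, 1) - (B,) broadcasts to (B, B),
    # silently turning ratio * adv into a (B, B) matrix instead of a (B,) vector.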
diff --git a/强化学习个人项目报告/train.py b/强化学习个人项目报告/train.py
index 3a605f5..7f7329d 100644
--- a/强化学习个人项目报告/train.py
+++ b/强化学习个人项目报告/train.py
@@ -30,7 +30,7 @@ def collect_rollout(actor, critic, env, buffer, device, rollout_steps):
             value = critic(obs_t).squeeze(0).item()
 
         action_np = action.squeeze(0).cpu().numpy()
-        log_prob_np = log_prob.squeeze(0).cpu().numpy()
+        log_prob_np = log_prob.squeeze(0).cpu().numpy().sum()
 
         next_obs, reward, terminated, truncated, _ = env.step(action_np)
         done = terminated or truncated
@@ -46,6 +46,8 @@
             obs, _ = env.reset()
             obs = np.transpose(obs, (1, 2, 0))
 
+    return obs
+
 
 def train(
     total_steps=500000,
@@ -102,10 +104,8 @@
     recent_rewards = []
 
     while total_timesteps < total_steps:
-        # Collect rollout
-        collect_rollout(actor, critic, env, buffer, device, rollout_steps)
+        obs = collect_rollout(actor, critic, env, buffer, device, rollout_steps)
 
-        # Get last value for GAE
         with torch.no_grad():
             obs_t = torch.from_numpy(obs).float().unsqueeze(0).permute(0, 3, 1, 2).to(device)
             last_value = critic(obs_t).squeeze(0).item()
@@ -190,3 +190,13 @@ if __name__ == "__main__":
 
     device = get_device()
     train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--steps", type=int, default=500000, help="Total training steps")
+    parser.add_argument("--rollout", type=int, default=2048, help="Rollout buffer size")
+    args = parser.parse_args()
+
+    device = get_device()
+    train(total_steps=args.steps, rollout_steps=args.rollout, device=device)
diff --git a/强化学习个人项目报告/training_curves.png b/强化学习个人项目报告/training_curves.png
new file mode 100644
index 0000000..b32a4c1
Binary files /dev/null and b/强化学习个人项目报告/training_curves.png differ
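On the train.py change: collect_rollout now returns the final observation so the training loop can bootstrap V(s_N) for GAE at the rollout boundary; before the fix, the loop's obs was never updated by the rollout, so last_value appears to have been computed from a stale frame. For reference, a sketch of the GAE pass that last_value feeds into; the buffer's actual implementation is not part of this patch, so the name and signature here are assumptions:

    import numpy as np

    def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
        # Backward recursion: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t),
        # A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}, bootstrapped from last_value.
        advantages = np.zeros(len(rewards), dtype=np.float32)
        next_value, next_adv = last_value, 0.0
        for t in reversed(range(len(rewards))):
            mask = 1.0 - float(dones[t])
            delta = rewards[t] + gamma * next_value * mask - values[t]
            advantages[t] = next_adv = delta + gamma * lam * mask * next_adv
            next_value = values[t]
        returns = advantages + np.asarray(values, dtype=np.float32)
        return advantages, returns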