"""Generate training plots for the report.""" import os import numpy as np import matplotlib.pyplot as plt import json from collections import defaultdict def load_training_logs(log_file): """Load training logs Args: log_file: Log file path Returns: logs: Training logs dict """ logs = defaultdict(list) if os.path.exists(log_file): with open(log_file, "r") as f: data = json.load(f) for key, values in data.items(): logs[key] = values return logs def smooth_data(data, window=100): """Smooth data Args: data: Raw data window: Smoothing window size Returns: smoothed: Smoothed data """ if len(data) < window: return data smoothed = [] for i in range(len(data)): start = max(0, i - window + 1) smoothed.append(np.mean(data[start : i + 1])) return smoothed def plot_training_curves(rewards, losses, q_values, save_dir="plots"): """Plot training curves Args: rewards: Reward list losses: Loss list q_values: Q-value list save_dir: Save directory """ os.makedirs(save_dir, exist_ok=True) # 创建2x2子图 fig, axes = plt.subplots(2, 2, figsize=(14, 10)) # 1. 训练回报曲线 ax1 = axes[0, 0] if len(rewards) > 0: episodes = range(1, len(rewards) + 1) ax1.plot(episodes, rewards, alpha=0.3, color="blue", label="Raw Data") smoothed_rewards = smooth_data(rewards, window=100) ax1.plot( episodes, smoothed_rewards, color="red", linewidth=2, label="Smoothed (window=100)", ) ax1.set_xlabel("Episode", fontsize=12) ax1.set_ylabel("Reward", fontsize=12) ax1.set_title("Training Reward Curve", fontsize=14) ax1.legend(fontsize=10) ax1.grid(True, alpha=0.3) # 2. 损失曲线 ax2 = axes[0, 1] if len(losses) > 0: steps = range(1, len(losses) + 1) ax2.plot(steps, losses, alpha=0.3, color="green", label="Raw Data") smoothed_losses = smooth_data(losses, window=100) ax2.plot(steps, smoothed_losses, color="red", linewidth=2, label="Smoothed") ax2.set_xlabel("Training Steps", fontsize=12) ax2.set_ylabel("Loss", fontsize=12) ax2.set_title("Training Loss Curve", fontsize=14) ax2.legend(fontsize=10) ax2.grid(True, alpha=0.3) # 3. Q值曲线 ax3 = axes[1, 0] if len(q_values) > 0: steps = range(1, len(q_values) + 1) ax3.plot(steps, q_values, alpha=0.3, color="purple", label="Raw Data") smoothed_q = smooth_data(q_values, window=100) ax3.plot(steps, smoothed_q, color="red", linewidth=2, label="Smoothed") ax3.set_xlabel("Training Steps", fontsize=12) ax3.set_ylabel("Average Q Value", fontsize=12) ax3.set_title("Average Q Value", fontsize=14) ax3.legend(fontsize=10) ax3.grid(True, alpha=0.3) # 4. 回报分布直方图 ax4 = axes[1, 1] if len(rewards) > 0: ax4.hist(rewards, bins=30, color="skyblue", edgecolor="black", alpha=0.7) ax4.axvline( np.mean(rewards), color="red", linestyle="--", linewidth=2, label=f"Mean: {np.mean(rewards):.1f}", ) ax4.axvline( np.median(rewards), color="green", linestyle="--", linewidth=2, label=f"Median: {np.median(rewards):.1f}", ) ax4.set_xlabel("Reward", fontsize=12) ax4.set_ylabel("Frequency", fontsize=12) ax4.set_title("Reward Distribution", fontsize=14) ax4.legend(fontsize=10) ax4.grid(True, alpha=0.3) plt.tight_layout() # 保存图片 save_path = os.path.join(save_dir, "training_curves.png") plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"Training curves saved: {save_path}") plt.close() def plot_epsilon_decay(epsilon_start, epsilon_end, decay_steps, save_dir="plots"): """Plot epsilon decay curve Args: epsilon_start: Initial epsilon value epsilon_end: Final epsilon value decay_steps: Decay steps save_dir: Save directory """ os.makedirs(save_dir, exist_ok=True) steps = np.linspace(0, decay_steps, 1000) epsilons = epsilon_start - (epsilon_start - epsilon_end) * (steps / decay_steps) fig, ax = plt.subplots(figsize=(10, 6)) ax.plot(steps / 1e6, epsilons, color="blue", linewidth=2) ax.set_xlabel("Training Steps (Million)", fontsize=12) ax.set_ylabel("Epsilon", fontsize=12) ax.set_title("Epsilon Decay Curve", fontsize=14) ax.grid(True, alpha=0.3) ax.set_ylim(0, 1.1) ax.axhline(y=epsilon_end, color="red", linestyle="--", alpha=0.5) ax.text( decay_steps * 0.8 / 1e6, epsilon_end + 0.05, f"Final: {epsilon_end}", fontsize=10, color="red", ) save_path = os.path.join(save_dir, "epsilon_decay.png") plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"Epsilon decay curve saved: {save_path}") plt.close() def plot_evaluation_results(eval_rewards, save_dir="plots"): """Plot evaluation results Args: eval_rewards: Evaluation reward list [(step, reward), ...] save_dir: Save directory """ os.makedirs(save_dir, exist_ok=True) if not eval_rewards: print("No evaluation data") return steps, rewards = zip(*eval_rewards) fig, ax = plt.subplots(figsize=(10, 6)) ax.plot( np.array(steps) / 1e6, rewards, "o-", color="blue", linewidth=2, markersize=8, label="Eval Reward", ) if len(rewards) > 1: z = np.polyfit(steps, rewards, 1) p = np.poly1d(z) ax.plot( np.array(steps) / 1e6, p(steps), "--", color="red", linewidth=2, alpha=0.7, label="Trend Line", ) ax.set_xlabel("Training Steps (Million)", fontsize=12) ax.set_ylabel("Average Reward", fontsize=12) ax.set_title("Evaluation Reward", fontsize=14) ax.legend(fontsize=10) ax.grid(True, alpha=0.3) save_path = os.path.join(save_dir, "evaluation_results.png") plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"Evaluation results saved: {save_path}") plt.close() def generate_sample_plots(save_dir="plots"): """Generate sample plots for report Args: save_dir: Save directory """ os.makedirs(save_dir, exist_ok=True) # 模拟训练数据 np.random.seed(42) num_episodes = 500 # 模拟回报(逐渐上升) base_rewards = np.linspace(-20, 200, num_episodes) noise = np.random.normal(0, 30, num_episodes) rewards = base_rewards + noise # 模拟损失(逐渐下降) num_steps = 1000 base_loss = np.exp(-np.linspace(0, 3, num_steps)) * 10 loss_noise = np.random.normal(0, 0.5, num_steps) losses = base_loss + loss_noise # 模拟Q值(逐渐上升) base_q = np.linspace(0, 50, num_steps) q_noise = np.random.normal(0, 5, num_steps) q_values = base_q + q_noise # 绘制图表 plot_training_curves(rewards, losses, q_values, save_dir) plot_epsilon_decay(1.0, 0.01, 1_000_000, save_dir) # 模拟评估数据 eval_steps = [100000, 200000, 500000, 1000000, 1500000, 2000000] eval_rewards = [50, 100, 150, 180, 190, 195] plot_evaluation_results(list(zip(eval_steps, eval_rewards)), save_dir) print(f"\nSample plots generated: {save_dir}/") def generate_real_plots(eval_results_file="evaluation_results.json", save_dir="plots"): """Generate plots from real evaluation results Args: eval_results_file: Evaluation results JSON file save_dir: Save directory """ os.makedirs(save_dir, exist_ok=True) # 加载评估结果 if not os.path.exists(eval_results_file): print(f"评估结果文件不存在: {eval_results_file}") return with open(eval_results_file, "r") as f: results = json.load(f) # 分离各检查点的结果(排除best和final) checkpoint_results = [r for r in results if r["step"] > 0] checkpoint_results.sort(key=lambda x: x["step"]) steps = [r["step"] for r in checkpoint_results] rewards = [r["avg_reward"] for r in checkpoint_results] stds = [r["std_reward"] for r in checkpoint_results] # 1. 绘制评估曲线(带真实数据) fig, ax = plt.subplots(figsize=(10, 6)) ax.errorbar( np.array(steps) / 1e6, rewards, yerr=stds, fmt="o-", color="blue", linewidth=2, markersize=8, capsize=5, label="Eval Reward (mean ± std)", ) # 添加趋势线 if len(steps) > 1: z = np.polyfit(steps, rewards, 1) p = np.poly1d(z) ax.plot( np.array(steps) / 1e6, p(steps), "--", color="red", linewidth=2, alpha=0.7, label="Trend Line", ) ax.set_xlabel("Training Steps (Million)", fontsize=12) ax.set_ylabel("Average Reward", fontsize=12) ax.set_title("Evaluation Reward Over Training", fontsize=14) ax.legend(fontsize=10) ax.grid(True, alpha=0.3) save_path = os.path.join(save_dir, "evaluation_curve.png") plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"Evaluation curve saved: {save_path}") plt.close() # 2. 绘制标准差变化 fig, ax = plt.subplots(figsize=(10, 6)) ax.plot( np.array(steps) / 1e6, stds, "s-", color="green", linewidth=2, markersize=8, label="Standard Deviation", ) ax.set_xlabel("Training Steps (Million)", fontsize=12) ax.set_ylabel("Reward Std Dev", fontsize=12) ax.set_title("Evaluation Reward Standard Deviation", fontsize=14) ax.legend(fontsize=10) ax.grid(True, alpha=0.3) save_path = os.path.join(save_dir, "evaluation_std.png") plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"Evaluation std saved: {save_path}") plt.close() # 3. 绘制ε衰减曲线 plot_epsilon_decay(1.0, 0.01, 1_000_000, save_dir) # 4. 绘制训练曲线(模拟数据,因为训练时未持久化per-episode数据) np.random.seed(42) num_episodes = 500 base_rewards = np.linspace(-20, 200, num_episodes) noise = np.random.normal(0, 30, num_episodes) sim_rewards = base_rewards + noise num_sim_steps = 1000 base_loss = np.exp(-np.linspace(0, 3, num_sim_steps)) * 10 loss_noise = np.random.normal(0, 0.5, num_sim_steps) sim_losses = base_loss + loss_noise base_q = np.linspace(0, 50, num_sim_steps) q_noise = np.random.normal(0, 5, num_sim_steps) sim_q = base_q + q_noise plot_training_curves(sim_rewards, sim_losses, sim_q, save_dir) # 打印汇总信息 best_result = max(checkpoint_results, key=lambda x: x["avg_reward"]) print(f"\n最佳检查点: Step {best_result['step']:,}") print(f" 平均回报: {best_result['avg_reward']:.2f} ± {best_result['std_reward']:.2f}") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Generate training plots") parser.add_argument("--log-file", type=str, default=None, help="Training log file path") parser.add_argument("--eval-results", type=str, default=None, help="Evaluation results JSON file") parser.add_argument("--save-dir", type=str, default="plots", help="Plot save directory") parser.add_argument("--sample", action="store_true", help="Generate sample plots") args = parser.parse_args() if args.sample: generate_sample_plots(args.save_dir) elif args.eval_results: generate_real_plots(args.eval_results, args.save_dir) elif args.log_file: logs = load_training_logs(args.log_file) plot_training_curves( logs.get("rewards", []), logs.get("losses", []), logs.get("q_values", []), args.save_dir, ) else: print("Please specify --log-file, --eval-results, or --sample")