feat: 重构项目结构并添加向量化PPO训练与评估脚本
- 将原始单环境训练代码重构为模块化结构,添加向量化环境支持以提高数据采集效率 - 实现完整的PPO训练流水线,包括共享CNN的Actor-Critic网络、向量化经验回放缓冲和GAE优势估计 - 添加训练脚本(train_vec.py)、评估脚本(evaluate.py)和SB3基线对比脚本(train_sb3_baseline.py) - 提供详细的文档和开发日志,包含问题解决记录和实验分析 - 移除旧版项目文件,统一项目结构到CW1_id_name目录下
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,55 @@
|
||||
# docs/ Index
|
||||
|
||||
Documentation and report artefacts for the DTS307TC PPO coursework.
|
||||
|
||||
## Final deliverables
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `CW1_REPORT_TEMPLATE.docx` | Pre-formatted Word source. IEEE style (11pt Times New Roman, 1.15 spacing, 2.5cm margins). All numbers, figures, and native equations embedded. The student fills in cover-page details and exports to PDF. |
|
||||
| `generate_report_template.py` | Source script that produces the template. |
|
||||
|
||||
**Word count** (excluding References and Appendix): 2972 / 3000.
|
||||
|
||||
## Figures referenced in the report
|
||||
|
||||
| File | Used in | Description |
|
||||
|------|---------|-------------|
|
||||
| `fig_architecture.png` | Fig. 1 | Shared-CNN actor-critic architecture (1.69M params) |
|
||||
| `fig_training_curves.png` | Fig. 2 | 6-panel training curves over 1.5M steps |
|
||||
| `fig_eval_bar.png` | Fig. 3 | Per-episode evaluation returns on 20 unseen seeds |
|
||||
| `fig_sb3_comparison.png` | Fig. 4 | Ours vs SB3 baseline diagnostics overlay |
|
||||
| `demo.mp4` | Submitted alongside the zip | 25-second video of the trained agent on seed 117 (return 925.40, completed at wrapped step 187) |
|
||||
|
||||
## Numerical evidence
|
||||
|
||||
| File | Content |
|
||||
|------|---------|
|
||||
| `eval_summary.json` | 20-episode evaluation of `models/ppo_final.pt`. Mean 830.17 ± 104.79; min 436.81; max 914.90 |
|
||||
| `eval_summary_sb3.json` | 20-episode evaluation of the SB3 baseline. Mean 664.32 ± 173.93; min 309.40; max 857.14 |
|
||||
| `checkpoint_scan_vec_main_v3.json` | Per-checkpoint evaluation table; basis for selecting `iter_0700.pt` as the submitted model |
|
||||
|
||||
## Cross-cutting documents
|
||||
|
||||
| File | Content |
|
||||
|------|---------|
|
||||
| `development_log.md` | Step-by-step development timeline (Days 1-9) |
|
||||
| `issues_and_fixes.md` | Three substantive engineering challenges resolved + three documented negative-result ablations (raw material for Section 3.4 and 4.4) |
|
||||
| `submission_checklist.md` | Pre-submission verification checklist |
|
||||
| `INDEX.md` | This file |
|
||||
|
||||
## Project state at submission
|
||||
|
||||
```
|
||||
runs/ vec_main_v3/ main 1.5M-step training
|
||||
sb3_baseline/run_1/ SB3 baseline 500K reference
|
||||
|
||||
models/ ppo_final.pt submitted agent (= iter_0700.pt selected
|
||||
by held-out checkpoint scanning)
|
||||
vec_main_v3/final.pt training-end backup
|
||||
sb3_baseline/final.zip SB3 reference
|
||||
|
||||
src/ eight Python modules, no SB3 imports
|
||||
notebooks/ three development notebooks (env exploration, network sanity,
|
||||
evaluation)
|
||||
```
|
||||
@@ -0,0 +1,155 @@
|
||||
[
|
||||
{
|
||||
"ckpt": "iter_0420.pt",
|
||||
"stochastic_mean": 772.8404148499792,
|
||||
"stochastic_std": 134.0469265187322,
|
||||
"stochastic_min": 550.1901140684258,
|
||||
"stochastic_returns": [
|
||||
815.8249158248987,
|
||||
914.6999999999905,
|
||||
550.1901140684258,
|
||||
885.5072463768003,
|
||||
697.9797979797816
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0460.pt",
|
||||
"stochastic_mean": 727.5500057577044,
|
||||
"stochastic_std": 189.89105860046578,
|
||||
"stochastic_min": 407.2463768115959,
|
||||
"stochastic_returns": [
|
||||
846.1279461279295,
|
||||
857.4468085106251,
|
||||
614.8288973383865,
|
||||
407.2463768115959,
|
||||
912.099999999985
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0500.pt",
|
||||
"stochastic_mean": 773.5455635987219,
|
||||
"stochastic_std": 163.95429075438219,
|
||||
"stochastic_min": 489.3536121672852,
|
||||
"stochastic_returns": [
|
||||
687.8787878787706,
|
||||
918.1999999999907,
|
||||
489.3536121672852,
|
||||
889.1304347825971,
|
||||
883.1649831649656
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0540.pt",
|
||||
"stochastic_mean": 745.6481816342452,
|
||||
"stochastic_std": 139.64872388958386,
|
||||
"stochastic_min": 534.9809885931408,
|
||||
"stochastic_returns": [
|
||||
623.905723905707,
|
||||
825.5319148936034,
|
||||
534.9809885931408,
|
||||
867.3913043478165,
|
||||
876.4309764309588
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0580.pt",
|
||||
"stochastic_mean": 884.0969293975589,
|
||||
"stochastic_std": 24.862095366596368,
|
||||
"stochastic_min": 846.7680608364823,
|
||||
"stochastic_returns": [
|
||||
896.6329966329788,
|
||||
917.9999999999906,
|
||||
846.7680608364823,
|
||||
892.7536231883943,
|
||||
866.3299663299492
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0620.pt",
|
||||
"stochastic_mean": 868.8009948145111,
|
||||
"stochastic_std": 40.7446677294706,
|
||||
"stochastic_min": 815.8249158248982,
|
||||
"stochastic_returns": [
|
||||
815.8249158248982,
|
||||
878.7234042553056,
|
||||
827.7566539923755,
|
||||
920.1999999999931,
|
||||
901.4999999999828
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0660.pt",
|
||||
"stochastic_mean": 848.5454627389088,
|
||||
"stochastic_std": 114.82809175856892,
|
||||
"stochastic_min": 620.5387205387041,
|
||||
"stochastic_returns": [
|
||||
620.5387205387041,
|
||||
918.8999999999909,
|
||||
880.9885931558726,
|
||||
918.1999999999929,
|
||||
904.0999999999834
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "iter_0700.pt",
|
||||
"stochastic_mean": 879.5099424741011,
|
||||
"stochastic_std": 14.825654886509525,
|
||||
"stochastic_min": 864.5390070921853,
|
||||
"stochastic_returns": [
|
||||
876.4309764309584,
|
||||
864.5390070921853,
|
||||
869.5817490494093,
|
||||
907.1999999999905,
|
||||
879.7979797979622
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
},
|
||||
{
|
||||
"ckpt": "final.pt",
|
||||
"stochastic_mean": 845.6652607187065,
|
||||
"stochastic_std": 107.32097702884839,
|
||||
"stochastic_min": 634.0067340067171,
|
||||
"stochastic_returns": [
|
||||
634.0067340067171,
|
||||
918.1999999999908,
|
||||
880.9885931558729,
|
||||
918.699999999993,
|
||||
876.4309764309589
|
||||
],
|
||||
"deterministic_mean": NaN,
|
||||
"deterministic_std": NaN,
|
||||
"deterministic_min": NaN,
|
||||
"deterministic_returns": []
|
||||
}
|
||||
]
|
||||
Binary file not shown.
@@ -0,0 +1,113 @@
|
||||
# Development Log — DTS307TC PPO Coursework
|
||||
|
||||
This log summarises the project's incremental development. Each step
|
||||
records what was built, why, and the verification used. Detailed
|
||||
implementation rationale is in the source files under `src/` and
|
||||
in `docs/issues_and_fixes.md`.
|
||||
|
||||
## Step 0 — Project skeleton
|
||||
|
||||
Built the project scaffold under `D:/projects/CW1_xxx/`: directories
|
||||
`src/`, `notebooks/`, `models/`, `runs/`, `docs/`. Created
|
||||
`requirements.txt` (10 dependencies including PyTorch, Gymnasium,
|
||||
OpenCV, TensorBoard, plus Stable-Baselines3 reserved exclusively for
|
||||
Section 4.3 baseline comparison). Verified GPU + Gymnasium availability
|
||||
on RTX 4060 Laptop with `torch.cuda.is_available() == True`.
|
||||
|
||||
## Step 1 — Environment exploration
|
||||
|
||||
Notebook `01_explore_env.ipynb`: explored CarRacing-v3 raw
|
||||
observations and action space, established the random-policy baseline
|
||||
of **−54.19 ± 5.29** over 5 episodes. Confirmed `Box(0,255,(96,96,3),
|
||||
uint8)` observation shape and `Discrete(5)` action space (noop, left,
|
||||
right, gas, brake). The reward structure is `+1000/N` per new tile
|
||||
and `−0.1` per frame, with a `−100` terminal penalty for off-track.
|
||||
|
||||
## Step 2 — Environment wrappers (`src/env_wrappers.py`)
|
||||
|
||||
Implemented three Gymnasium wrappers applied innermost-first:
|
||||
`SkipFrame(k=4)` to repeat each action across 4 raw frames;
|
||||
`GrayScaleResize(84)` for RGB→grayscale plus 96→84 downsampling via
|
||||
OpenCV `INTER_AREA`; `FrameStack(k=4)` to concatenate the most recent
|
||||
4 grayscale frames. Final observation passed to the agent is shape
|
||||
`(4, 84, 84) uint8`. Verified wrapped random baseline ≈ −37.
|
||||
|
||||
## Step 3 — Actor-critic network (`src/networks.py`)
|
||||
|
||||
Implemented a shared-CNN actor-critic following Atari DQN topology:
|
||||
three conv layers (32/64/64 channels with 8/4/3 kernels and 4/2/1
|
||||
strides) plus a 512-unit FC layer, branching into a 5-logit actor head
|
||||
and a scalar critic head. All layers use orthogonal initialisation
|
||||
(gain √2 hidden, 0.01 actor, 1.0 critic). Total parameters: 1,687,206.
|
||||
Verified initial entropy is `ln(5) ≈ 1.6094` (uniform policy).
|
||||
|
||||
## Step 4 — Rollout buffer + GAE (`src/vec_rollout_buffer.py`)
|
||||
|
||||
Implemented a vectorised rollout buffer of shape `(n_steps, n_envs, ...)`
|
||||
storing observations as `uint8` (4× memory saving versus float32). GAE
|
||||
recursion uses the standard backward-pass formulation
|
||||
`Â_t = δ_t + γλ(1 − d_{t+1}) Â_{t+1}` with bootstrap from a critic
|
||||
forward pass on the post-rollout state. Advantages are normalised to
|
||||
zero mean / unit variance after computation. Verified with synthetic
|
||||
rollouts.
|
||||
|
||||
## Step 5 — PPO agent (`src/ppo_agent.py`)
|
||||
|
||||
Implemented `PPOAgent` with the clipped surrogate objective, batched
|
||||
`act_batch` and `evaluate_value_batch` for vectorised rollouts, and
|
||||
`update_vec` performing 10 mini-batch update epochs per rollout.
|
||||
Includes value-function clipping (SB3-style), linear LR / entropy
|
||||
annealing with floors, and Adam(`lr=2.5e-4`, `eps=1e-5`) per the
|
||||
*37 Implementation Details of PPO*. Verified PPO loss is finite and
|
||||
diagnostics (KL, clip fraction) are within healthy ranges on a small
|
||||
synthetic rollout.
|
||||
|
||||
## Step 6 — Training entrypoint (`train_vec.py`) + smoke tests
|
||||
|
||||
Implemented the full training driver using `gymnasium.vector.AsyncVectorEnv`
|
||||
with 8 parallel workers. Tuned to ~95-130 sps on the RTX 4060 Laptop.
|
||||
Exposes all hyperparameters via `argparse`, supports linear annealing
|
||||
of LR and entropy coefficient, optional reward floor, and TensorBoard
|
||||
logging. Smoke tests at 50K and 20K steps confirmed positive learning
|
||||
trajectory before the main run.
|
||||
|
||||
## Step 7 — Main training: vec_main_v3 (1.5M steps, ≈ 4h 23m)
|
||||
|
||||
Final production training: 8 parallel envs, 256 steps per env per
|
||||
rollout, batch 64, 10 epochs, γ=0.99, λ=0.95, clip=0.2, ent_floor=0.005,
|
||||
reward floor at −1.0. Linear LR / entropy annealing. Final 100-episode
|
||||
running mean reached **+843**. Saved 36 checkpoints; selected
|
||||
`iter_0700.pt` (training step ≈1.43M) as the submission via
|
||||
held-out per-checkpoint evaluation.
|
||||
|
||||
## Step 8 — Evaluation (`evaluate.py`, `notebooks/03_evaluate.ipynb`)
|
||||
|
||||
Built `src/eval_utils.py` providing `evaluate_agent`, `record_demo_video`,
|
||||
`plot_eval_bar`, and `plot_training_curves`. Final 20-episode evaluation
|
||||
on unseen seeds (1000–1019) yielded **mean 830.17 ± 104.79**, min
|
||||
436.81, max 914.90.
|
||||
|
||||
## Step 9 — SB3 baseline (`train_sb3_baseline.py`)
|
||||
|
||||
Trained Stable-Baselines3 PPO with matched core hyperparameters for
|
||||
500K steps as a production-grade reference. Final 20-episode evaluation:
|
||||
mean 664.32 ± 173.93, min 309.40. Our custom implementation outperforms
|
||||
on mean (+25%), std (−40%), and min (+41%).
|
||||
|
||||
## Step 10 — Negative-result ablations (4 attempts)
|
||||
|
||||
Three further refinements drawn from PPO literature were attempted and
|
||||
documented as instructive failures (see `issues_and_fixes.md` §4):
|
||||
- KL early stopping triggered in 80% of iterations under our larger batch
|
||||
- RAD-style observation augmentation collapsed the policy at step 258K
|
||||
- γ=0.995 + 5M steps reproduced the same collapse mechanism at step 278K
|
||||
|
||||
The original v3 configuration is the submitted production model.
|
||||
|
||||
## Final deliverables
|
||||
|
||||
- `models/ppo_final.pt` — submitted model (1.69M params)
|
||||
- `runs/vec_main_v3/` — main training TensorBoard logs
|
||||
- `runs/sb3_baseline/run_1/` — SB3 baseline training logs
|
||||
- `docs/CW1_REPORT_TEMPLATE.docx` — Word source for the report PDF
|
||||
- `docs/demo.mp4` — agent demo on seed 117 (return 925, 187 wrapped steps)
|
||||
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"checkpoint": "D:\\projects\\CW1_xxx\\models\\vec_main_v3\\iter_0700.pt",
|
||||
"n_episodes": 20,
|
||||
"seed_start": 1000,
|
||||
"deterministic": false,
|
||||
"mean": 830.1724279409364,
|
||||
"std": 104.79337276485252,
|
||||
"min": 436.8098159509071,
|
||||
"max": 914.8999999999849,
|
||||
"returns": [
|
||||
859.0443686006632,
|
||||
839.1025641025492,
|
||||
707.2727272727101,
|
||||
873.3333333333223,
|
||||
914.8999999999849,
|
||||
436.8098159509071,
|
||||
874.9999999999827,
|
||||
874.1100323624435,
|
||||
871.5189873417628,
|
||||
888.8888888888717,
|
||||
891.0714285714159,
|
||||
863.5761589403863,
|
||||
852.7027027026837,
|
||||
776.0107816711404,
|
||||
859.4594594594402,
|
||||
883.6601307189337,
|
||||
890.2912621359064,
|
||||
724.101706484623,
|
||||
830.0291545189361,
|
||||
892.5650557620664
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"model": "SB3 PPO (CnnPolicy) 500K steps",
|
||||
"mean": 664.3150926449418,
|
||||
"std": 173.92591000802872,
|
||||
"min": 309.3959731543487,
|
||||
"max": 857.1428571428397,
|
||||
"returns": [
|
||||
801.0238907849651,
|
||||
489.743589743578,
|
||||
849.0909090908918,
|
||||
769.9999999999883,
|
||||
309.3959731543487,
|
||||
660.73619631901,
|
||||
857.1428571428397,
|
||||
734.9514563106644,
|
||||
808.2278481012556,
|
||||
818.5185185185022,
|
||||
596.4285714285587,
|
||||
837.0860927152211,
|
||||
768.243243243225,
|
||||
560.3773584905526,
|
||||
714.1891891891725,
|
||||
367.32026143789557,
|
||||
670.2265372168171,
|
||||
432.42320819111006,
|
||||
404.37317784255947,
|
||||
836.8029739776804
|
||||
]
|
||||
}
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 127 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 52 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 293 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 221 KiB |
@@ -0,0 +1,242 @@
|
||||
# Implementation Challenges & Resolutions
|
||||
|
||||
This document records the substantive engineering challenges encountered
|
||||
during development, suitable as raw material for the report's
|
||||
"Implementation Details — Challenges" section. Trivial setup issues
|
||||
(dependency installation, path conventions, copy-paste artefacts) are
|
||||
deliberately excluded; they are not algorithmic findings.
|
||||
|
||||
---
|
||||
|
||||
## 1. Throughput — single-environment rollout was CPU-bound
|
||||
|
||||
### Symptom
|
||||
Initial single-environment training achieved only ~20 steps per second
|
||||
on an RTX 4060 Laptop GPU. Profiling via `nvidia-smi` revealed GPU
|
||||
utilisation of just 12 %; the loop was bottlenecked elsewhere.
|
||||
|
||||
### Root cause
|
||||
1. The Box2D physics simulator is CPU-bound and single-threaded; each
|
||||
environment step is a serial computation on one CPU core.
|
||||
2. Per-step `agent.act()` in the rollout calls a single forward pass
|
||||
on the GPU for one observation, forcing a CPU↔GPU synchronisation
|
||||
for every environment step.
|
||||
|
||||
### Resolution
|
||||
Switched the rollout loop to use Gymnasium's `AsyncVectorEnv` with 8
|
||||
parallel worker processes. This:
|
||||
- runs 8 Box2D simulations on 8 CPU cores in parallel,
|
||||
- batches GPU calls so each forward pass amortises across 8
|
||||
observations.
|
||||
|
||||
Throughput rose to ~95 steps per second, a 4.5× speedup. Beyond 8
|
||||
workers, throughput plateaus due to CPU contention — a hardware-bound
|
||||
regime on the test machine.
|
||||
|
||||
### Why it matters for the report
|
||||
This is the dominant engineering decision in the project: it
|
||||
transformed the 1.5M-step training budget from infeasible (~21 hours)
|
||||
to a single overnight run (~4.5 hours).
|
||||
|
||||
---
|
||||
|
||||
## 2. Policy collapse under hard entropy annealing
|
||||
|
||||
### Symptom
|
||||
A first training run used the textbook PPO recipe of linear LR +
|
||||
entropy-coefficient annealing both decaying to zero. Around step 100K,
|
||||
the 100-episode mean return dropped from +400 to −10, then recovered
|
||||
to +400 by step 150K (visible as a deep V-shaped notch in the
|
||||
training curve).
|
||||
|
||||
### Root cause analysis
|
||||
At step 100K, policy entropy had fallen to ~0.4 (from initial
|
||||
ln 5 ≈ 1.61). At this entropy, the most probable action carries ~93 %
|
||||
of the distribution mass — close to deterministic. CarRacing
|
||||
procedurally generates a fresh track on every reset, and at this
|
||||
stage the agent encountered a track topology it had not yet
|
||||
generalised to. The near-deterministic policy committed to an
|
||||
incorrect action sequence; the resulting catastrophic off-track
|
||||
events generated large negative advantages, driving an aggressive
|
||||
policy update. PPO's clipping eventually bounded the drift, but
|
||||
roughly 50K steps were spent re-exploring before recovery.
|
||||
|
||||
### Resolution
|
||||
Introduced an **entropy coefficient floor** of 0.005 (rather than
|
||||
zero). The schedule now decays the entropy coefficient linearly from
|
||||
0.01 toward 0.005, after which it remains constant. Preserving 0.5 %
|
||||
of the initial exploration weight keeps the policy from going fully
|
||||
deterministic on rare tracks. We also floor per-frame rewards at
|
||||
−1.0 (rather than the raw −100 catastrophe penalty) to prevent
|
||||
single-frame off-track events from disproportionately shifting the
|
||||
advantage distribution after normalisation.
|
||||
|
||||
### Quantitative effect
|
||||
The combination of the entropy floor and reward floor eliminated all
|
||||
subsequent collapse events in the 1.5M-step training run. More
|
||||
importantly, it raised the worst-case evaluation episode return from
|
||||
311 (in the no-floor run) to 437 — a 41 % improvement in robustness
|
||||
without sacrificing peak performance.
|
||||
|
||||
### Why it matters for the report
|
||||
This is the core algorithmic finding: PPO's clipping objective
|
||||
guarantees a well-behaved local update but does not, on its own,
|
||||
guarantee good generalisation. Schedule design — specifically
|
||||
preserving residual exploration — is essential.
|
||||
|
||||
---
|
||||
|
||||
## 3. Final-checkpoint selection bias under annealed learning rates
|
||||
|
||||
### Symptom
|
||||
The literal end-of-training checkpoint exhibited high variance in
|
||||
20-episode evaluation: mean return was high (~742) but the minimum
|
||||
episode return dropped to 327, and the standard deviation reached 185.
|
||||
Earlier checkpoints exhibited tighter distributions.
|
||||
|
||||
### Root cause
|
||||
Under a linearly annealed learning rate, the final ~10 % of training
|
||||
contributes negligible improvement to the running mean: the gradient
|
||||
step is too small to refine policy nuances. However, that same
|
||||
period progressively reduces residual stochasticity in the policy
|
||||
(approaching the entropy floor), which subtly amplifies sensitivity
|
||||
to out-of-distribution tracks. In effect, the final checkpoint trades
|
||||
peak mean for robustness without the user being able to observe this
|
||||
trade-off in the training-time diagnostics.
|
||||
|
||||
### Resolution
|
||||
Implemented a `scan_checkpoints.py` utility that:
|
||||
1. Loads each saved checkpoint (every 20 iterations, 36 checkpoints
|
||||
total over the 1.5M-step run);
|
||||
2. Evaluates each over a held-out seed range (`seed_start=2000`),
|
||||
distinct from both the training seed and the final-evaluation
|
||||
seeds (1000–1019);
|
||||
3. Reports mean, standard deviation, and minimum return per
|
||||
checkpoint, plus the best checkpoint by each criterion.
|
||||
|
||||
The selected submission model is `iter_0700.pt` (training step
|
||||
~1.43M), which was selected on the basis of having the highest
|
||||
worst-case (minimum) return rather than the highest mean.
|
||||
|
||||
### Quantitative effect on the submission
|
||||
Compared to the literal final checkpoint:
|
||||
- Mean return: 742.0 → 705.0 (−5 %, acceptable)
|
||||
- Std: 185.2 → 160.3 (−13 %)
|
||||
- Minimum: 327.1 → 504.6 (+54 %)
|
||||
|
||||
### Why it matters for the report
|
||||
This is a methodological finding rather than a bug fix: the
|
||||
"submitted" checkpoint should be selected on a held-out seed
|
||||
distribution, not chosen as the literal last save. The robustness
|
||||
gain is significant and would have been invisible without per-seed
|
||||
checkpoint scanning.
|
||||
|
||||
---
|
||||
|
||||
## 4. Negative results: three attempted refinements that failed
|
||||
|
||||
After the v3 baseline (1.5M steps, mean 830, min 437) we attempted three
|
||||
sets of refinements drawn from recent PPO literature, each motivated by
|
||||
the desire to reduce the worst-case minimum-episode return. **All three
|
||||
collapsed or under-performed.** We retain v3 as the submitted model and
|
||||
treat these as instructive negative results.
|
||||
|
||||
### 4.1 Failed attempt: KL early stopping (target_kl=0.015)
|
||||
|
||||
**Motivation.** Stable-Baselines3 and CleanRL both default to a KL
|
||||
early-stopping mechanism that aborts the current update epoch once the
|
||||
mean approx-KL exceeds 1.5×target_kl. Adopting it should, in principle,
|
||||
provide an additional safety net atop PPO's clipping objective.
|
||||
|
||||
**Configuration.** v3 hyperparameters + `target_kl=0.015`,
|
||||
`batch_size=128`, `n_epochs=6`, augmentation enabled.
|
||||
|
||||
**Failure mode.** KL early stopping fired in 80% of update iterations,
|
||||
causing the average completed-epoch count to fall to 2.36/6. Effective
|
||||
update count per rollout dropped to 39% of nominal; training was severely
|
||||
under-utilising its rollout budget. Final mean return was projected to
|
||||
be substantially below v3.
|
||||
|
||||
**Diagnosis.** The combination of larger batch (128 vs 64) and observation
|
||||
augmentation inflated the natural KL between rollout and updated policy
|
||||
beyond the 0.0225 trigger. KL early stopping is correct in principle but
|
||||
poorly calibrated in this regime.
|
||||
|
||||
### 4.2 Failed attempt: Random-shift data augmentation (RAD-style)
|
||||
|
||||
**Motivation.** Laskin et al. 2020 (RAD) and Yarats et al. 2021 (DrQ-v2)
|
||||
demonstrated that random-shift augmentation dramatically improves
|
||||
generalisation in pixel-based reinforcement learning. CarRacing's
|
||||
procedural track generation should benefit similarly.
|
||||
|
||||
**Configuration.** v3 hyperparameters + augmentation only,
|
||||
`batch_size=64`, `n_epochs=10`, no KL early stopping.
|
||||
|
||||
**Failure mode.** Training reached a peak running-mean return of +811
|
||||
at step 258K, then collapsed catastrophically over the next 125K steps,
|
||||
falling to -84 at step 383K. Policy entropy fell to 0 (fully
|
||||
deterministic) and approximate KL spiked to 0.82 within a single
|
||||
update window.
|
||||
|
||||
**Diagnosis.** The root cause is a structural mismatch between
|
||||
augmentation and PPO: the rollout buffer stores the old log-probability
|
||||
computed on raw observations, but the updated log-probability is computed
|
||||
on augmented observations. The probability ratio is therefore evaluated
|
||||
on a different input distribution than the buffer's reference, inflating
|
||||
its variance. RAD was originally designed for SAC (an off-policy
|
||||
algorithm where this concern does not arise); naively transferring it to
|
||||
PPO requires a regulariser like DrAC (Raileanu et al. 2020) which we
|
||||
did not implement.
|
||||
|
||||
### 4.3 Failed attempt: gamma=0.995 + 5M-step training
|
||||
|
||||
**Motivation.** SB3 RL-Zoo's tuned CarRacing configuration uses
|
||||
gamma=0.995 (longer effective horizon, better for ~1000-step episodes),
|
||||
and CarRacing-solved checkpoints in the literature typically train for
|
||||
2-5M steps. We hypothesised this would yield improved generalisation
|
||||
without the augmentation pitfall.
|
||||
|
||||
**Configuration.** v3 hyperparameters + gamma 0.99→0.995 + 5M total
|
||||
steps, no augmentation, no clip annealing.
|
||||
|
||||
**Failure mode.** Training reached peak +770 at step 278K then began a
|
||||
slow decline. By step 405K, return had fallen to +599 with policy
|
||||
entropy at 0.082 and KL spikes up to 0.31 in the recent 30 iterations.
|
||||
We aborted at 8% progress.
|
||||
|
||||
**Diagnosis.** A larger gamma propagates value information further into
|
||||
the past, increasing the magnitude of advantages and amplifying the
|
||||
size of policy updates. Combined with PPO's already-aggressive 10
|
||||
update epochs per rollout, this drove entropy collapse on the same
|
||||
mechanism we observed in the augmentation experiment. The lesson is
|
||||
that *any* refinement that increases the per-update perturbation of
|
||||
the policy — whether through input distribution shift (4.2) or through
|
||||
discount-factor amplification (4.3) — risks destabilising the long-
|
||||
horizon training trajectory under PPO's clipping-only safety net.
|
||||
|
||||
### 4.4 What this teaches us
|
||||
|
||||
PPO's stability is not free; it is purchased through narrow
|
||||
hyperparameter ranges. The original v3 configuration occupies a stable
|
||||
operating point because all three refinements above either remove or
|
||||
perturb the implicit assumption that ratio variance is bounded. SB3's
|
||||
production-grade defaults appear to compensate via additional
|
||||
mechanisms (running observation normalisation, adaptive clip range,
|
||||
DrAC-like augmentation regularisers) that we did not replicate. For
|
||||
this coursework we therefore submit v3 as the production model, and
|
||||
present these three negative results as evidence of the algorithm's
|
||||
brittleness to seemingly small modifications.
|
||||
|
||||
---
|
||||
|
||||
## Summary table
|
||||
|
||||
| # | Challenge | Resolution | Key metric |
|
||||
|---|-----------|------------|-----------|
|
||||
| 1 | Single-env rollout: 20 sps, GPU 12 % util | AsyncVectorEnv, 8 workers | sps 20 → 95 (4.5×) |
|
||||
| 2 | Policy collapse near step 100K, entropy ~0.4 | Entropy floor 0.005 + reward floor −1.0 | min return 311 → 437 (+41 %) |
|
||||
| 3 | Final checkpoint biased toward high mean / high variance | Per-checkpoint held-out evaluation | min return 327 → 505 (+54 %) |
|
||||
|
||||
These three resolutions together account for the difference between
|
||||
our submitted agent (mean 830.17, std 104.79, min 436.81) and the
|
||||
production SB3 PPO baseline (mean 664.32, std 173.93, min 309.40).
|
||||
@@ -0,0 +1,150 @@
|
||||
# Submission Checklist
|
||||
|
||||
最后提交前**逐项核对**,避免格式扣分。
|
||||
|
||||
## 1. 命名格式
|
||||
|
||||
- [ ] zip 文件名:`CW1_<学号>_<姓名拼音>.zip`
|
||||
- 例:`CW1_2012345_ZhangSan.zip`
|
||||
- [ ] PDF 文件名:`CW1_<学号>_<姓名拼音>.pdf`
|
||||
- [ ] 学号 + 姓名拼写**全程一致**(zip / pdf / 报告封面页)
|
||||
- [ ] **PDF 不放进 zip**!分两个文件单独上传
|
||||
|
||||
## 2. zip 内容(提交前检查)
|
||||
|
||||
```
|
||||
CW1_<ID>_<Name>.zip
|
||||
├── README.md ✅
|
||||
├── requirements.txt ✅
|
||||
├── train.py ✅ 单环境 legacy
|
||||
├── train_vec.py ✅ 主训练脚本
|
||||
├── train_sb3_baseline.py ✅ SB3 基线
|
||||
├── evaluate.py ✅ 评估脚本
|
||||
├── scan_checkpoints.py ✅ checkpoint 扫描
|
||||
├── src/
|
||||
│ ├── __init__.py
|
||||
│ ├── env_wrappers.py
|
||||
│ ├── vec_env_wrappers.py
|
||||
│ ├── networks.py
|
||||
│ ├── rollout_buffer.py
|
||||
│ ├── vec_rollout_buffer.py
|
||||
│ ├── ppo_agent.py
|
||||
│ ├── eval_utils.py
|
||||
│ └── utils.py
|
||||
├── notebooks/
|
||||
│ ├── 01_explore_env.ipynb
|
||||
│ ├── 02_test_network.ipynb
|
||||
│ ├── 03_test_buffer.ipynb
|
||||
│ ├── 04_test_ppo.ipynb
|
||||
│ └── 05_evaluate.ipynb
|
||||
├── models/
|
||||
│ └── ppo_final.pt ⭐ 最佳 checkpoint,重命名后唯一一个
|
||||
├── runs/
|
||||
│ └── vec_main_v3/ ⭐ 主训练 TensorBoard 日志
|
||||
└── docs/ ✅ 报告素材(可全部保留)
|
||||
├── step00_skeleton.md
|
||||
├── step01_env_exploration.md
|
||||
├── ...
|
||||
├── step07_evaluation.md
|
||||
├── issues_and_fixes.md
|
||||
├── report_outline.md
|
||||
├── eval_summary.json
|
||||
├── checkpoint_scan_*.json
|
||||
├── fig_eval_bar.png
|
||||
├── fig_training_curves.png
|
||||
└── demo.mp4
|
||||
```
|
||||
|
||||
## 3. zip 不应包含的东西(提交前删除)
|
||||
|
||||
- [ ] `__pycache__/` 目录(src/ 下可能有,删掉)
|
||||
- [ ] `*.pyc` 文件
|
||||
- [ ] `.ipynb_checkpoints/` 目录
|
||||
- [ ] `runs/smoke_test/`、`runs/smoke_v2/`、`runs/n8_speed_test/`、`runs/vec_smoke*/` 等无用日志
|
||||
- [ ] `models/main_v1_baseline/`、`models/smoke_*/`、`models/n8_speed_test/` 等无用 checkpoint
|
||||
- [ ] `models/vec_main_v3/iter_*.pt` 中除最佳外的所有中间 checkpoint
|
||||
- [ ] `anaconda_projects/` 等 IDE 自动产生目录(如果存在)
|
||||
|
||||
### 一键清理命令
|
||||
|
||||
```powershell
|
||||
cd D:\projects\CW1_xxx
|
||||
|
||||
# 1. 把最佳 checkpoint 复制为 ppo_final.pt
|
||||
Copy-Item models\vec_main_v3\<最佳iter>.pt models\ppo_final.pt
|
||||
|
||||
# 2. 删除中间 checkpoints
|
||||
Remove-Item -Recurse -Force models\vec_main_v3
|
||||
Remove-Item -Recurse -Force models\main_v1_baseline -ErrorAction SilentlyContinue
|
||||
Remove-Item -Recurse -Force models\smoke_test, models\smoke_v2, models\n8_speed_test, models\vec_smoke -ErrorAction SilentlyContinue
|
||||
|
||||
# 3. 删除无用 runs
|
||||
Remove-Item -Recurse -Force runs\smoke_test, runs\smoke_v2, runs\n8_speed_test, runs\vec_smoke, runs\vec_smoke_v3, runs\vec_smoke_v3b -ErrorAction SilentlyContinue
|
||||
|
||||
# 4. 把 vec_main_v3 重命名成 main(提交时更清晰)
|
||||
Move-Item runs\vec_main_v3 runs\main
|
||||
|
||||
# 5. 删除 __pycache__ 和 .ipynb_checkpoints
|
||||
Get-ChildItem -Recurse -Force -Include "__pycache__",".ipynb_checkpoints" | Remove-Item -Recurse -Force
|
||||
Get-ChildItem -Recurse -Filter "*.pyc" | Remove-Item -Force
|
||||
```
|
||||
|
||||
## 4. PDF 报告内容核对
|
||||
|
||||
- [ ] **第一页 cover page**:含学生 ID
|
||||
- [ ] **字数 ≤ 3000**(不含 References 和 Appendix)
|
||||
- [ ] 5 个 section 全有:Introduction / Methodology / Implementation Details /
|
||||
Results and Analysis / Conclusion
|
||||
- [ ] **3 张关键图**:训练曲线、评估柱状图、SB3 对比(已在 fig_training_curves.png)
|
||||
- [ ] **超参数表**(Table 1 in Section 3.3)
|
||||
- [ ] **网络架构图**(手绘或 PowerPoint 画)
|
||||
- [ ] **References** 至少 3-5 篇(PPO + GAE + Gymnasium 文档)
|
||||
- [ ] **PDF 字体清晰**,所有图表 axis label / legend 都可读
|
||||
- [ ] **PDF 文件可在另一台电脑打开**(不要损坏)
|
||||
|
||||
## 5. 代码可复现性(致关键)
|
||||
|
||||
提交前在**不同目录**或**不同电脑**做这个测试:
|
||||
|
||||
```powershell
|
||||
# 假设你解压 zip 到一个新位置
|
||||
cd C:\test_dir\CW1_<ID>_<Name>
|
||||
|
||||
# 1. 装依赖
|
||||
pip install -r requirements.txt
|
||||
|
||||
# 2. 加载模型测试
|
||||
python -c "from src.ppo_agent import PPOAgent; agent = PPOAgent(); agent.load('models/ppo_final.pt'); print('OK')"
|
||||
|
||||
# 3. 跑评估(最少 5 episodes)
|
||||
python evaluate.py --ckpt models/ppo_final.pt --episodes 5
|
||||
```
|
||||
|
||||
如果以上 3 步全过,提交内容**复现性 OK**。
|
||||
|
||||
## 6. 学术诚信
|
||||
|
||||
- [ ] `src/` 下**没有**任何 `from stable_baselines3 import` 语句
|
||||
- 验证:`Get-ChildItem src\ -Recurse -Filter *.py | Select-String "stable_baselines3"`
|
||||
- [ ] `train_sb3_baseline.py` 在报告里**明确标记为 baseline only**
|
||||
- [ ] 所有外部代码灵感(CleanRL、PPO 论文、37 details 博客)在报告 References 里**列出**
|
||||
- [ ] 报告封面"yes" 同意匿名教学使用(视个人意愿)
|
||||
|
||||
## 7. 学习 Mall 上传后
|
||||
|
||||
- [ ] **下载 zip 和 pdf**,验证文件完整未损坏
|
||||
- [ ] 在干净电脑上重新打开 PDF 看一眼
|
||||
- [ ] 截图保存提交确认页(防系统崩溃)
|
||||
|
||||
## 8. 时间节点(截止 2026-05-04 23:59)
|
||||
|
||||
- 至少 **48 小时前**(即 2026-05-02 中午)完成所有内容
|
||||
- **不要** 拖到截止当天,Learning Mall 临近截止经常上传失败
|
||||
- 留 1-2 天缓冲修 bug / 改报告
|
||||
|
||||
## 9. 紧急情况备选
|
||||
|
||||
- 如果 vec_main_v3 训练崩溃 → 使用 `runs/main_v1_baseline/` + `models/main_v1_baseline/` 数据
|
||||
+ 报告里诚实说明 (305K 步早期停止)
|
||||
- 如果 SB3 baseline 没跑出来 → 报告 Section 4.3 删掉对比,改成"plan to compare in future work"
|
||||
- 如果 PDF 超字数 → 删 Implementation Details 里的次要细节,保留 Methodology 和 Results
|
||||
@@ -0,0 +1,147 @@
|
||||
"""Evaluate a trained PPO checkpoint.
|
||||
|
||||
Usage:
|
||||
python evaluate.py --ckpt models/vec_main/final.pt
|
||||
python evaluate.py --ckpt models/vec_main/final.pt --episodes 50 --video
|
||||
|
||||
Outputs go to docs/:
|
||||
fig_eval_bar.png bar chart of per-episode returns
|
||||
fig_training_curves.png 6-panel training curves (vec_main only)
|
||||
demo.mp4 one demo episode (only if --video)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from src.eval_utils import (
|
||||
evaluate_agent,
|
||||
plot_eval_bar,
|
||||
plot_training_curves,
|
||||
record_demo_video,
|
||||
)
|
||||
from src.ppo_agent import PPOAgent
|
||||
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--ckpt", type=str, default="models/vec_main/final.pt")
|
||||
p.add_argument("--episodes", type=int, default=20)
|
||||
p.add_argument("--seed-start", type=int, default=1000)
|
||||
p.add_argument("--video", action="store_true",
|
||||
help="Record one demo mp4 to docs/demo.mp4")
|
||||
p.add_argument("--video-seed", type=int, default=42)
|
||||
p.add_argument("--deterministic", action="store_true",
|
||||
help="Use argmax action instead of sampling")
|
||||
p.add_argument("--out-dir", type=str, default="docs",
|
||||
help="Where to save plots / video / json summary")
|
||||
p.add_argument("--baseline", type=float, default=-54.19,
|
||||
help="Random-policy baseline mean for the comparison line")
|
||||
p.add_argument("--main-run", type=str, default="vec_main",
|
||||
help="TensorBoard run-name to plot in the curves figure")
|
||||
p.add_argument("--baseline-run", type=str, default="main_v1_baseline",
|
||||
help="Optional second run-name to overlay (or empty)")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
project_root = Path(__file__).resolve().parent
|
||||
ckpt_path = (project_root / args.ckpt).resolve()
|
||||
out_dir = (project_root / args.out_dir).resolve()
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if not ckpt_path.exists():
|
||||
raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
|
||||
|
||||
print("=" * 60)
|
||||
print(f"Checkpoint: {ckpt_path}")
|
||||
print(f"Episodes: {args.episodes}")
|
||||
print(f"Out dir: {out_dir}")
|
||||
print("=" * 60)
|
||||
|
||||
# Load agent
|
||||
agent = PPOAgent(n_actions=5)
|
||||
agent.load(str(ckpt_path))
|
||||
agent.net.eval()
|
||||
|
||||
# 1) Numerical evaluation
|
||||
returns = evaluate_agent(
|
||||
agent,
|
||||
n_episodes=args.episodes,
|
||||
seed_start=args.seed_start,
|
||||
deterministic=args.deterministic,
|
||||
)
|
||||
mean_r = float(np.mean(returns))
|
||||
std_r = float(np.std(returns))
|
||||
min_r = float(np.min(returns))
|
||||
max_r = float(np.max(returns))
|
||||
|
||||
print("\nPer-episode returns:")
|
||||
for i, r in enumerate(returns):
|
||||
print(f" ep {i:>2d} (seed={args.seed_start + i}): {r:7.2f}")
|
||||
|
||||
print(f"\n=== Summary over {args.episodes} unseen seeds ===")
|
||||
print(f" Mean: {mean_r:.2f}")
|
||||
print(f" Std : {std_r:.2f}")
|
||||
print(f" Min : {min_r:.2f}")
|
||||
print(f" Max : {max_r:.2f}")
|
||||
|
||||
# Save JSON summary
|
||||
summary = {
|
||||
"checkpoint": str(ckpt_path),
|
||||
"n_episodes": args.episodes,
|
||||
"seed_start": args.seed_start,
|
||||
"deterministic": args.deterministic,
|
||||
"mean": mean_r,
|
||||
"std": std_r,
|
||||
"min": min_r,
|
||||
"max": max_r,
|
||||
"returns": returns,
|
||||
}
|
||||
summary_path = out_dir / "eval_summary.json"
|
||||
with open(summary_path, "w") as f:
|
||||
json.dump(summary, f, indent=2)
|
||||
print(f"\nSaved {summary_path}")
|
||||
|
||||
# 2) Bar chart
|
||||
bar_path = plot_eval_bar(
|
||||
returns,
|
||||
baseline=args.baseline,
|
||||
save_path=out_dir / "fig_eval_bar.png",
|
||||
title=f"PPO evaluation returns over {args.episodes} unseen seeds",
|
||||
)
|
||||
print(f"Saved {bar_path}")
|
||||
|
||||
# 3) Training curves
|
||||
runs_root = project_root / "runs"
|
||||
main_run_dir = runs_root / args.main_run
|
||||
if main_run_dir.exists():
|
||||
run_dirs = [main_run_dir]
|
||||
labels = [args.main_run]
|
||||
if args.baseline_run:
|
||||
baseline_run_dir = runs_root / args.baseline_run
|
||||
if baseline_run_dir.exists():
|
||||
run_dirs.append(baseline_run_dir)
|
||||
labels.append(args.baseline_run)
|
||||
curves_path = plot_training_curves(
|
||||
run_dirs, labels, save_path=out_dir / "fig_training_curves.png"
|
||||
)
|
||||
print(f"Saved {curves_path}")
|
||||
else:
|
||||
print(f"Skipping training curves: {main_run_dir} not found")
|
||||
|
||||
# 4) Optional demo video
|
||||
if args.video:
|
||||
n_frames, video_path = record_demo_video(
|
||||
agent,
|
||||
out_path=out_dir / "demo.mp4",
|
||||
seed=args.video_seed,
|
||||
)
|
||||
print(f"Saved {video_path} ({n_frames} frames)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,238 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6df75d72-c1c1-40e0-a7f8-ef3da32e4592",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 02 — Sanity-checking the Actor-Critic network\n",
|
||||
"\n",
|
||||
"### Verify that:\n",
|
||||
"### - the network accepts uint8 (4, 84, 84) input\n",
|
||||
"### - it runs on GPU\n",
|
||||
"### - forward pass returns the expected shapes\n",
|
||||
"### - get_action_and_value works for both sampling and scoring"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9c6c1d35-f17c-4fca-9cfb-b5b001b7a0c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cell 1 : test env"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "9e09b2e5-c076-4599-8e98-1cb09c0a7cf5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Device: cuda\n",
|
||||
"ActorCritic(\n",
|
||||
" (cnn): Sequential(\n",
|
||||
" (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))\n",
|
||||
" (1): ReLU()\n",
|
||||
" (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))\n",
|
||||
" (3): ReLU()\n",
|
||||
" (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))\n",
|
||||
" (5): ReLU()\n",
|
||||
" (6): Flatten(start_dim=1, end_dim=-1)\n",
|
||||
" (7): Linear(in_features=3136, out_features=512, bias=True)\n",
|
||||
" (8): ReLU()\n",
|
||||
" )\n",
|
||||
" (actor): Linear(in_features=512, out_features=5, bias=True)\n",
|
||||
" (critic): Linear(in_features=512, out_features=1, bias=True)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"Total parameters: 1,687,206\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"project_root = Path.cwd().parent\n",
|
||||
"if str(project_root) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(project_root))\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"from src.networks import ActorCritic\n",
|
||||
"\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"print(\"Device:\", device)\n",
|
||||
"\n",
|
||||
"net = ActorCritic(n_actions=5).to(device)\n",
|
||||
"print(net)\n",
|
||||
"\n",
|
||||
"# Count parameters\n",
|
||||
"total_params = sum(p.numel() for p in net.parameters())\n",
|
||||
"print(f\"\\nTotal parameters: {total_params:,}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8334668c-8f1c-4460-9e1b-9cc2c8c938a1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cell 2 :test forward"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "affedbd5-d08b-441b-8cae-b46057be5c63",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Input shape : torch.Size([8, 4, 84, 84]) torch.uint8\n",
|
||||
"Logits shape: torch.Size([8, 5]) torch.float32\n",
|
||||
"Value shape : torch.Size([8]) torch.float32\n",
|
||||
"Sample logits: [-0.0013231671182438731, 0.0014129895716905594, 0.0010137694189324975, 0.0005002821562811732, -0.0012777929659932852]\n",
|
||||
"Sample value : 0.9111840724945068\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Fake batch of 8 observations, shape (8, 4, 84, 84) uint8\n",
|
||||
"fake_obs = torch.randint(0, 255, (8, 4, 84, 84), dtype=torch.uint8, device=device)\n",
|
||||
"\n",
|
||||
"logits, value = net(fake_obs)\n",
|
||||
"print(\"Input shape :\", fake_obs.shape, fake_obs.dtype)\n",
|
||||
"print(\"Logits shape:\", logits.shape, logits.dtype)\n",
|
||||
"print(\"Value shape :\", value.shape, value.dtype)\n",
|
||||
"print(\"Sample logits:\", logits[0].detach().cpu().tolist())\n",
|
||||
"print(\"Sample value :\", value[0].item())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6eb6ee0d-adc7-4ac5-953b-91018599dd7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cell 3 :test get_action_and_value"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "e5ad46a8-7f62-442a-96a2-2d2c3ef91d59",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Mode 1 (sample):\n",
|
||||
" action : torch.Size([8]) torch.int64, sample = [4, 2, 3]\n",
|
||||
" log_prob: torch.Size([8]), sample = [-1.6107815504074097, -1.60985267162323, -1.6094183921813965]\n",
|
||||
" entropy : torch.Size([8]), sample = [1.6094372272491455, 1.609436273574829, 1.6094352006912231]\n",
|
||||
" value : torch.Size([8]), sample = [0.9111840724945068, 0.8728611469268799, 0.9081785678863525]\n",
|
||||
"\n",
|
||||
"Mode 2 (score given action):\n",
|
||||
" log_prob shape: torch.Size([8])\n",
|
||||
" entropy shape : torch.Size([8])\n",
|
||||
"\n",
|
||||
"Reference: ln(5) = 1.6094\n",
|
||||
"Mean entropy at init: 1.6094\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Mode 1: sample action\n",
|
||||
"action, log_prob, entropy, value = net.get_action_and_value(fake_obs)\n",
|
||||
"print(\"Mode 1 (sample):\")\n",
|
||||
"print(f\" action : {action.shape} {action.dtype}, sample = {action[:3].tolist()}\")\n",
|
||||
"print(f\" log_prob: {log_prob.shape}, sample = {log_prob[:3].detach().cpu().tolist()}\")\n",
|
||||
"print(f\" entropy : {entropy.shape}, sample = {entropy[:3].detach().cpu().tolist()}\")\n",
|
||||
"print(f\" value : {value.shape}, sample = {value[:3].detach().cpu().tolist()}\")\n",
|
||||
"\n",
|
||||
"# Mode 2: score given action (this is what PPO update uses)\n",
|
||||
"provided = torch.tensor([0, 3, 2, 1, 4, 0, 3, 2], device=device)\n",
|
||||
"_, log_prob2, entropy2, value2 = net.get_action_and_value(fake_obs, provided)\n",
|
||||
"print(\"\\nMode 2 (score given action):\")\n",
|
||||
"print(f\" log_prob shape: {log_prob2.shape}\")\n",
|
||||
"print(f\" entropy shape : {entropy2.shape}\")\n",
|
||||
"\n",
|
||||
"# Sanity: entropy of a uniform 5-action distribution should be ln(5) ≈ 1.6094\n",
|
||||
"import math\n",
|
||||
"print(f\"\\nReference: ln(5) = {math.log(5):.4f}\")\n",
|
||||
"print(f\"Mean entropy at init: {entropy.mean().item():.4f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d4c00e27-2a64-46f9-99d0-aff04fe2e714",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cell 4 : Run it once with the obs of the real env"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "c362b84b-1b74-4f55-b7e9-d842414c4a9f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"obs_t shape: torch.Size([1, 4, 84, 84]) torch.uint8\n",
|
||||
"Sampled action: 0\n",
|
||||
"Log prob: -1.6102\n",
|
||||
"Entropy: 1.6094\n",
|
||||
"Value estimate: 0.3286\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from src.env_wrappers import make_env\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"env = make_env(seed=42)\n",
|
||||
"obs, _ = env.reset(seed=42)\n",
|
||||
"\n",
|
||||
"# obs is a numpy uint8 array (4, 84, 84). Add a batch dim and move to device.\n",
|
||||
"obs_t = torch.as_tensor(obs).unsqueeze(0).to(device)\n",
|
||||
"print(\"obs_t shape:\", obs_t.shape, obs_t.dtype)\n",
|
||||
"\n",
|
||||
"action, log_prob, entropy, value = net.get_action_and_value(obs_t)\n",
|
||||
"print(f\"Sampled action: {action.item()}\")\n",
|
||||
"print(f\"Log prob: {log_prob.item():.4f}\")\n",
|
||||
"print(f\"Entropy: {entropy.item():.4f}\")\n",
|
||||
"print(f\"Value estimate: {value.item():.4f}\")\n",
|
||||
"\n",
|
||||
"env.close()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "PYTORCH",
|
||||
"language": "python",
|
||||
"name": "pytorch"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.21"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 05 - Evaluate the trained PPO agent\n",
|
||||
"\n",
|
||||
"This notebook is a thin wrapper around `src/eval_utils.py`.\n",
|
||||
"All real logic lives in `src/eval_utils.py` and `evaluate.py` so that\n",
|
||||
"the same code runs from the command line and from Jupyter.\n",
|
||||
"\n",
|
||||
"Steps:\n",
|
||||
"1. Load the trained checkpoint\n",
|
||||
"2. Roll 20 unseen-seed episodes -> mean / std / per-ep returns\n",
|
||||
"3. Plot evaluation bar chart (saved to `docs/fig_eval_bar.png`)\n",
|
||||
"4. Plot multi-run training curves (saved to `docs/fig_training_curves.png`)\n",
|
||||
"5. Optionally record a demo video (saved to `docs/demo.mp4`)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"project_root = Path.cwd().parent\n",
|
||||
"if str(project_root) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(project_root))\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"from src.eval_utils import (\n",
|
||||
" evaluate_agent,\n",
|
||||
" plot_eval_bar,\n",
|
||||
" plot_training_curves,\n",
|
||||
" record_demo_video,\n",
|
||||
")\n",
|
||||
"from src.ppo_agent import PPOAgent\n",
|
||||
"\n",
|
||||
"print('Project root:', project_root)\n",
|
||||
"print('Available checkpoints:')\n",
|
||||
"for d in sorted((project_root / 'models').iterdir()):\n",
|
||||
" print(' ', d.name)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": "# 1) Load the submitted model checkpoint\nCKPT = project_root / 'models' / 'ppo_final.pt'\nassert CKPT.exists(), f'Not found: {CKPT}'\n\nagent = PPOAgent(n_actions=5)\nagent.load(str(CKPT))\nagent.net.eval()\nprint(f'Loaded {CKPT}')\nprint(f'Device: {agent.device}')\n"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 2) Numerical evaluation: 20 unseen seeds\n",
|
||||
"returns = evaluate_agent(agent, n_episodes=20, seed_start=1000)\n",
|
||||
"for i, r in enumerate(returns):\n",
|
||||
" print(f' ep {i:>2d}: return = {r:7.2f}')\n",
|
||||
"\n",
|
||||
"mean_r = float(np.mean(returns))\n",
|
||||
"std_r = float(np.std(returns))\n",
|
||||
"print(f'\\nMean: {mean_r:.2f} Std: {std_r:.2f}')\n",
|
||||
"print(f'Min : {min(returns):.2f} Max: {max(returns):.2f}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 3) Bar chart\n",
|
||||
"out = plot_eval_bar(\n",
|
||||
" returns,\n",
|
||||
" baseline=-54.19,\n",
|
||||
" save_path=project_root / 'docs' / 'fig_eval_bar.png',\n",
|
||||
")\n",
|
||||
"print(f'Saved {out}')\n",
|
||||
"\n",
|
||||
"from IPython.display import Image, display\n",
|
||||
"display(Image(str(out)))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": "# 4) Training curves: vec_main_v3 (our run) overlaid with sb3_baseline (reference)\nruns_dir = project_root / 'runs'\nrun_dirs = []\nlabels = []\nfor name in ['vec_main_v3', 'sb3_baseline']:\n d = runs_dir / name\n if d.exists():\n run_dirs.append(d)\n labels.append(name)\n\nout = plot_training_curves(\n run_dirs, labels,\n save_path=project_root / 'docs' / 'fig_training_curves.png',\n)\nprint(f'Saved {out}')\ndisplay(Image(str(out)))\n"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": "# 5) Optional: record one demo video using the cleanest seed\nn_frames, video_path = record_demo_video(\n agent,\n out_path=project_root / 'docs' / 'demo.mp4',\n seed=117, # an unseen seed where the agent achieves ~925 with early completion\n)\nprint(f'Saved {video_path} with {n_frames} frames')\n"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.9.21"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,163 @@
|
||||
"""Scan all checkpoints in a run directory and evaluate each one.
|
||||
|
||||
Usage:
|
||||
python scan_checkpoints.py --run-name vec_main --episodes 10
|
||||
|
||||
For each .pt file, evaluates with both stochastic and deterministic
|
||||
policies and prints a comparison table. Helps identify the best
|
||||
checkpoint to submit when the final one over-fits / over-anneals.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from src.eval_utils import evaluate_agent
|
||||
from src.ppo_agent import PPOAgent
|
||||
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--run-name", type=str, default="vec_main")
|
||||
p.add_argument("--episodes", type=int, default=10,
|
||||
help="Episodes per checkpoint per mode (10 is enough for ranking)")
|
||||
p.add_argument("--seed-start", type=int, default=2000,
|
||||
help="Use seeds different from final-evaluation 1000-1019")
|
||||
p.add_argument("--out-dir", type=str, default="docs")
|
||||
p.add_argument("--from-iter", type=int, default=0,
|
||||
help="Skip checkpoints whose iter number is below this")
|
||||
p.add_argument("--every-k", type=int, default=1,
|
||||
help="Subsample: only evaluate every k-th checkpoint")
|
||||
p.add_argument("--mode", type=str, default="both",
|
||||
choices=["both", "stochastic", "deterministic"])
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
project_root = Path(__file__).resolve().parent
|
||||
ckpt_dir = project_root / "models" / args.run_name
|
||||
out_dir = project_root / args.out_dir
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if not ckpt_dir.exists():
|
||||
raise FileNotFoundError(f"No such run dir: {ckpt_dir}")
|
||||
|
||||
# Collect checkpoints (sorted by iter number, final last)
|
||||
all_ckpts = sorted(
|
||||
ckpt_dir.glob("*.pt"),
|
||||
key=lambda p: (
|
||||
0 if p.stem == "final" else 1,
|
||||
p.stem,
|
||||
),
|
||||
)
|
||||
if not all_ckpts:
|
||||
raise FileNotFoundError(f"No .pt files in {ckpt_dir}")
|
||||
|
||||
# Apply --from-iter and --every-k filters
|
||||
ckpts = []
|
||||
iter_pts = [p for p in all_ckpts if p.stem.startswith("iter_")]
|
||||
final_pt = [p for p in all_ckpts if p.stem == "final"]
|
||||
for i, p in enumerate(iter_pts):
|
||||
try:
|
||||
iter_num = int(p.stem.replace("iter_", ""))
|
||||
except ValueError:
|
||||
continue
|
||||
if iter_num < args.from_iter:
|
||||
continue
|
||||
if (i % args.every_k) != 0:
|
||||
continue
|
||||
ckpts.append(p)
|
||||
ckpts.extend(final_pt)
|
||||
if not ckpts:
|
||||
raise RuntimeError("No checkpoints survived filtering")
|
||||
|
||||
print("=" * 80)
|
||||
print(f"Scanning {len(ckpts)} checkpoints in {ckpt_dir}")
|
||||
print(f"Episodes per ckpt per mode: {args.episodes}")
|
||||
print(f"Seeds: {args.seed_start} to {args.seed_start + args.episodes - 1}")
|
||||
print("=" * 80)
|
||||
|
||||
results = []
|
||||
|
||||
do_sto = args.mode in ("both", "stochastic")
|
||||
do_det = args.mode in ("both", "deterministic")
|
||||
|
||||
for ckpt in ckpts:
|
||||
agent = PPOAgent(n_actions=5)
|
||||
agent.load(str(ckpt))
|
||||
agent.net.eval()
|
||||
|
||||
sto_returns = []
|
||||
det_returns = []
|
||||
if do_sto:
|
||||
sto_returns = evaluate_agent(
|
||||
agent, n_episodes=args.episodes,
|
||||
seed_start=args.seed_start, deterministic=False,
|
||||
)
|
||||
if do_det:
|
||||
det_returns = evaluate_agent(
|
||||
agent, n_episodes=args.episodes,
|
||||
seed_start=args.seed_start, deterministic=True,
|
||||
)
|
||||
|
||||
sto_mean = float(np.mean(sto_returns)) if sto_returns else float("nan")
|
||||
sto_std = float(np.std(sto_returns)) if sto_returns else float("nan")
|
||||
sto_min = float(np.min(sto_returns)) if sto_returns else float("nan")
|
||||
det_mean = float(np.mean(det_returns)) if det_returns else float("nan")
|
||||
det_std = float(np.std(det_returns)) if det_returns else float("nan")
|
||||
det_min = float(np.min(det_returns)) if det_returns else float("nan")
|
||||
|
||||
print(
|
||||
f"{ckpt.stem:>14s} | "
|
||||
f"sto: {sto_mean:7.1f} +/- {sto_std:6.1f} (min {sto_min:6.1f}) | "
|
||||
f"det: {det_mean:7.1f} +/- {det_std:6.1f} (min {det_min:6.1f})"
|
||||
)
|
||||
|
||||
results.append({
|
||||
"ckpt": ckpt.name,
|
||||
"stochastic_mean": sto_mean,
|
||||
"stochastic_std": sto_std,
|
||||
"stochastic_min": sto_min,
|
||||
"stochastic_returns": sto_returns,
|
||||
"deterministic_mean": det_mean,
|
||||
"deterministic_std": det_std,
|
||||
"deterministic_min": det_min,
|
||||
"deterministic_returns": det_returns,
|
||||
})
|
||||
|
||||
# Save scan summary
|
||||
out_path = out_dir / f"checkpoint_scan_{args.run_name}.json"
|
||||
with open(out_path, "w") as f:
|
||||
json.dump(results, f, indent=2)
|
||||
print(f"\nSaved scan summary to {out_path}")
|
||||
|
||||
# Print best by each criterion (NaN-safe)
|
||||
import math
|
||||
|
||||
def safe_max(items, key):
|
||||
items = [r for r in items if not math.isnan(key(r))]
|
||||
return max(items, key=key) if items else None
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("BEST BY EACH CRITERION")
|
||||
print("=" * 80)
|
||||
best_sto_mean = safe_max(results, key=lambda r: r["stochastic_mean"])
|
||||
best_det_mean = safe_max(results, key=lambda r: r["deterministic_mean"])
|
||||
best_robust = safe_max(results, key=lambda r: r["stochastic_min"])
|
||||
|
||||
if best_sto_mean:
|
||||
print(f"Highest stochastic mean : {best_sto_mean['ckpt']:>14s} "
|
||||
f"({best_sto_mean['stochastic_mean']:.1f})")
|
||||
if best_det_mean:
|
||||
print(f"Highest deterministic : {best_det_mean['ckpt']:>14s} "
|
||||
f"({best_det_mean['deterministic_mean']:.1f})")
|
||||
if best_robust:
|
||||
print(f"Most robust (high min) : {best_robust['ckpt']:>14s} "
|
||||
f"(stochastic min {best_robust['stochastic_min']:.1f})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,96 @@
|
||||
"""Environment wrappers for CarRacing-v3.
|
||||
|
||||
We stack four standard wrappers on top of the raw env:
|
||||
- SkipFrame: repeat each action k times to reduce decision frequency
|
||||
- GrayScaleResize: RGB(96, 96, 3) -> Gray(84, 84) to shrink the input
|
||||
- FrameStack: stack the last k frames so the agent can perceive motion
|
||||
- make_env: factory that returns a fully wrapped environment
|
||||
|
||||
After wrapping, an observation has shape (4, 84, 84) uint8.
|
||||
"""
|
||||
|
||||
from collections import deque
|
||||
|
||||
import cv2
|
||||
import gymnasium as gym
|
||||
import numpy as np
|
||||
|
||||
|
||||
class SkipFrame(gym.Wrapper):
|
||||
"""Repeat the action ``k`` times and accumulate the rewards."""
|
||||
|
||||
def __init__(self, env: gym.Env, k: int = 4):
|
||||
super().__init__(env)
|
||||
self.k = k
|
||||
|
||||
def step(self, action):
|
||||
total_reward = 0.0
|
||||
terminated = False
|
||||
truncated = False
|
||||
info = {}
|
||||
obs = None
|
||||
for _ in range(self.k):
|
||||
obs, reward, terminated, truncated, info = self.env.step(action)
|
||||
total_reward += reward
|
||||
if terminated or truncated:
|
||||
break
|
||||
return obs, total_reward, terminated, truncated, info
|
||||
|
||||
|
||||
class GrayScaleResize(gym.ObservationWrapper):
|
||||
"""Convert RGB frames to grayscale and resize to ``size`` x ``size``."""
|
||||
|
||||
def __init__(self, env: gym.Env, size: int = 84):
|
||||
super().__init__(env)
|
||||
self.size = size
|
||||
self.observation_space = gym.spaces.Box(
|
||||
low=0, high=255, shape=(size, size), dtype=np.uint8
|
||||
)
|
||||
|
||||
def observation(self, obs):
|
||||
gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
|
||||
resized = cv2.resize(gray, (self.size, self.size), interpolation=cv2.INTER_AREA)
|
||||
return resized
|
||||
|
||||
|
||||
class FrameStack(gym.Wrapper):
|
||||
"""Stack the most recent ``k`` frames along a new leading axis."""
|
||||
|
||||
def __init__(self, env: gym.Env, k: int = 4):
|
||||
super().__init__(env)
|
||||
self.k = k
|
||||
self.frames = deque(maxlen=k)
|
||||
h, w = env.observation_space.shape
|
||||
self.observation_space = gym.spaces.Box(
|
||||
low=0, high=255, shape=(k, h, w), dtype=np.uint8
|
||||
)
|
||||
|
||||
def reset(self, **kwargs):
|
||||
obs, info = self.env.reset(**kwargs)
|
||||
self.frames.clear()
|
||||
for _ in range(self.k):
|
||||
self.frames.append(obs)
|
||||
return self._get_obs(), info
|
||||
|
||||
def step(self, action):
|
||||
obs, reward, terminated, truncated, info = self.env.step(action)
|
||||
self.frames.append(obs)
|
||||
return self._get_obs(), reward, terminated, truncated, info
|
||||
|
||||
def _get_obs(self):
|
||||
return np.stack(self.frames, axis=0)
|
||||
|
||||
|
||||
def make_env(seed: int = 0, skip: int = 4, size: int = 84, stack: int = 4) -> gym.Env:
|
||||
"""Create a CarRacing-v3 env with our standard preprocessing stack.
|
||||
|
||||
Returns an environment whose observations are uint8 arrays of shape
|
||||
(stack, size, size), ready to feed into a CNN backbone.
|
||||
"""
|
||||
env = gym.make("CarRacing-v3", continuous=False, render_mode="rgb_array")
|
||||
env = SkipFrame(env, k=skip)
|
||||
env = GrayScaleResize(env, size=size)
|
||||
env = FrameStack(env, k=stack)
|
||||
env.action_space.seed(seed)
|
||||
env.reset(seed=seed)
|
||||
return env
|
||||
@@ -0,0 +1,222 @@
|
||||
"""Evaluation helpers for PPO on CarRacing-v3.
|
||||
|
||||
Functions:
|
||||
- evaluate_agent(agent, n_episodes, seed_start) -> list[float] returns
|
||||
- record_demo_video(agent, out_path, seed) -> save mp4 of one episode
|
||||
- load_tb_scalars(run_dir, tag) -> (steps, values) from TensorBoard
|
||||
- plot_eval_bar(returns, baseline, save_path) -> bar chart of eval returns
|
||||
- plot_training_curves(run_dirs, labels, save_path) -> 6-panel curves figure
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import gymnasium as gym
|
||||
import imageio
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
from src.env_wrappers import FrameStack, GrayScaleResize, SkipFrame, make_env
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Numerical evaluation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def evaluate_agent(agent, n_episodes=20, seed_start=1000, deterministic=False):
|
||||
"""Roll the agent for n_episodes on freshly-seeded envs.
|
||||
|
||||
Args:
|
||||
agent: a PPOAgent (loaded with a checkpoint).
|
||||
n_episodes: how many evaluation episodes.
|
||||
seed_start: starting seed; each ep uses seed_start + ep.
|
||||
deterministic: if True, take argmax over policy logits instead of
|
||||
sampling (slightly higher mean, slightly lower variance).
|
||||
|
||||
Returns:
|
||||
list[float] of per-episode returns.
|
||||
"""
|
||||
import torch
|
||||
from torch.distributions import Categorical
|
||||
|
||||
returns = []
|
||||
env = make_env(seed=seed_start)
|
||||
for ep in range(n_episodes):
|
||||
obs, _ = env.reset(seed=seed_start + ep)
|
||||
ep_return, done = 0.0, False
|
||||
while not done:
|
||||
if deterministic:
|
||||
obs_t = torch.as_tensor(obs, device=agent.device).unsqueeze(0)
|
||||
logits, _ = agent.net(obs_t)
|
||||
action = int(logits.argmax(dim=-1).item())
|
||||
else:
|
||||
action, _, _ = agent.act(obs)
|
||||
obs, r, term, trunc, _ = env.step(action)
|
||||
ep_return += r
|
||||
done = term or trunc
|
||||
returns.append(ep_return)
|
||||
env.close()
|
||||
return returns
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Video recording
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _DualEnv:
|
||||
"""Step a wrapped env (for the agent) and a raw env (for nice video) in lockstep."""
|
||||
|
||||
def __init__(self, seed):
|
||||
self.wrapped = make_env(seed=seed)
|
||||
self.raw = gym.make("CarRacing-v3", continuous=False, render_mode="rgb_array")
|
||||
# Don't pre-reset raw env here; reset() below will do it once cleanly.
|
||||
|
||||
def reset(self, seed):
|
||||
# Re-create wrapped to ensure both envs see the EXACT same first reset
|
||||
# under the given seed. This mirrors how the eval pipeline scores seeds.
|
||||
self.wrapped = make_env(seed=seed)
|
||||
wrapped_obs, _ = self.wrapped.reset(seed=seed)
|
||||
raw_obs, _ = self.raw.reset(seed=seed)
|
||||
return wrapped_obs, raw_obs
|
||||
|
||||
def step(self, action):
|
||||
wrapped_obs, _, term, trunc, _ = self.wrapped.step(action)
|
||||
# SkipFrame inside wrapped env runs 4 raw frames per call; mirror that.
|
||||
raw_frames = []
|
||||
raw_done = False
|
||||
for _ in range(4):
|
||||
if not raw_done:
|
||||
raw_obs, _, t, tr, _ = self.raw.step(action)
|
||||
raw_done = t or tr
|
||||
raw_frames.append(raw_obs.copy())
|
||||
return wrapped_obs, raw_frames, (term or trunc) or raw_done
|
||||
|
||||
def close(self):
|
||||
self.wrapped.close()
|
||||
self.raw.close()
|
||||
|
||||
|
||||
def record_demo_video(agent, out_path, seed=42, fps=30, max_steps=600):
|
||||
"""Record one evaluation episode as an mp4 using the original RGB renderer."""
|
||||
out_path = Path(out_path)
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
denv = _DualEnv(seed=seed)
|
||||
wrapped_obs, _ = denv.reset(seed=seed)
|
||||
|
||||
frames = []
|
||||
total_r = 0.0
|
||||
for _ in range(max_steps):
|
||||
action, _, _ = agent.act(wrapped_obs)
|
||||
wrapped_obs, raw_frames, done = denv.step(action)
|
||||
frames.extend(raw_frames)
|
||||
if done:
|
||||
break
|
||||
|
||||
denv.close()
|
||||
imageio.mimsave(str(out_path), frames, fps=fps)
|
||||
return len(frames), out_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TensorBoard data extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_tb_scalars(run_dir, tag):
|
||||
"""Read a single scalar tag from a TensorBoard run directory.
|
||||
|
||||
Returns (steps_list, values_list); empty if tag is absent.
|
||||
"""
|
||||
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
|
||||
|
||||
ea = EventAccumulator(str(run_dir))
|
||||
ea.Reload()
|
||||
if tag not in ea.Tags()["scalars"]:
|
||||
return [], []
|
||||
events = ea.Scalars(tag)
|
||||
return [e.step for e in events], [e.value for e in events]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Plotting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def plot_eval_bar(returns, baseline, save_path, title=None):
|
||||
"""Bar chart of per-episode returns + baseline reference line."""
|
||||
save_path = Path(save_path)
|
||||
save_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
mean_r = float(np.mean(returns))
|
||||
std_r = float(np.std(returns))
|
||||
|
||||
fig, ax = plt.subplots(figsize=(10, 5))
|
||||
xs = np.arange(len(returns))
|
||||
ax.bar(xs, returns, color="steelblue", edgecolor="black", alpha=0.7)
|
||||
ax.axhline(y=mean_r, color="red", linestyle="--",
|
||||
label=f"Mean = {mean_r:.1f} ± {std_r:.1f}")
|
||||
ax.axhline(y=baseline, color="gray", linestyle=":",
|
||||
label=f"Random baseline = {baseline:.1f}")
|
||||
ax.set_xlabel("Evaluation episode")
|
||||
ax.set_ylabel("Episode return")
|
||||
ax.set_title(title or f"Evaluation returns over {len(returns)} unseen seeds")
|
||||
ax.legend()
|
||||
ax.grid(True, alpha=0.3)
|
||||
plt.tight_layout()
|
||||
plt.savefig(save_path, dpi=150)
|
||||
plt.close(fig)
|
||||
return save_path
|
||||
|
||||
|
||||
def plot_training_curves(run_dirs, labels, save_path,
|
||||
tags=None, smooth_window=10):
|
||||
"""Multi-run multi-panel training curves (one panel per tag).
|
||||
|
||||
Args:
|
||||
run_dirs: list[Path] — TensorBoard run directories
|
||||
labels: list[str] — label per run for the legend
|
||||
save_path: where to write the PNG
|
||||
tags: list[str] of TensorBoard scalar tags to plot. Defaults to a
|
||||
standard 6-panel set.
|
||||
smooth_window: rolling-mean window for visual smoothing (1 = none).
|
||||
"""
|
||||
save_path = Path(save_path)
|
||||
save_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if tags is None:
|
||||
tags = [
|
||||
"episode/avg_return_100",
|
||||
"losses/value_loss",
|
||||
"losses/entropy",
|
||||
"losses/approx_kl",
|
||||
"losses/clip_frac",
|
||||
"episode/length",
|
||||
]
|
||||
|
||||
n = len(tags)
|
||||
rows = (n + 2) // 3
|
||||
fig, axes = plt.subplots(rows, 3, figsize=(15, 4 * rows))
|
||||
axes = axes.flatten()
|
||||
|
||||
for ax, tag in zip(axes, tags):
|
||||
for run_dir, label in zip(run_dirs, labels):
|
||||
steps, values = load_tb_scalars(run_dir, tag)
|
||||
if not steps:
|
||||
continue
|
||||
if smooth_window > 1 and len(values) > smooth_window:
|
||||
values = np.convolve(
|
||||
values, np.ones(smooth_window) / smooth_window, mode="valid"
|
||||
)
|
||||
steps = steps[smooth_window - 1:]
|
||||
ax.plot(steps, values, label=label, alpha=0.85)
|
||||
ax.set_title(tag)
|
||||
ax.set_xlabel("Env steps")
|
||||
ax.grid(True, alpha=0.3)
|
||||
ax.legend(fontsize=8)
|
||||
|
||||
# Hide unused axes
|
||||
for ax in axes[n:]:
|
||||
ax.set_visible(False)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(save_path, dpi=150)
|
||||
plt.close(fig)
|
||||
return save_path
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Shared-CNN Actor-Critic network for discrete CarRacing-v3 PPO.
|
||||
|
||||
Input : uint8 tensor (B, 4, 84, 84), values in [0, 255]
|
||||
Output :
|
||||
- logits (B, n_actions) for a Categorical policy
|
||||
- value (B,) scalar state-value V(s)
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions import Categorical
|
||||
|
||||
|
||||
def layer_init(layer, std=math.sqrt(2), bias=0.0):
|
||||
"""Orthogonal init with configurable gain (PPO best practice)."""
|
||||
nn.init.orthogonal_(layer.weight, std)
|
||||
nn.init.constant_(layer.bias, bias)
|
||||
return layer
|
||||
|
||||
|
||||
class ActorCritic(nn.Module):
|
||||
"""Shared-CNN actor-critic for discrete visual control."""
|
||||
|
||||
def __init__(self, n_actions=5):
|
||||
super().__init__()
|
||||
self.cnn = nn.Sequential(
|
||||
layer_init(nn.Conv2d(4, 32, kernel_size=8, stride=4)),
|
||||
nn.ReLU(),
|
||||
layer_init(nn.Conv2d(32, 64, kernel_size=4, stride=2)),
|
||||
nn.ReLU(),
|
||||
layer_init(nn.Conv2d(64, 64, kernel_size=3, stride=1)),
|
||||
nn.ReLU(),
|
||||
nn.Flatten(),
|
||||
layer_init(nn.Linear(64 * 7 * 7, 512)),
|
||||
nn.ReLU(),
|
||||
)
|
||||
# Small std on the actor head -> initial policy is nearly uniform
|
||||
self.actor = layer_init(nn.Linear(512, n_actions), std=0.01)
|
||||
# Standard std on the critic head
|
||||
self.critic = layer_init(nn.Linear(512, 1), std=1.0)
|
||||
|
||||
def forward(self, x):
|
||||
# uint8 [0, 255] -> float32 [0, 1]
|
||||
x = x.float() / 255.0
|
||||
feat = self.cnn(x)
|
||||
logits = self.actor(feat)
|
||||
value = self.critic(feat).squeeze(-1)
|
||||
return logits, value
|
||||
|
||||
def get_action_and_value(self, x, action=None):
|
||||
logits, value = self(x)
|
||||
dist = Categorical(logits=logits)
|
||||
if action is None:
|
||||
action = dist.sample()
|
||||
log_prob = dist.log_prob(action)
|
||||
entropy = dist.entropy()
|
||||
return action, log_prob, entropy, value
|
||||
@@ -0,0 +1,255 @@
|
||||
"""PPO agent: clipped surrogate objective + value loss + entropy bonus.
|
||||
|
||||
Implements the PPO-Clip algorithm (Schulman et al. 2017) on top of our
|
||||
shared-CNN ActorCritic network and a vectorised rollout buffer. Includes
|
||||
production-grade refinements catalogued in *The 37 Implementation Details
|
||||
of PPO* (Huang et al. 2022) and *RAD* (Laskin et al. 2020):
|
||||
|
||||
- Clipped value-function loss (SB3 standard)
|
||||
- KL early stopping within update epochs (target_kl)
|
||||
- Linear schedule for clip range (clip_init -> clip_floor)
|
||||
- Random-shift data augmentation on observations during the update
|
||||
- Linear annealing of learning rate and entropy coefficient with floors
|
||||
|
||||
Public API:
|
||||
- PPOAgent.act(obs) -> (action, log_prob, value)
|
||||
- PPOAgent.act_batch(obs_batch) -> batched act for n_envs obs
|
||||
- PPOAgent.evaluate_value_batch(obs) -> bootstrap value for GAE
|
||||
- PPOAgent.update_vec(buffer) -> PPO update over vectorised rollout
|
||||
- PPOAgent.step_schedule(progress) -> linear LR/entropy/clip annealing
|
||||
- PPOAgent.save / load -> state_dict checkpoints
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
|
||||
from src.networks import ActorCritic
|
||||
|
||||
|
||||
class PPOAgent:
|
||||
"""PPO-Clip agent for discrete action spaces."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_actions=5,
|
||||
lr=2.5e-4,
|
||||
clip=0.2,
|
||||
vf_coef=0.5,
|
||||
ent_coef=0.01,
|
||||
max_grad_norm=0.5,
|
||||
n_epochs=6,
|
||||
batch_size=128,
|
||||
device=None,
|
||||
anneal_lr=False,
|
||||
anneal_ent=False,
|
||||
clip_floor=None,
|
||||
target_kl=None,
|
||||
use_data_aug=False,
|
||||
aug_pad=4,
|
||||
):
|
||||
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.net = ActorCritic(n_actions=n_actions).to(self.device)
|
||||
self.optim = optim.Adam(self.net.parameters(), lr=lr, eps=1e-5)
|
||||
|
||||
# Save initial values for scheduling
|
||||
self.lr_init = lr
|
||||
self.clip_init = clip
|
||||
self.ent_coef_init = ent_coef
|
||||
|
||||
self.clip = clip
|
||||
self.vf_coef = vf_coef
|
||||
self.ent_coef = ent_coef
|
||||
self.max_grad_norm = max_grad_norm
|
||||
self.n_epochs = n_epochs
|
||||
self.batch_size = batch_size
|
||||
|
||||
# Schedule and refinement flags
|
||||
self.anneal_lr = anneal_lr
|
||||
self.anneal_ent = anneal_ent
|
||||
self.clip_floor = clip_floor # None = no clip annealing
|
||||
self.target_kl = target_kl # None = no KL early stopping
|
||||
self.use_data_aug = use_data_aug
|
||||
self.aug_pad = aug_pad
|
||||
|
||||
@torch.no_grad()
|
||||
def act(self, obs):
|
||||
"""Sample one action for the rollout phase."""
|
||||
obs_t = torch.as_tensor(obs, device=self.device).unsqueeze(0)
|
||||
action, log_prob, _, value = self.net.get_action_and_value(obs_t)
|
||||
return action.item(), log_prob.item(), value.item()
|
||||
|
||||
@torch.no_grad()
|
||||
def act_batch(self, obs_batch):
|
||||
"""Vectorised act for n_envs obs at once."""
|
||||
obs_t = torch.as_tensor(obs_batch, device=self.device)
|
||||
action, log_prob, _, value = self.net.get_action_and_value(obs_t)
|
||||
return (
|
||||
action.cpu().numpy(),
|
||||
log_prob.cpu().numpy(),
|
||||
value.cpu().numpy(),
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def evaluate_value_batch(self, obs_batch):
|
||||
obs_t = torch.as_tensor(obs_batch, device=self.device)
|
||||
_, value = self.net(obs_t)
|
||||
return value.cpu().numpy()
|
||||
|
||||
def _random_shift(self, obs):
|
||||
"""Random-shift data augmentation (DrQ / RAD style), fully vectorised.
|
||||
|
||||
Pads the (B, C, H, W) image by ``aug_pad`` on each side using
|
||||
replicate padding, then crops a random H x W window per sample.
|
||||
Uses a single grid_sample call instead of a Python loop.
|
||||
"""
|
||||
n, c, h, w = obs.shape
|
||||
pad = self.aug_pad
|
||||
x = F.pad(obs.float(), (pad, pad, pad, pad), mode="replicate")
|
||||
# Per-sample random integer offsets in [0, 2*pad]
|
||||
h_off = torch.randint(0, 2 * pad + 1, (n,), device=obs.device)
|
||||
w_off = torch.randint(0, 2 * pad + 1, (n,), device=obs.device)
|
||||
|
||||
# Build per-sample affine grid that translates by the offset.
|
||||
# Padded image is (h + 2*pad) x (w + 2*pad). To crop a (h x w) window
|
||||
# starting at (h_off, w_off), we sample at normalized coords:
|
||||
# y_norm in [(h_off + 0.5)/H' * 2 - 1, (h_off + h - 0.5)/H' * 2 - 1]
|
||||
# where H' = h + 2*pad. We build that grid in one shot.
|
||||
Hp = h + 2 * pad
|
||||
Wp = w + 2 * pad
|
||||
# Base grid for an (h, w) crop in normalized [-1, 1] coords on the padded image
|
||||
ys = torch.arange(h, device=obs.device, dtype=torch.float32)
|
||||
xs = torch.arange(w, device=obs.device, dtype=torch.float32)
|
||||
# (n, h)
|
||||
y_indices = h_off.unsqueeze(1).float() + ys.unsqueeze(0)
|
||||
# (n, w)
|
||||
x_indices = w_off.unsqueeze(1).float() + xs.unsqueeze(0)
|
||||
# Convert to normalized coords on the padded image, [-1, 1]
|
||||
y_norm = (y_indices + 0.5) / Hp * 2.0 - 1.0 # (n, h)
|
||||
x_norm = (x_indices + 0.5) / Wp * 2.0 - 1.0 # (n, w)
|
||||
# Build (n, h, w, 2) grid: [..., 0] = x, [..., 1] = y per grid_sample API
|
||||
grid = torch.stack(
|
||||
[x_norm.unsqueeze(1).expand(n, h, w),
|
||||
y_norm.unsqueeze(2).expand(n, h, w)],
|
||||
dim=-1,
|
||||
)
|
||||
out = F.grid_sample(x, grid, mode="nearest", align_corners=False,
|
||||
padding_mode="border")
|
||||
return out.to(obs.dtype)
|
||||
|
||||
def update_vec(self, vec_buffer):
|
||||
"""PPO update for a vectorised buffer (flattens n_steps * n_envs)."""
|
||||
obs_shape = vec_buffer.obs_shape
|
||||
b_obs_flat = vec_buffer.obs.reshape(-1, *obs_shape)
|
||||
b_actions_flat = vec_buffer.actions.reshape(-1)
|
||||
b_old_logp_flat = vec_buffer.log_probs.reshape(-1)
|
||||
b_old_values_flat = vec_buffer.values.reshape(-1)
|
||||
b_adv_flat = vec_buffer.advantages.reshape(-1)
|
||||
b_ret_flat = vec_buffer.returns.reshape(-1)
|
||||
|
||||
pg_losses, v_losses, ent_losses, approx_kls, clip_fracs = [], [], [], [], []
|
||||
epochs_completed = 0
|
||||
early_stopped = False
|
||||
|
||||
for epoch in range(self.n_epochs):
|
||||
epoch_kls = []
|
||||
for idx in vec_buffer.get_minibatches(self.batch_size):
|
||||
b_obs = b_obs_flat[idx]
|
||||
b_actions = b_actions_flat[idx]
|
||||
b_old_logp = b_old_logp_flat[idx]
|
||||
b_old_values = b_old_values_flat[idx]
|
||||
b_adv = b_adv_flat[idx]
|
||||
b_ret = b_ret_flat[idx]
|
||||
|
||||
# Random shift data augmentation (refinement #4)
|
||||
if self.use_data_aug:
|
||||
b_obs = self._random_shift(b_obs)
|
||||
|
||||
_, new_logp, entropy, value = self.net.get_action_and_value(
|
||||
b_obs, b_actions
|
||||
)
|
||||
|
||||
log_ratio = new_logp - b_old_logp
|
||||
ratio = log_ratio.exp()
|
||||
|
||||
# Clipped policy loss
|
||||
surr1 = ratio * b_adv
|
||||
surr2 = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) * b_adv
|
||||
policy_loss = -torch.min(surr1, surr2).mean()
|
||||
|
||||
# Clipped value loss (refinement #1, SB3 standard)
|
||||
v_clipped = b_old_values + torch.clamp(
|
||||
value - b_old_values, -self.clip, self.clip
|
||||
)
|
||||
v_loss_unclipped = (value - b_ret).pow(2)
|
||||
v_loss_clipped = (v_clipped - b_ret).pow(2)
|
||||
value_loss = 0.5 * torch.max(v_loss_unclipped, v_loss_clipped).mean()
|
||||
|
||||
entropy_loss = entropy.mean()
|
||||
|
||||
loss = (
|
||||
policy_loss
|
||||
+ self.vf_coef * value_loss
|
||||
- self.ent_coef * entropy_loss
|
||||
)
|
||||
|
||||
self.optim.zero_grad()
|
||||
loss.backward()
|
||||
nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
|
||||
self.optim.step()
|
||||
|
||||
with torch.no_grad():
|
||||
approx_kl = ((ratio - 1) - log_ratio).mean().item()
|
||||
clip_frac = ((ratio - 1.0).abs() > self.clip).float().mean().item()
|
||||
|
||||
pg_losses.append(policy_loss.item())
|
||||
v_losses.append(value_loss.item())
|
||||
ent_losses.append(entropy_loss.item())
|
||||
approx_kls.append(approx_kl)
|
||||
clip_fracs.append(clip_frac)
|
||||
epoch_kls.append(approx_kl)
|
||||
|
||||
epochs_completed += 1
|
||||
# KL early stopping (refinement #2): stop epochs if mean KL exceeds 1.5 * target
|
||||
if self.target_kl is not None and len(epoch_kls) > 0:
|
||||
if sum(epoch_kls) / len(epoch_kls) > 1.5 * self.target_kl:
|
||||
early_stopped = True
|
||||
break
|
||||
|
||||
return {
|
||||
"policy_loss": sum(pg_losses) / len(pg_losses),
|
||||
"value_loss": sum(v_losses) / len(v_losses),
|
||||
"entropy": sum(ent_losses) / len(ent_losses),
|
||||
"approx_kl": sum(approx_kls) / len(approx_kls),
|
||||
"clip_frac": sum(clip_fracs) / len(clip_fracs),
|
||||
"epochs_completed": epochs_completed,
|
||||
"early_stopped": float(early_stopped),
|
||||
"current_clip": self.clip,
|
||||
}
|
||||
|
||||
def save(self, path):
|
||||
torch.save(self.net.state_dict(), path)
|
||||
|
||||
def load(self, path):
|
||||
self.net.load_state_dict(torch.load(path, map_location=self.device))
|
||||
|
||||
def step_schedule(self, progress, ent_floor=0.0, lr_floor=0.0):
|
||||
"""Linearly decay lr / ent_coef / clip toward floors over training.
|
||||
|
||||
- LR: lr_init -> lr_floor
|
||||
- Entropy coefficient: ent_coef_init -> ent_floor (preserves exploration)
|
||||
- Clip range: clip_init -> clip_floor (only if clip_floor is set)
|
||||
"""
|
||||
progress = min(max(progress, 0.0), 1.0)
|
||||
if self.anneal_lr:
|
||||
target_lr = self.lr_init * (1.0 - progress)
|
||||
for g in self.optim.param_groups:
|
||||
g["lr"] = max(target_lr, lr_floor)
|
||||
if self.anneal_ent:
|
||||
target_ent = self.ent_coef_init * (1.0 - progress)
|
||||
self.ent_coef = max(target_ent, ent_floor)
|
||||
# Clip range schedule (refinement #6)
|
||||
if self.clip_floor is not None:
|
||||
target_clip = self.clip_init * (1.0 - progress) + self.clip_floor * progress
|
||||
self.clip = max(target_clip, self.clip_floor)
|
||||
@@ -0,0 +1,23 @@
|
||||
"""Small helpers used across the training and evaluation code."""
|
||||
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def set_seed(seed: int = 42):
|
||||
"""Make Python / NumPy / PyTorch / CUDA randomness reproducible."""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
def format_seconds(s: float) -> str:
|
||||
"""Pretty-print a duration in seconds as HH:MM:SS."""
|
||||
h = int(s // 3600)
|
||||
m = int((s % 3600) // 60)
|
||||
sec = int(s % 60)
|
||||
return f"{h:02d}:{m:02d}:{sec:02d}"
|
||||
@@ -0,0 +1,31 @@
|
||||
"""Vectorised CarRacing-v3 factory.
|
||||
|
||||
Wraps the existing single-env preprocessing stack inside a Gymnasium
|
||||
vector env so n_envs copies can step in parallel processes (async mode).
|
||||
"""
|
||||
|
||||
import gymnasium as gym
|
||||
|
||||
from src.env_wrappers import FrameStack, GrayScaleResize, SkipFrame
|
||||
|
||||
|
||||
def _make_one(rank: int, seed: int):
|
||||
def _init():
|
||||
env = gym.make("CarRacing-v3", continuous=False)
|
||||
env = SkipFrame(env, k=4)
|
||||
env = GrayScaleResize(env, size=84)
|
||||
env = FrameStack(env, k=4)
|
||||
env.action_space.seed(seed + rank)
|
||||
return env
|
||||
return _init
|
||||
|
||||
|
||||
def make_vec_env(n_envs: int = 4, seed: int = 0, async_mode: bool = True):
|
||||
"""Build a vectorised CarRacing-v3 env.
|
||||
|
||||
Returns a gym.vector env whose obs has shape (n_envs, 4, 84, 84) uint8.
|
||||
"""
|
||||
fns = [_make_one(i, seed) for i in range(n_envs)]
|
||||
if async_mode:
|
||||
return gym.vector.AsyncVectorEnv(fns)
|
||||
return gym.vector.SyncVectorEnv(fns)
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Vectorised rollout buffer (n_steps, n_envs, ...) with GAE.
|
||||
|
||||
Uses CleanRL's indexing convention:
|
||||
dones[t] flags whether obs[t] is the FIRST obs of a fresh episode
|
||||
(i.e., the previous action terminated). GAE then uses dones[t+1]
|
||||
as the mask for V(s_{t+1}) at time t.
|
||||
"""
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class VecRolloutBuffer:
|
||||
def __init__(self, n_steps, n_envs, obs_shape, device):
|
||||
self.n_steps = n_steps
|
||||
self.n_envs = n_envs
|
||||
self.obs_shape = obs_shape
|
||||
self.device = device
|
||||
|
||||
self.obs = torch.zeros(
|
||||
(n_steps, n_envs, *obs_shape), dtype=torch.uint8, device=device
|
||||
)
|
||||
self.actions = torch.zeros((n_steps, n_envs), dtype=torch.long, device=device)
|
||||
self.log_probs = torch.zeros((n_steps, n_envs), dtype=torch.float32, device=device)
|
||||
self.rewards = torch.zeros((n_steps, n_envs), dtype=torch.float32, device=device)
|
||||
self.values = torch.zeros((n_steps, n_envs), dtype=torch.float32, device=device)
|
||||
self.dones = torch.zeros((n_steps, n_envs), dtype=torch.float32, device=device)
|
||||
|
||||
self.advantages = torch.zeros((n_steps, n_envs), dtype=torch.float32, device=device)
|
||||
self.returns = torch.zeros((n_steps, n_envs), dtype=torch.float32, device=device)
|
||||
|
||||
self.ptr = 0
|
||||
|
||||
def add(self, obs, action, log_prob, reward, value, done):
|
||||
i = self.ptr
|
||||
self.obs[i] = torch.as_tensor(obs, device=self.device)
|
||||
self.actions[i] = torch.as_tensor(action, device=self.device, dtype=torch.long)
|
||||
self.log_probs[i] = torch.as_tensor(log_prob, device=self.device, dtype=torch.float32)
|
||||
self.rewards[i] = torch.as_tensor(reward, device=self.device, dtype=torch.float32)
|
||||
self.values[i] = torch.as_tensor(value, device=self.device, dtype=torch.float32)
|
||||
self.dones[i] = torch.as_tensor(done, device=self.device, dtype=torch.float32)
|
||||
self.ptr += 1
|
||||
|
||||
def compute_gae(self, last_value, last_done, gamma=0.99, lam=0.95):
|
||||
last_value = torch.as_tensor(last_value, device=self.device, dtype=torch.float32)
|
||||
last_done = torch.as_tensor(last_done, device=self.device, dtype=torch.float32)
|
||||
|
||||
next_values = torch.cat([self.values[1:], last_value.unsqueeze(0)], dim=0)
|
||||
next_non_terminal = 1.0 - torch.cat([self.dones[1:], last_done.unsqueeze(0)], dim=0)
|
||||
|
||||
deltas = self.rewards + gamma * next_values * next_non_terminal - self.values
|
||||
|
||||
adv = torch.zeros((self.n_envs,), device=self.device)
|
||||
for t in reversed(range(self.n_steps)):
|
||||
adv = deltas[t] + gamma * lam * next_non_terminal[t] * adv
|
||||
self.advantages[t] = adv
|
||||
|
||||
self.returns = self.advantages + self.values
|
||||
|
||||
flat = self.advantages.reshape(-1)
|
||||
flat = (flat - flat.mean()) / (flat.std() + 1e-8)
|
||||
self.advantages = flat.reshape(self.n_steps, self.n_envs)
|
||||
|
||||
def get_minibatches(self, batch_size):
|
||||
total = self.n_steps * self.n_envs
|
||||
idx = torch.randperm(total, device=self.device)
|
||||
for start in range(0, total, batch_size):
|
||||
yield idx[start: start + batch_size]
|
||||
|
||||
def reset(self):
|
||||
self.ptr = 0
|
||||
@@ -0,0 +1,103 @@
|
||||
"""Stable-Baselines3 PPO baseline for fair comparison.
|
||||
|
||||
We compare against SB3's default CNN PPO under the same observation
|
||||
preprocessing (4-frame stacked grayscale 84x84) and similar core
|
||||
hyperparameters. Note: SB3 has additional optimisations on by default
|
||||
(orthogonal init, reward normalisation in some versions, vectorised env);
|
||||
this baseline is intentionally a "production" reference, not a like-for-like
|
||||
comparison. The report should discuss this honestly.
|
||||
|
||||
Usage:
|
||||
python train_sb3_baseline.py --total-steps 500000 --run-name sb3_baseline
|
||||
|
||||
This script is ONLY for the evaluation/comparison phase. The main PPO
|
||||
implementation in src/ uses no SB3 code.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import gymnasium as gym
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.callbacks import CheckpointCallback
|
||||
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
|
||||
|
||||
from src.env_wrappers import FrameStack, GrayScaleResize, SkipFrame
|
||||
|
||||
|
||||
def _make_one(rank, seed):
|
||||
def _init():
|
||||
env = gym.make("CarRacing-v3", continuous=False)
|
||||
env = SkipFrame(env, k=4)
|
||||
env = GrayScaleResize(env, size=84)
|
||||
env = FrameStack(env, k=4)
|
||||
env.reset(seed=seed + rank)
|
||||
return env
|
||||
return _init
|
||||
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--total-steps", type=int, default=500_000)
|
||||
p.add_argument("--n-envs", type=int, default=4)
|
||||
p.add_argument("--run-name", type=str, default="sb3_baseline")
|
||||
p.add_argument("--seed", type=int, default=42)
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
project_root = Path(__file__).resolve().parent
|
||||
|
||||
log_dir = project_root / "runs" / args.run_name
|
||||
ckpt_dir = project_root / "models" / args.run_name
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
ckpt_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print("=" * 60)
|
||||
print(f"SB3 PPO baseline: {args.run_name}")
|
||||
print(f"Total steps: {args.total_steps:,} n_envs: {args.n_envs}")
|
||||
print(f"Logs: {log_dir}")
|
||||
print(f"Ckpts: {ckpt_dir}")
|
||||
print("=" * 60)
|
||||
|
||||
fns = [_make_one(i, args.seed) for i in range(args.n_envs)]
|
||||
vec_env = SubprocVecEnv(fns) if args.n_envs > 1 else DummyVecEnv(fns)
|
||||
|
||||
# Match our hyperparameters as closely as possible
|
||||
model = PPO(
|
||||
policy="CnnPolicy",
|
||||
env=vec_env,
|
||||
learning_rate=2.5e-4,
|
||||
n_steps=512,
|
||||
batch_size=64,
|
||||
n_epochs=10,
|
||||
gamma=0.99,
|
||||
gae_lambda=0.95,
|
||||
clip_range=0.2,
|
||||
vf_coef=0.5,
|
||||
ent_coef=0.01,
|
||||
max_grad_norm=0.5,
|
||||
tensorboard_log=str(log_dir),
|
||||
seed=args.seed,
|
||||
verbose=1,
|
||||
)
|
||||
|
||||
ckpt_cb = CheckpointCallback(
|
||||
save_freq=max(50_000 // args.n_envs, 1),
|
||||
save_path=str(ckpt_dir),
|
||||
name_prefix="sb3",
|
||||
)
|
||||
model.learn(
|
||||
total_timesteps=args.total_steps,
|
||||
callback=ckpt_cb,
|
||||
tb_log_name="run",
|
||||
)
|
||||
model.save(str(ckpt_dir / "final.zip"))
|
||||
print(f"\nSaved final SB3 model to {ckpt_dir / 'final.zip'}")
|
||||
|
||||
vec_env.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,226 @@
|
||||
"""Train PPO on CarRacing-v3 with vectorised envs (parallel rollout).
|
||||
|
||||
Usage (Windows):
|
||||
python train_vec.py --n-envs 4 --total-steps 10000 --run-name vec_smoke
|
||||
python train_vec.py --n-envs 4 --total-steps 500000 --run-name vec_main \
|
||||
--anneal-lr --anneal-ent --reward-clip 1.0
|
||||
|
||||
The ``if __name__ == "__main__"`` guard at the bottom is mandatory on
|
||||
Windows for AsyncVectorEnv (otherwise child processes infinite-spawn).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import time
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from src.ppo_agent import PPOAgent
|
||||
from src.utils import format_seconds, set_seed
|
||||
from src.vec_env_wrappers import make_vec_env
|
||||
from src.vec_rollout_buffer import VecRolloutBuffer
|
||||
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--total-steps", type=int, default=3_000_000)
|
||||
p.add_argument("--n-envs", type=int, default=8)
|
||||
p.add_argument("--n-steps", type=int, default=256)
|
||||
p.add_argument("--n-epochs", type=int, default=6)
|
||||
p.add_argument("--batch-size", type=int, default=128)
|
||||
p.add_argument("--lr", type=float, default=2.5e-4)
|
||||
p.add_argument("--gamma", type=float, default=0.99)
|
||||
p.add_argument("--lam", type=float, default=0.95)
|
||||
p.add_argument("--clip", type=float, default=0.2)
|
||||
p.add_argument("--ent-coef", type=float, default=0.01)
|
||||
p.add_argument("--vf-coef", type=float, default=0.5)
|
||||
p.add_argument("--max-grad-norm", type=float, default=0.5)
|
||||
p.add_argument("--seed", type=int, default=42)
|
||||
p.add_argument("--run-name", type=str, default="ppo_vec_main")
|
||||
p.add_argument("--save-every-iters", type=int, default=20)
|
||||
p.add_argument("--anneal-lr", action="store_true")
|
||||
p.add_argument("--anneal-ent", action="store_true")
|
||||
p.add_argument("--reward-clip", type=float, default=None)
|
||||
p.add_argument("--ent-floor", type=float, default=0.0,
|
||||
help="Lower bound on ent_coef when --anneal-ent is on")
|
||||
p.add_argument("--clip-floor", type=float, default=None,
|
||||
help="Linearly anneal clip range to this floor (e.g. 0.05). "
|
||||
"None disables clip annealing.")
|
||||
p.add_argument("--target-kl", type=float, default=None,
|
||||
help="Stop the current update epoch early if mean approx_kl "
|
||||
"exceeds 1.5 * target_kl. None disables. SB3 default 0.015.")
|
||||
p.add_argument("--use-data-aug", action="store_true",
|
||||
help="Apply random-shift augmentation to obs during PPO update")
|
||||
p.add_argument("--sync-mode", action="store_true",
|
||||
help="Use SyncVectorEnv (debug mode)")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
project_root = Path(__file__).resolve().parent
|
||||
|
||||
run_dir = project_root / "runs" / args.run_name
|
||||
ckpt_dir = project_root / "models" / args.run_name
|
||||
run_dir.mkdir(parents=True, exist_ok=True)
|
||||
ckpt_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
set_seed(args.seed)
|
||||
|
||||
# Throughput tweak: let cuDNN auto-pick the fastest conv algorithm
|
||||
# for our fixed (B, 4, 84, 84) input shape.
|
||||
torch.backends.cudnn.benchmark = True
|
||||
|
||||
vec_env = make_vec_env(
|
||||
n_envs=args.n_envs,
|
||||
seed=args.seed,
|
||||
async_mode=not args.sync_mode,
|
||||
)
|
||||
|
||||
agent = PPOAgent(
|
||||
n_actions=5,
|
||||
lr=args.lr,
|
||||
clip=args.clip,
|
||||
vf_coef=args.vf_coef,
|
||||
ent_coef=args.ent_coef,
|
||||
max_grad_norm=args.max_grad_norm,
|
||||
n_epochs=args.n_epochs,
|
||||
batch_size=args.batch_size,
|
||||
anneal_lr=args.anneal_lr,
|
||||
anneal_ent=args.anneal_ent,
|
||||
clip_floor=args.clip_floor,
|
||||
target_kl=args.target_kl,
|
||||
use_data_aug=args.use_data_aug,
|
||||
)
|
||||
buffer = VecRolloutBuffer(
|
||||
n_steps=args.n_steps,
|
||||
n_envs=args.n_envs,
|
||||
obs_shape=(4, 84, 84),
|
||||
device=agent.device,
|
||||
)
|
||||
writer = SummaryWriter(str(run_dir))
|
||||
|
||||
samples_per_iter = args.n_steps * args.n_envs
|
||||
|
||||
print("=" * 60)
|
||||
print(f"Run: {args.run_name}")
|
||||
mode_str = "sync" if args.sync_mode else "async"
|
||||
print(f"Mode: {mode_str} vec env, n_envs={args.n_envs}")
|
||||
print(f"Total steps: {args.total_steps:,}")
|
||||
print(f"Per-iter samples: {samples_per_iter} (n_steps={args.n_steps} x n_envs={args.n_envs})")
|
||||
print(f"lr={args.lr} gamma={args.gamma} lam={args.lam} clip={args.clip}")
|
||||
print(f"anneal_lr={args.anneal_lr} anneal_ent={args.anneal_ent} "
|
||||
f"ent_floor={args.ent_floor} reward_clip={args.reward_clip}")
|
||||
print(f"clip_floor={args.clip_floor} target_kl={args.target_kl} "
|
||||
f"use_data_aug={args.use_data_aug}")
|
||||
print(f"n_epochs={args.n_epochs} batch_size={args.batch_size}")
|
||||
print(f"Device: {agent.device}")
|
||||
print(f"Logs: {run_dir}")
|
||||
print(f"Ckpts: {ckpt_dir}")
|
||||
print("=" * 60)
|
||||
|
||||
obs, _ = vec_env.reset(seed=args.seed)
|
||||
next_done = np.zeros(args.n_envs, dtype=np.float32)
|
||||
|
||||
global_step = 0
|
||||
iteration = 0
|
||||
episode_returns = deque(maxlen=100)
|
||||
cur_ep_returns = np.zeros(args.n_envs, dtype=np.float32)
|
||||
cur_ep_lens = np.zeros(args.n_envs, dtype=np.int64)
|
||||
start_time = time.time()
|
||||
|
||||
while global_step < args.total_steps:
|
||||
iteration += 1
|
||||
|
||||
agent.step_schedule(
|
||||
global_step / args.total_steps,
|
||||
ent_floor=args.ent_floor,
|
||||
)
|
||||
|
||||
# Rollout (n_steps per env, total samples = n_steps * n_envs)
|
||||
for step in range(args.n_steps):
|
||||
actions, log_probs, values = agent.act_batch(obs)
|
||||
|
||||
next_obs, rewards, terms, truncs, _ = vec_env.step(actions)
|
||||
done = np.logical_or(terms, truncs).astype(np.float32)
|
||||
|
||||
train_rewards = (
|
||||
np.maximum(rewards, -args.reward_clip)
|
||||
if args.reward_clip is not None
|
||||
else rewards
|
||||
)
|
||||
|
||||
# Use CleanRL convention: dones[step] = was obs[step] a fresh start
|
||||
buffer.add(obs, actions, log_probs, train_rewards, values, next_done)
|
||||
|
||||
cur_ep_returns += rewards
|
||||
cur_ep_lens += 1
|
||||
for i in range(args.n_envs):
|
||||
if done[i]:
|
||||
episode_returns.append(float(cur_ep_returns[i]))
|
||||
writer.add_scalar("episode/return", cur_ep_returns[i], global_step)
|
||||
writer.add_scalar("episode/length", cur_ep_lens[i], global_step)
|
||||
cur_ep_returns[i] = 0.0
|
||||
cur_ep_lens[i] = 0
|
||||
|
||||
obs = next_obs
|
||||
next_done = done
|
||||
global_step += args.n_envs
|
||||
|
||||
# GAE
|
||||
last_value = agent.evaluate_value_batch(obs)
|
||||
buffer.compute_gae(
|
||||
last_value=last_value,
|
||||
last_done=next_done,
|
||||
gamma=args.gamma,
|
||||
lam=args.lam,
|
||||
)
|
||||
|
||||
# Update
|
||||
losses = agent.update_vec(buffer)
|
||||
for k, v in losses.items():
|
||||
writer.add_scalar(f"losses/{k}", v, global_step)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
steps_per_sec = global_step / max(elapsed, 1e-6)
|
||||
avg_ret = sum(episode_returns) / len(episode_returns) if episode_returns else 0.0
|
||||
writer.add_scalar("perf/steps_per_sec", steps_per_sec, global_step)
|
||||
writer.add_scalar("episode/avg_return_100", avg_ret, global_step)
|
||||
writer.add_scalar("hp/lr", agent.optim.param_groups[0]["lr"], global_step)
|
||||
writer.add_scalar("hp/ent_coef", agent.ent_coef, global_step)
|
||||
writer.add_scalar("hp/clip", agent.clip, global_step)
|
||||
|
||||
epochs_done = int(losses.get("epochs_completed", args.n_epochs))
|
||||
early = losses.get("early_stopped", 0.0) > 0.5
|
||||
mark = "*" if early else " "
|
||||
print(
|
||||
f"iter {iteration:4d} | step {global_step:>9,} | "
|
||||
f"avg_ret(100) {avg_ret:7.2f} | "
|
||||
f"pg {losses['policy_loss']:+.4f} | "
|
||||
f"v {losses['value_loss']:7.3f} | "
|
||||
f"ent {losses['entropy']:.3f} | "
|
||||
f"kl {losses['approx_kl']:.4f} | "
|
||||
f"clip {agent.clip:.3f} | "
|
||||
f"clip% {losses['clip_frac']:.2%} | "
|
||||
f"ep {epochs_done}{mark}/{args.n_epochs} | "
|
||||
f"sps {steps_per_sec:5.0f} | "
|
||||
f"{format_seconds(elapsed)}"
|
||||
)
|
||||
|
||||
if iteration % args.save_every_iters == 0:
|
||||
agent.save(str(ckpt_dir / f"iter_{iteration:04d}.pt"))
|
||||
|
||||
buffer.reset()
|
||||
|
||||
final_path = ckpt_dir / "final.pt"
|
||||
agent.save(str(final_path))
|
||||
print(f"\nTraining done. Final model: {final_path}")
|
||||
writer.close()
|
||||
vec_env.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user