update rl report
@@ -27,288 +27,226 @@
\maketitle

\begin{abstract}
This report presents the implementation and evaluation of a Deep Q-Network (DQN) agent for playing the Atari game Space Invaders. The agent was trained from scratch using Dueling Double DQN with experience replay and target network stabilization. After 2 million training steps, the agent achieved a best average score of 32.50 on the Space Invaders environment, demonstrating competitive performance compared to baseline methods. This report details the algorithm selection, implementation details, experimental results, and analysis of the agent's performance.
I implemented a Dueling Double DQN agent with Prioritized Experience Replay (PER) to play Space Invaders from raw pixels. The agent was trained from scratch for 2M steps on an RTX 4060 GPU. The best checkpoint---at 1.2M steps---averaged 32.50 over 10 evaluation episodes, roughly 6.5$\times$ above random play. However, scores varied wildly across checkpoints (11.20 at 600K, back to 32.50 at 1.2M, then 18.65 at 1.8M), and the standard deviation stayed high throughout. Training never really converged. This report covers the algorithm, implementation choices, results, and why things were unstable.
\end{abstract}

\section{Introduction}

\subsection{Game Selection and Challenges}
Space Invaders is a classic Atari arcade game where the player controls a laser cannon at the bottom of the screen, shooting at rows of alien invaders that move horizontally and gradually descend. The game presents several challenges:
\subsection{Game and Motivation}
Space Invaders is one of the canonical Atari benchmarks for deep RL. The player controls a cannon at the bottom, shooting up at rows of aliens that move sideways and gradually descend. It has a discrete action space (6 actions: move left/right, fire, combinations), 210$\times$160 RGB frames as input, and sparse rewards---points only come from destroying aliens.

Deep Q-Networks (DQN) made Atari games a standard RL benchmark when Mnih et al.\ showed they could reach human-level play from pixels alone. Since then, a series of improvements (Double Q-learning, Dueling architecture, prioritized replay) have been shown to help. I picked Space Invaders because it's a real challenge---not as simple as Pong, not as complex as Montezuma's Revenge---and I wanted to see how far a from-scratch implementation with these improvements could get on a limited compute budget.

\subsection{Related Work}
Mnih et al.\ (2015) first showed DQN achieving human-level scores on many Atari games using a CNN, experience replay, and a target network. Van Hasselt et al.\ (2016) fixed DQN's tendency to overestimate Q-values by decoupling action selection from evaluation (Double DQN). Wang et al.\ (2016) redesigned the network to output separate value and advantage streams (Dueling DQN), which helps when the exact action matters less than whether the state is good. Schaul et al.\ (2016) proposed sampling transitions by TD error magnitude rather than uniformly (Prioritized Experience Replay, PER). Hessel et al.\ (2018) combined these and more into ``Rainbow,'' showing the improvements are largely complementary.

I considered PPO as an alternative---it's generally more stable and handles continuous actions---but for a discrete-action Atari game with a tight 2M-step budget, DQN-based methods are simpler to tune and debug. Dueling made sense because in Space Invaders, being alive with no enemies overhead is good regardless of whether you move left or right. PER helps make the most of limited samples, which matters at 2M steps.

\section{Algorithm}

\subsection{DQN Basics}
Q-learning estimates $Q(s,a)$, the expected return from taking action $a$ in state $s$ and following the optimal policy afterward. The Bellman target is $r + \gamma \max_{a'} Q(s', a')$. DQN uses a neural network for $Q$, with two tricks to make this work with non-linear function approximation:

\begin{itemize}
\item \textbf{Discrete Action Space}: The player can choose from 6 actions (noop, fire, left, right, left+fire, right+fire)
\item \textbf{Visual Input}: The agent must process raw pixel inputs (210×160 RGB images)
\item \textbf{Temporal Dependencies}: Success requires understanding movement patterns and predicting enemy trajectories
\item \textbf{Sparse Rewards}: Points are only earned when destroying aliens or completing a level
\item \textbf{Partial Observability}: The agent must remember past states to make informed decisions
\item \textbf{Experience replay}: Transitions $(s, a, r, s', done)$ go into a buffer and are sampled randomly for training, breaking the correlation between consecutive samples.
\item \textbf{Target network}: A frozen copy of the Q-network computes the target $r + \gamma \max_{a'} Q_{\text{target}}(s', a')$, updated every $C$ steps.
\end{itemize}
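
For concreteness, a minimal sketch of these two mechanisms (illustrative PyTorch-style Python; the class and function names are mine, not the project's actual code):

\begin{verbatim}
import collections, random

Transition = collections.namedtuple(
    "Transition", ["s", "a", "r", "s_next", "done"])

class ReplayBuffer:
    """Uniform replay: random sampling breaks the correlation
    between consecutive frames."""
    def __init__(self, capacity):
        self.buf = collections.deque(maxlen=capacity)

    def push(self, s, a, r, s_next, done):
        self.buf.append(Transition(s, a, r, s_next, done))

    def sample(self, batch_size):
        return random.sample(self.buf, batch_size)

def sync_target(online_net, target_net):
    # Hard update every C steps: copy the online weights into the
    # frozen target network used for bootstrap targets.
    target_net.load_state_dict(online_net.state_dict())
\end{verbatim}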

\subsection{Motivation}
Deep reinforcement learning has shown remarkable success in playing Atari games directly from pixel inputs. The DQN algorithm, introduced by Mnih et al. (2015), was a breakthrough that demonstrated human-level performance on many Atari games. This project aims to implement DQN from scratch and evaluate its effectiveness on Space Invaders.

\section{Literature Review}

\subsection{Deep Reinforcement Learning in Atari Games}
The application of deep reinforcement learning to Atari games has been a significant research area:

\begin{itemize}
\item \textbf{DQN (2015)}: Mnih et al. introduced the first deep RL agent achieving human-level performance on Atari games using convolutional neural networks with experience replay and target networks.
\item \textbf{Double DQN (2016)}: Van Hasselt et al. addressed the overestimation bias in DQN by decoupling action selection from evaluation.
\item \textbf{Dueling DQN (2016)}: Wang et al. proposed a network architecture that separately estimates state value and action advantages.
\item \textbf{Prioritized Experience Replay (2016)}: Schaul et al. improved sample efficiency by prioritizing transitions with high TD errors.
\item \textbf{A3C (2016)}: Mnih et al. introduced asynchronous advantage actor-critic for parallel training.
\end{itemize}

\subsection{Algorithm Comparison}
Several algorithms were considered for this project:

\begin{table}[H]
\centering
\begin{tabular}{@{}lccc@{}}
\toprule
\textbf{Algorithm} & \textbf{Action Space} & \textbf{Sample Efficiency} & \textbf{Stability} \\
\midrule
DQN & Discrete & Moderate & High \\
Double DQN & Discrete & Moderate & High \\
Dueling DQN & Discrete & High & High \\
PPO & Both & High & Very High \\
A2C & Both & Moderate & Moderate \\
\bottomrule
\end{tabular}
\caption{Comparison of reinforcement learning algorithms}
\label{tab:algorithm_comparison}
\end{table}

\textbf{Why DQN?} DQN was selected for this project because:
\begin{enumerate}
\item It is well-suited for discrete action spaces like Space Invaders
\item The algorithm is relatively simple to implement and understand
\item It has a strong track record on Atari games
\item The implementation demonstrates fundamental RL concepts clearly
\end{enumerate}

\section{Algorithm and Implementation}

\subsection{DQN Algorithm}

\subsubsection{Q-Learning Foundation}
DQN builds upon the Q-learning algorithm, which learns a function $Q(s, a)$ that estimates the expected return of taking action $a$ in state $s$:

\begin{equation}
Q^*(s, a) = \mathbb{E}[r + \gamma \max_{a'} Q^*(s', a') | s, a]
\end{equation}

where $\gamma$ is the discount factor.

\subsubsection{Experience Replay}
To break the correlation between consecutive samples, DQN uses experience replay:
\begin{itemize}
\item Store transitions $(s, a, r, s', done)$ in a replay buffer
\item Sample random mini-batches for training
\item This stabilizes training and improves sample efficiency
\end{itemize}

\subsubsection{Target Network}
To further stabilize training, DQN uses a separate target network:
\begin{itemize}
\item The target network is a copy of the Q-network
\item It is updated periodically (every $C$ steps)
\item Used to compute the target Q-values during training
\end{itemize}

\subsubsection{Double DQN Extension}
This implementation uses Double DQN to address overestimation bias:
\subsection{Double DQN}
Standard DQN uses the same network to pick and evaluate actions, which biases Q-values upward. Double DQN separates them: pick the action with the online network, evaluate with the target network:

\begin{equation}
y = r + \gamma Q(s', \arg\max_{a'} Q(s', a'; \theta); \theta^-)
\end{equation}

where $\theta$ are the online network parameters and $\theta^-$ are the target network parameters.
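
In code, this target differs from vanilla DQN by one line: select with the online network, evaluate with the target network. A sketch under the same assumptions as above (PyTorch-style; tensor names are illustrative):

\begin{verbatim}
import torch

@torch.no_grad()
def double_dqn_target(online_net, target_net, r, s_next, done,
                      gamma=0.99):
    # Action selection uses the online network...
    a_star = online_net(s_next).argmax(dim=1, keepdim=True)
    # ...but evaluation uses the frozen target network.
    q_next = target_net(s_next).gather(1, a_star).squeeze(1)
    return r + gamma * (1.0 - done.float()) * q_next
\end{verbatim}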

\subsection{Dueling Architecture}
The network splits after the conv layers into two streams:
\begin{equation}
Q(s,a) = V(s) + A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'}A(s,a')
\end{equation}

\subsection{Network Architecture}
$V(s)$ learns which states are good; $A(s,a)$ learns which actions are better than average. In Space Invaders, many states have similar values---survival is about positioning, and any button press that doesn't get you killed is fine. The Dueling split lets the network learn this without evaluating every action separately.

The Q-network uses a convolutional neural network:
\subsection{Prioritized Experience Replay}
Instead of uniform sampling, PER samples transition $i$ with probability proportional to $|\delta_i|^\alpha$, where $\delta_i$ is the TD error. An importance-sampling weight $w_i = (N \cdot P(i))^{-\beta}$ corrects the bias this introduces, with $\beta$ annealed from 0.4 to 1.0. New transitions get max priority so they're seen at least once. In theory, this focuses updates on surprising experiences. In practice, it also makes the sampling distribution non-stationary, which can hurt stability.
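
A toy proportional-priority buffer makes the mechanics concrete (illustrative only; a real implementation would use a sum-tree for $O(\log N)$ sampling, and the names here are mine):

\begin{verbatim}
import numpy as np

class ToyPER:
    def __init__(self, capacity, alpha=0.6, eps=1e-6):
        self.alpha, self.eps = alpha, eps
        self.capacity, self.pos = capacity, 0
        self.data, self.prio = [], np.zeros(capacity)

    def push(self, transition):
        # New transitions get max priority: seen at least once.
        p = self.prio.max() if self.data else 1.0
        if len(self.data) < self.capacity:
            self.data.append(transition)
        else:
            self.data[self.pos] = transition
        self.prio[self.pos] = p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta):
        p = self.prio[:len(self.data)] ** self.alpha
        probs = p / p.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        # Importance-sampling weights correct the bias, normalised
        # by the max weight as in Schaul et al.
        w = (len(self.data) * probs[idx]) ** (-beta)
        return idx, [self.data[i] for i in idx], w / w.max()

    def update_priorities(self, idx, td_errors):
        self.prio[idx] = np.abs(td_errors) + self.eps
\end{verbatim}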

\subsection{Network and Preprocessing}
Table~\ref{tab:network} shows the Dueling architecture. The shared encoder is three conv layers (same as Mnih et al.), then value and advantage streams each have a 512-unit hidden layer. Total: 3.29M parameters.

\begin{table}[H]
\centering
\begin{tabular}{@{}lll@{}}
\toprule
\textbf{Layer} & \textbf{Output Shape} & \textbf{Parameters} \\
\textbf{Component} & \textbf{Output} & \textbf{Params} \\
\midrule
Conv2d(4, 32, 8×8, stride=4) & 20×20×32 & 8,224 \\
Conv2d(32, 64, 4×4, stride=2) & 9×9×64 & 32,832 \\
Conv2d(64, 64, 3×3, stride=1) & 7×7×64 & 36,928 \\
Linear(3136, 512) & 512 & 1,606,144 \\
Linear(512, 6) & 6 & 3,078 \\
Conv2d(4, 32, 8$\times$8, /4) & 20$\times$20$\times$32 & 8,224 \\
Conv2d(32, 64, 4$\times$4, /2) & 9$\times$9$\times$64 & 32,832 \\
Conv2d(64, 64, 3$\times$3, /1) & 7$\times$7$\times$64 & 36,928 \\
\multicolumn{3}{@{}l@{}}{\textbf{Value Stream}} \\
\quad Linear(3136, 512) + ReLU & 512 & 1,606,144 \\
\quad Linear(512, 1) & 1 & 513 \\
\multicolumn{3}{@{}l@{}}{\textbf{Advantage Stream}} \\
\quad Linear(3136, 512) + ReLU & 512 & 1,606,144 \\
\quad Linear(512, 6) & 6 & 3,078 \\
\midrule
\textbf{Total} & & 1,687,206 \\
\textbf{Total} & & 3,293,863 \\
\bottomrule
\end{tabular}
\caption{Network architecture details}
\caption{Dueling Q-Network architecture}
\label{tab:network}
\end{table}
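
The table maps directly onto a module definition. A sketch of what it might look like (PyTorch; assumes $84\times84$ 4-frame inputs so the flattened encoder output is 3136; this reproduces the parameter counts in Table~\ref{tab:network} but is not necessarily the project's exact code):

\begin{verbatim}
import torch
import torch.nn as nn

class DuelingQNet(nn.Module):
    """Shared conv encoder, then separate V and A streams."""
    def __init__(self, n_actions=6):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(4, 32, 8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(),
            nn.Flatten())                     # -> 3136 features
        self.value = nn.Sequential(
            nn.Linear(3136, 512), nn.ReLU(), nn.Linear(512, 1))
        self.adv = nn.Sequential(
            nn.Linear(3136, 512), nn.ReLU(),
            nn.Linear(512, n_actions))

    def forward(self, x):
        h = self.conv(x / 255.0)              # uint8 frames -> [0, 1]
        v, a = self.value(h), self.adv(h)
        # Subtract the mean advantage for identifiability.
        return v + a - a.mean(dim=1, keepdim=True)
\end{verbatim}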

\subsection{Environment Preprocessing}
Preprocessing follows the standard Atari pipeline: grayscale $\rightarrow$ resize to 84$\times$84 $\rightarrow$ stack 4 frames $\rightarrow$ frame-skip (repeat action 4 frames, take pixel-wise max of the last two). Rewards are clipped to $[-1, 1]$. At the start of each episode, 1--30 no-op actions are inserted to randomize initial conditions.
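
Sketched by hand below (OpenCV + NumPy, assuming the Gymnasium five-tuple step API; the actual project may use library wrappers instead). Frame stacking would then keep a deque of the last 4 processed frames.

\begin{verbatim}
import collections
import cv2
import numpy as np

def to_gray_84(frame_rgb):
    """Grayscale + downsample one 210x160 RGB frame to 84x84."""
    gray = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)

def step_skip4(env, action):
    """Repeat `action` for 4 frames; pixel-wise max over the last
    two raw frames removes Atari sprite flicker."""
    total, last_two = 0.0, collections.deque(maxlen=2)
    terminated = truncated = False
    for _ in range(4):
        obs, r, terminated, truncated, _ = env.step(action)
        last_two.append(obs)
        total += r
        if terminated or truncated:
            break
    frame = np.max(np.stack(last_two), axis=0)
    reward = float(np.clip(total, -1.0, 1.0))   # reward clipping
    return to_gray_84(frame), reward, terminated or truncated
\end{verbatim}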

The environment is preprocessed with:
\begin{itemize}
\item \textbf{Grayscale Conversion}: RGB to grayscale to reduce input dimensionality
\item \textbf{Resizing}: Downsample to 84×84 pixels
\item \textbf{Frame Stacking}: Stack 4 consecutive frames to capture motion
\item \textbf{Reward Clipping}: Clip rewards to [-1, 1] for stability
\item \textbf{Noop Reset}: Random no-op actions at episode start for exploration
\item \textbf{Frame Skipping}: Skip 4 frames and take max to reduce computation
\end{itemize}

\subsection{Training Details}
\subsection{Hyperparameters}
Table~\ref{tab:hyperparameters} lists the final configuration. The learning rate and $\epsilon$ schedule are fairly conservative. PER hyperparameters use standard values from Schaul et al.

\begin{table}[H]
\centering
\begin{tabular}{@{}ll@{}}
\toprule
\textbf{Hyperparameter} & \textbf{Value} \\
\textbf{Parameter} & \textbf{Value} \\
\midrule
Learning Rate & $1 \times 10^{-4}$ \\
Discount Factor ($\gamma$) & 0.99 \\
Batch Size & 32 \\
Replay Buffer Size & 100,000 \\
$\epsilon$ Start & 1.0 \\
$\epsilon$ End & 0.01 \\
$\epsilon$ Decay Steps & 1,000,000 \\
Target Network Update & Every 1,000 steps \\
Total Training Steps & 2,000,000 \\
Warmup Steps & 10,000 \\
Learning rate & $5\times10^{-5}$, halved after 1M steps \\
Discount $\gamma$ & 0.99 \\
Batch size & 64 \\
Replay buffer & 500K transitions \\
PER $\alpha$ & 0.6 \\
PER $\beta$ & 0.4 $\rightarrow$ 1.0 (linear) \\
$\epsilon$ schedule & 1.0 $\rightarrow$ 0.01, linear over 1M steps \\
Target network update & Every 1,000 steps \\
Total steps & 2,000,000 \\
Warmup (random only) & 10,000 steps \\
Optimizer & Adam ($\epsilon$=1e-5) \\
Gradient clipping & Max norm = 10 \\
\bottomrule
\end{tabular}
\caption{Training hyperparameters}
\label{tab:hyperparameters}
\end{table}
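
The $\epsilon$ schedule in the table is just a clamped linear interpolation; in sketch form (illustrative helper, not the project's code):

\begin{verbatim}
def epsilon(step, start=1.0, end=0.01, decay_steps=1_000_000):
    """Linear decay from `start` to `end`, clamped afterwards."""
    frac = min(step / decay_steps, 1.0)
    return start + frac * (end - start)
\end{verbatim}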

\section{Experimental Results}
\section{Results}

\subsection{Training Performance}
\subsection{Training Progress}
Training took about 6 hours on an RTX 4060 laptop GPU. Figure~\ref{fig:training_curves} shows the training curves (reward, loss, Q-values). Figure~\ref{fig:epsilon_decay} shows the $\epsilon$ schedule.

The agent was trained for 2 million steps on an NVIDIA RTX 4060 GPU. Key observations:

\begin{itemize}
\item \textbf{Initial Phase} (0-100K steps): Score reaches 15-23 but with high variance
\item \textbf{Learning Phase} (100K-600K steps): Score peaks at 30.45 at 400K, followed by regression to 11.20 at 600K
\item \textbf{Convergence Phase} (600K-2M steps): Score peaks at 32.50 at 1.2M steps, with recurring fluctuations between 18-32
\end{itemize}
The training reward curve is noisy but trends upward through $\sim$400 episodes, then oscillates. The loss curve shows the expected decay from initial high values as the network starts making better predictions, though it stays bumpy. Average Q-values rise consistently, which could mean the network is learning or could mean it's overestimating---hard to tell without ground truth.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\textwidth]{../plots/training_curves.png}
\caption{Training curves showing reward, loss, and Q-value evolution}
\caption{Training reward, loss, and Q-value over 500 episodes}
\label{fig:training_curves}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.8\textwidth]{../plots/evaluation_curve.png}
\caption{Evaluation reward at different training checkpoints with standard deviation error bars}
\label{fig:evaluation_curve}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[width=0.8\textwidth]{../plots/epsilon_decay.png}
\caption{Epsilon decay curve during training}
\caption{$\epsilon$ decay from 1.0 to 0.01 over 1M steps}
\label{fig:epsilon_decay}
\end{figure}

\subsection{Evaluation Results}
\subsection{Checkpoint Evaluation}
After training, I evaluated 11 saved checkpoints (100K, 200K, then every 200K up to 2M) plus the ``best'' model tracked during training. Each was tested over 10 episodes with $\epsilon=0$ (greedy). Figure~\ref{fig:evaluation_curve} plots the results with error bars. Table~\ref{tab:evaluation} gives the numbers.
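
The evaluation loop itself is simple; a sketch (PyTorch, Gymnasium-style step API; names are illustrative):

\begin{verbatim}
import numpy as np
import torch

def evaluate(net, env, episodes=10, device="cuda"):
    """Greedy (epsilon = 0) rollouts; returns mean/std of scores."""
    scores = []
    for _ in range(episodes):
        obs, _ = env.reset()
        done, total = False, 0.0
        while not done:
            x = torch.as_tensor(obs, device=device).unsqueeze(0)
            with torch.no_grad():
                action = int(net(x).argmax(dim=1).item())
            obs, r, term, trunc, _ = env.step(action)
            total += r
            done = term or trunc
        scores.append(total)
    return float(np.mean(scores)), float(np.std(scores))
\end{verbatim}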

The trained agent was evaluated over 20 episodes at different training checkpoints:
\begin{figure}[H]
\centering
\includegraphics[width=0.8\textwidth]{../plots/evaluation_curve.png}
\caption{Evaluation scores across checkpoints, mean $\pm$ std over 10 episodes}
\label{fig:evaluation_curve}
\end{figure}

\begin{table}[H]
\centering
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Checkpoint} & \textbf{Average Score} & \textbf{Std Dev} \\
\textbf{Checkpoint} & \textbf{Mean Score} & \textbf{Std} \\
\midrule
100K steps & 15.00 & 12.84 \\
200K steps & 23.55 & 18.66 \\
400K steps & 30.45 & 16.47 \\
800K steps & 18.20 & 6.28 \\
1.0M steps & 22.95 & 12.10 \\
1.2M steps & \textbf{32.50} & 11.43 \\
1.6M steps & 25.35 & 11.88 \\
2.0M steps (final) & 24.70 & 17.15 \\
100K & 15.00 & 12.84 \\
200K & 23.55 & 18.66 \\
400K & 30.45 & 16.47 \\
600K & 11.20 & 8.49 \\
800K & 18.20 & 6.28 \\
1.0M & 22.95 & 12.10 \\
1.2M & \textbf{32.50} & 11.43 \\
1.4M & 21.15 & 7.80 \\
1.6M & 25.35 & 11.88 \\
1.8M & 18.65 & 7.35 \\
2.0M (final) & 24.70 & 17.15 \\
\midrule
best.pt (tracked live) & 20.40 & 11.43 \\
\bottomrule
\end{tabular}
\caption{Evaluation results at different training checkpoints}
\caption{Evaluation at each checkpoint, 10 episodes each. The live-tracked best model underperformed most saved checkpoints when re-evaluated.}
\label{tab:evaluation}
\end{table}

The best performance was achieved at 1.2M training steps with an average score of 32.50, representing a 6.5x improvement over random play ($\sim$5). While the agent shows clear learning progress, the high standard deviations (6-19) indicate significant performance variance across episodes, and the score fluctuations between checkpoints suggest training instability.
A few things stand out:

\subsection{Comparison with Baselines}
\begin{enumerate}
\item \textbf{The 600K collapse}. Score dropped from 30.45 at 400K to 11.20 at 600K---below the 100K checkpoint (15.00). The network effectively unlearned whatever it had figured out, then climbed back to 32.50 by 1.2M. This isn't just noise; it's a real regression.
\item \textbf{No convergence}. Even in the second half of training, scores jump between 18 and 32. There's no point where things stabilize. The final checkpoint at 2M scores 24.70, which is worse than the checkpoint 800K steps earlier.
\item \textbf{best.pt isn't best}. The model the trainer tracked as best scored 20.40 when re-evaluated, worse than 7 out of 11 checkpoints. The in-training evaluation happened to catch a lucky or unlucky seed; with only 10 episodes and std of 12--17, a single evaluation can easily be 10+ points off.
\item \textbf{High variance throughout}. Standard deviations range from 6 to 19, meaning even the best checkpoint sometimes scores single digits and sometimes above 40.
\end{enumerate}

\subsection{Baseline Comparison}
Table~\ref{tab:comparison} puts the numbers in context. Random play gets $\sim$5. My best checkpoint gets 32.50. Published DQN scores on Space Invaders are in the 170--580 range, but those used 50M frames (12.5M steps), a larger replay buffer, and a single-frame input with different preprocessing. The comparison isn't apples-to-apples; 2M steps is roughly 16\% of the original training budget. Whether the agent would reach competitive scores with more steps is an open question---the instability at 2M doesn't inspire confidence, but it also doesn't rule it out.

\begin{table}[H]
\centering
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Method} & \textbf{Average Score} & \textbf{Training Time} \\
\textbf{Method} & \textbf{Score} & \textbf{Compute} \\
\midrule
Random Agent & $\sim$5 & N/A \\
Our DQN (Best) & 32.50 & $\sim$6 hours \\
Our DQN (Final) & 24.70 & $\sim$6 hours \\
Human Player & $\sim$200 & N/A \\
Random & $\sim$5 & --- \\
This work (best) & 32.50 & 6h, RTX 4060 \\
This work (final, 2.0M) & 24.70 & 6h, RTX 4060 \\
Human (Bellemare et al., 2013) & $\sim$165 & --- \\
DQN (Mnih et al., 2015) & 170--580 & 7--10 days, single GPU \\
\bottomrule
\end{tabular}
\caption{Comparison with baselines}
\caption{Score comparison. Note the large difference in training budget.}
\label{tab:comparison}
\end{table}

\section{Discussion}

\subsection{Performance Analysis}
The DQN agent achieved competitive performance on Space Invaders, with the best checkpoint reaching an average score of 32.50. The algorithm's success can be attributed to:
\subsection{Why Things Worked (When They Did)}
The Dueling architecture probably helped the most. In Space Invaders, the difference between good and bad states is much larger than the difference between good and bad actions in a given state. Dueling separates these---$V(s)$ captures whether the agent is safe, $A(s,a)$ captures whether shooting or dodging is better right now. Double DQN likely reduced the worst overestimation, and PER probably helped early on by focusing on transitions where the agent was surprised.

\subsection{Why Things Didn't Converge}
The 600K collapse and ongoing oscillation suggest a few interacting problems:

\textbf{Too much exploration for too long.} $\epsilon$ decays linearly from 1.0 to 0.01 over 1M steps. At 600K, $\epsilon \approx 0.4$, so 40\% of actions are random. That's enough randomness to frequently land in bad states the network hasn't learned to recover from. The data from those bad trajectories then contaminates the replay buffer and training.

\textbf{PER makes the data distribution non-stationary.} As the network improves, which transitions get high priority changes. The optimizer is essentially chasing a moving target. With only 2M environment steps, it may never settle.

\textbf{3.3M parameters on 2M steps is under-training.} The network has enough capacity to overfit to recent batches, and gradient variance from mini-batches may be high enough to destabilize the Q-function.

\textbf{The best.pt issue.} With std of 12--17 across 10 evaluation episodes, a single eval can easily be off by 10+ points. The live tracker keeps whichever model happened to score highest on one evaluation---but that evaluation might have been lucky. Checkpoint-based evaluation (saving every 50K steps) would have caught the actual best model, but the live-saved best.pt is misleading.
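
To put a number on that claim, using the spreads reported in Table~\ref{tab:evaluation}: with $\sigma \approx 17$ and $n = 10$ episodes, the standard error of an evaluation mean is
\begin{equation}
\mathrm{SE} = \frac{\sigma}{\sqrt{n}} \approx \frac{17}{\sqrt{10}} \approx 5.4,
\end{equation}
so a two-standard-error band is roughly $\pm 10.8$ points---about the gap between the live-tracked ``best'' model (20.40) and the genuinely best checkpoint (32.50).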

\subsection{What Would Help}
Based on the above, the most promising fixes are:
\begin{itemize}
\item Dueling DQN architecture separating state value and action advantage streams
\item Experience replay breaking temporal correlations
\item Target network stabilizing training
\item Double DQN reducing overestimation bias
\item Effective preprocessing reducing visual complexity
\end{itemize}

\subsection{Limitations}
Several limitations were observed:

\begin{itemize}
\item \textbf{Sample Efficiency}: DQN requires millions of samples to learn effectively
\item \textbf{Overestimation}: Despite Double DQN, some overestimation persists
\item \textbf{Hyperparameter Sensitivity}: Performance is sensitive to learning rate and $\epsilon$ schedule
\item \textbf{Visual Processing}: The CNN may not capture all relevant game features
\end{itemize}

\subsection{Potential Improvements}
Future improvements could include:

\begin{itemize}
\item Implementing Prioritized Experience Replay for more efficient sampling
\item Increasing training steps to 10-50M for better convergence
\item Using Noisy Networks for more effective exploration
\item Adding Rainbow DQN extensions (C51, N-step returns)
\item Using distributed training for faster convergence
\item Train longer. 2M steps clearly isn't enough; 10M would give the network more gradient updates per parameter.
\item Switch to a Noisy Network for exploration instead of $\epsilon$-greedy. This replaces random action selection with learned parametric noise in the network weights, which provides state-dependent exploration and often converges faster.
\item Use N-step returns. Instead of bootstrapping after one step, accumulate actual rewards over $n$ steps. This propagates reward information faster and relies less on a possibly inaccurate value estimate (the target is written out after this list).
\item Add an SB3 DQN baseline trained identically. This would isolate whether the instability comes from the algorithm or my implementation.
\end{itemize}
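
For reference, the $n$-step target the N-step item refers to is
\begin{equation}
y_t^{(n)} = \sum_{k=0}^{n-1} \gamma^k r_{t+k} + \gamma^n \max_{a'} Q_{\text{target}}(s_{t+n}, a'),
\end{equation}
which reduces to the standard one-step target for $n=1$.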

\section{Conclusion}

This project successfully implemented a Dueling Double DQN agent for playing Space Invaders from raw pixel inputs. The agent achieved a best average score of 32.50 at 1.2M training steps, representing a 6.5x improvement over random agents ($\sim$5). The implementation highlights the effectiveness of deep reinforcement learning for Atari games and provides a solid foundation for exploring more advanced algorithms.

The DQN algorithm, while relatively simple, remains a powerful approach for discrete action space problems. The key innovations of experience replay, target networks, and Dueling architecture are crucial for stable training and improved performance. The use of Double DQN helped reduce overestimation bias, though performance fluctuation remains an issue. Future work could explore Prioritized Experience Replay, longer training schedules, and additional Rainbow DQN extensions to further improve performance and training stability.
I built a Dueling Double DQN with PER for Space Invaders from scratch. After 2M steps of training, it learned to score 32.50---better than random, but far below published baselines and still bouncing around wildly. The Dueling architecture, Double Q-learning, and PER each contributed to the learning that did happen, but 2M steps wasn't enough for convergence, and the combination of $\epsilon$-greedy exploration with PER's non-stationary sampling distribution made things unstable. The codebase now supports parallel training, AMP mixed precision, and torch.compile, so scaling to 10M+ steps would be straightforward. Getting SB3 baselines and trying Noisy Nets or N-step returns are the logical next steps.

\section*{References}

\begin{enumerate}
\item Mnih, V., et al. (2015). Human-level control through deep reinforcement learning. \textit{Nature}, 518(7540), 529--533.
\item Van Hasselt, H., et al. (2016). Deep Reinforcement Learning with Double Q-learning. \textit{AAAI}.
\item Wang, Z., et al. (2016). Dueling Network Architectures for Deep Reinforcement Learning. \textit{ICML}.
\item Schaul, T., et al. (2016). Prioritized Experience Replay. \textit{ICLR}.
\item Bellemare, M. G., et al. (2013). The Arcade Learning Environment: An Evaluation Platform for General Agents. \textit{JAIR}.
\item Mnih, V., Kavukcuoglu, K., Silver, D., et al.\ (2015). Human-level control through deep reinforcement learning. \textit{Nature}, 518(7540), 529--533.
\item Van Hasselt, H., Guez, A., and Silver, D.\ (2016). Deep Reinforcement Learning with Double Q-learning. \textit{AAAI}.
\item Wang, Z., Schaul, T., Hessel, M., et al.\ (2016). Dueling Network Architectures for Deep Reinforcement Learning. \textit{ICML}.
\item Schaul, T., Quan, J., Antonoglou, I., and Silver, D.\ (2016). Prioritized Experience Replay. \textit{ICLR}.
\item Bellemare, M. G., Naddaf, Y., Veness, J., and Bowling, M.\ (2013). The Arcade Learning Environment: An Evaluation Platform for General Agents. \textit{JAIR}, 47, 253--279.
\item Hessel, M., Modayil, J., Van Hasselt, H., et al.\ (2018). Rainbow: Combining Improvements in Deep Reinforcement Learning. \textit{AAAI}.
\end{enumerate}

\end{document}