% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} 
%% In your camera-ready you should use the 'accepted' parameter. This shows the authors and how an accepted paper will look like. The footer is 'Acccepted for X'. In the final version, the proceedings chairs will add the page numbers for PMLR and the final footer will be 'Proceedings of X'.
%
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

%% beginning of transcription of abaisero.sty

\usepackage{amsmath}
\usepackage{amssymb}

% math commands

\newcommand\naturalset{\mathbb{Z}}
\newcommand\realset{\mathbb{R}}
\newcommand\kstar{^{*}}
\newcommand\kplus{^{+}}

\DeclareMathOperator*\softmax{softmax}
\DeclareMathOperator*\softmin{softmin}
\DeclareMathOperator\sign{sign}

% linalg commands

\DeclareMathOperator*\diag{diag}
\DeclareMathOperator*\rank{rank}
\DeclareMathOperator*\trace{tr}

\DeclareMathOperator*\colspace{col}
\DeclareMathOperator*\nullspace{ker}
\DeclareMathOperator*\spanspace{span}

\newcommand\T{^\top}
\newcommand\I{^{-1}}
\newcommand\PI{^{+}}
\newcommand\IT{^{-\top}}
\newcommand\PIT{^{+\top}}

% optim commands

\newcommand\opt{^{*}}
\DeclareMathOperator*\argmax{argmax}
\DeclareMathOperator*\argmin{argmin}

% stats commands

\DeclareMathOperator\Cov{\mathbb{C}}
\DeclareMathOperator\DKL{{D_\text{KL}}}
\DeclareMathOperator\Ent{\mathbb{H}}
\DeclareMathOperator\Exp{\mathbb{E}}
\DeclareMathOperator\Ind{\mathbb{I}}
\DeclareMathOperator\KL{KL}
\DeclareMathOperator\MI{\mathbb{I}}
\DeclareMathOperator\Var{\mathbb{V}}

% dists commands

\newcommand\Categorical{\operatorname{Categorical}}
\newcommand\Dirichlet{\operatorname{Dirichlet}}
\newcommand\Normal{\operatorname{Normal}}
\newcommand\Uniform{\operatorname{Uniform}}

% ml commands

\newcommand\data{{\mathcal{D}}}
\newcommand\loss{\mathcal{L}}
\DeclareMathOperator\nll{nll}
\DeclareMathOperator\mse{MSE}

% rl commands

\newcommand\aset{\mathcal{A}}
\newcommand\bset{\mathcal{B}}
\newcommand\hset{\mathcal{H}}
\newcommand\oset{\mathcal{O}}
\newcommand\rset{\mathcal{R}}
\newcommand\sset{\mathcal{S}}

\newcommand\dfn{\mathrm{D}}
\newcommand\gfn{\mathrm{G}}
\newcommand\ofn{\mathrm{O}}
\newcommand\rfn{\mathrm{R}}
\newcommand\tfn{\mathrm{T}}

\newcommand\nohistory{\varepsilon}

\newcommand\policy{\pi}

\newcommand\qpolicy{Q^\policy}
\newcommand\qmodel{\hat Q}

\newcommand\vpolicy{V^\policy}
\newcommand\vmodel{\hat V}

\newcommand\upolicy{U^\policy}
\newcommand\umodel{\hat U}

% misc options

\newcommand\iter[1]{^{(#1)}}

%% end of abaisero.sty

\usepackage{todonotes} % TODO remove eventually
\usepackage[inline]{enumitem}
\usepackage{amsthm}
\usepackage{cleveref}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{xr}
\externaldocument{baisero_636}

% note: amsthm must be loaded before cleveref, but the theorems must be defined after cleveref.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% useful for pseudocode lines which are too long
\newcommand{\algparbox}[1]{\parbox[t]{\dimexpr\linewidth-\algorithmicindent}{#1\strut}}

\newcommand\qset{\mathcal{Q}}
\newcommand\uset{\mathcal{U}}
\newcommand\Ppolicy{P_{\policy}}
\newcommand\Bpolicy{B_{\policy}}
\newcommand\Bpolicyopt{B_{\policy\opt}}

% To make just enough space for some of the longer equation, I've renamed "stop" to "SG" for "stop-gradient" (I think I've seen this somewhere before)
\DeclareMathOperator{\Stop}{SG}
\newcommand\qloss{\loss_{\qmodel}}
\newcommand\uloss{\loss_{\umodel}}

% \renewcommand\paragraph[1]{\noindent\textbf{#1}\;}
\let\oldqmodel\qmodel
\renewcommand\qmodel{\smash{\oldqmodel}}
\let\oldumodel\umodel
\renewcommand\umodel{\smash{\oldumodel}}
\newcommand\pmodel{\hat\policy}

\newcommand\envlabel[1]{\textbf{#1}}
\newcommand\heavenhell{\envlabel{Heaven-Hell}}
\newcommand\heavenhellthree{\envlabel{Heaven-Hell-3}}
\newcommand\heavenhellfour{\envlabel{Heaven-Hell-4}}
\newcommand\carflag{\envlabel{Car-Flag}}
\newcommand\cleaner{\envlabel{Cleaner}}
\newcommand\gvmemoryfourrooms{\envlabel{GV-MemoryFourRooms-7x7}}

\newcommand\algolabel[1]{\textbf{#1}}
\newcommand\dqn{\algolabel{DQN}}
\newcommand\adqn{\algolabel{ADQN}}
\newcommand\adqnvr{\algolabel{ADQN-VR}}
\newcommand\adqnstate{\algolabel{ADQN-State}}
\newcommand\adqnstatevr{\algolabel{ADQN-State-VR}}

\title{Asymmetric DQN for Partially Observable Reinforcement Learning\\(Supplementary Material)}


% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Important:  case of equal contributions, we strongly recommend to NOT show it in this part of the paper, but rather describe it in the appropriate section at the end of the paper "Author Contribution", where you have more space to describe how each author contributed.
%
% Add authors
% Remember to use the order convention "First/Given name" "Last/Family name", e.g. John Smith, Hanako Yamada, Marco Rossi, Wei Zhang
\author[1]{\href{mailto:<baisero.a@northeastern.edu>?Subject=Your UAI 2022 paper}{Andrea~Baisero}{}}
\author[1]{Brett~Daley}
\author[1]{Christopher~Amato}
% Add affiliations after the authors
\affil[1]{%
    Khoury College of Computer Sciences\\
    Northeastern University\\
    Boston, Massachusetts, USA
}
  
\begin{document}

\onecolumn
\maketitle
\appendix

\section{Proofs}

\subsection{Proof of Lemma~\ref{thm:bpolicy:q}}\label{sec:proof:thm:bpolicy:q}

\begin{proof}
	%
	We first note that $\Bpolicy$ may also be expressed as $\Bpolicy\colon Q \mapsto R+\gamma\Ppolicy Q$, where $\Ppolicy\colon\qset\to\qset$ is the linear operator defined by $\Ppolicy Q(h, a) \doteq \Exp_{o\mid h,a}\left[ Q(hao, \policy(hao)) \right]$, and has operator $\infty$-norm $\| \Ppolicy \| = 1$.
	%
	Next, we show that $\Bpolicy$ is a $\gamma$-contraction in $\infty$-norm, i.e., for any two $Q, Q'\in\qset$, the following inequality holds:
	%
	\begin{align}
		%
		\| \Bpolicy Q - \Bpolicy Q' \| & = \| R + \gamma \Ppolicy Q - R - \gamma \Ppolicy Q' \| \nonumber \\
		%
		                               & = \gamma \| \Ppolicy \left( Q - Q' \right) \| \nonumber          \\
		%
		                               & \leq \gamma \| \Ppolicy \| \| Q - Q' \| \nonumber                \\
		%
		                               & = \gamma \| Q - Q' \| \,.
		%
	\end{align}

	Therefore, we conclude that $\Bpolicy$ has a unique fixed point which is $\qpolicy$ by definition.
	%
\end{proof}

\subsection{Proof of Lemma~\ref{thm:b:q}}\label{sec:proof:thm:b:q}

\begin{proof}
	%
	We show that $B$ is a $\gamma$-contraction in $\infty$-norm, i.e., for any two $Q, Q'\in\qset$, the following inequality holds:
	%
	\begin{align}
		%
		\| BQ - BQ' \| & = \max_{h, a} \left\lvert R(h, a) + \gamma\Exp_{o\mid h, a} \max_{a'} Q(hao, a') - R(h, a) - \gamma\Exp_{o\mid h, a} \max_{a'} Q'(hao, a') \right\rvert \nonumber   \\
		%
		               & = \gamma \max_{h, a} \left\lvert \Exp_{o\mid h, a}\left[ \max_{a'} Q(hao, a') \right] - \Exp_{o\mid h,a}\left[ \max_{a'} Q'(hao, a') \right] \right\rvert \nonumber \\
		%
		               & = \gamma \max_{h, a} \left\lvert \Exp_{o\mid h, a}\left[ \max_{a'} Q(hao, a') - \max_{a'} Q'(hao, a') \right] \right\rvert \nonumber                                \\
		%
		               & \leq \gamma \max_{h, a} \Exp_{o\mid h, a}\left[ \left\lvert \max_{a'} Q(hao, a') - \max_{a'} Q'(hao, a') \right\rvert \right] \nonumber                             \\
		%
		               & \leq \gamma \max_{h,a,o} \left\lvert \max_{a'} Q(hao, a') - \max_{a'} Q'(hao, a') \right\rvert \nonumber                                                            \\
		%
		               & \leq \gamma \max_{h,a,o,a'} \left\lvert Q(hao, a') - Q'(hao, a') \right\rvert \nonumber                                                                             \\
		%
		               & = \gamma \| Q - Q' \| \,.
		%
	\end{align}

	Therefore, we conclude that $B$ has a unique fixed point which is $Q\opt$ by definition.
	%
\end{proof}

\subsection{Proof of Lemma~\ref{thm:bpolicy:u}}\label{sec:proof:thm:bpolicy:u}

\begin{proof}
	%
	We first note that $\Bpolicy$ may also be expressed as $\Bpolicy\colon U \mapsto R+\gamma\Ppolicy U$, where $\Ppolicy\colon\uset\to\uset$ is the linear operator defined by $\Ppolicy U(h, s, a) \doteq \Exp_{s', o\mid s,a}\left[ U(hao, s', \policy(hao)) \right]$, and has operator $\infty$-norm $\| \Ppolicy \| = 1$.
	%
	Next, we show that $\Bpolicy$ is a $\gamma$-contraction in $\infty$-norm, i.e., for any two $U, U'\in\uset$, the following inequality holds:
	%
	\begin{align}
		%
		\| \Bpolicy U - \Bpolicy U' \| & = \| R + \gamma \Ppolicy U - R - \gamma \Ppolicy U' \| \nonumber \\
		%
		                               & = \gamma \| \Ppolicy \left( U - U' \right) \| \nonumber          \\
		%
		                               & \leq \gamma \| \Ppolicy \| \| U - U' \| \nonumber                \\
		%
		                               & = \gamma \| U - U' \| \,.
		%
	\end{align}

	Therefore, we conclude that $\Bpolicy$ has a unique fixed point which is $\upolicy$ by definition.
	%
\end{proof}

\subsection{Proof of Lemma~\ref{thm:asym-bellman-equivalence} (Asymmetric Bellman Equivalence)}\label{sec:proof:thm:asym-bellman-equivalence}

\begin{proof}
	%
	We assume $Q=EU$, and show that the following elementwise identity holds,
	%
	\begin{align}
		%
		(EB_{g(Q)}U)(h, a) & = \Exp_{s\mid h}\left[ R(s, a) + \gamma\Exp_{s',o\mid s,a}\left[ U(hao, s', \argmax_{a'} Q(hao, a')) \right] \right] \nonumber \\
		%
		                   & = R(h, a) + \gamma\Exp_{s\mid h}\left[ \Exp_{s',o\mid s,a}\left[ U(hao, s', \argmax_{a'} Q(hao, a')) \right] \right] \nonumber \\
		%
		                   & = R(h, a) + \gamma\Exp_{s',o\mid h,a}\left[ U(hao, s', \argmax_{a'} Q(hao, a')) \right] \nonumber                              \\
		%
		                   & = R(h, a) + \gamma\Exp_{o\mid h,a}\left[ \Exp_{s'\mid hao}\left[ U(hao, s', \argmax_{a'} Q(hao, a')) \right] \right] \nonumber \\
		%
		                   & = R(h, a) + \gamma\Exp_{o\mid h,a}\left[ Q(hao, \argmax_{a'} Q(hao, a')) \right] \nonumber                                     \\
		%
		                   & = R(h, a) + \gamma\Exp_{o\mid h,a}\left[ \max_{a'} Q(hao, a') \right] \nonumber                                                \\
		%
		                   & = BQ(h, a) \,.
		%
	\end{align}
	%
	Therefore, $EB_{g(Q)}U = BQ$.
	%
\end{proof}

\subsection{Proof of Theorem~\ref{thm:aql:optimality} (AQL Optimality)}\label{sec:aql_proof}

\begin{proof}

	% \begin{align} Y_k &\doteq ,\\ Z_k &\doteq . \end{align} Here, $w_k \in
	% \uset$ and $v_k \in \qset$ are zero-mean noise processes that represent the
	% randomness in the environment and action selection at iteration $k$. AQL
	% then conducts the following updates based on the stochastic targets $Y_k$
	% and $Z_k$:
	Let $(h_k,s_k,a_k)$ denote the history, state, and action visited at the
	$k$-th iteration of the AQL algorithm. To facilitate our analysis, we would
	like to remove the explicit conditional updates in
	\Cref{eq:aql_update_u,eq:aql_update_q}. We define the binary random process
	$\chi_k \in \uset$ such that
	% 
	\begin{align}
		% 
		\chi_k(h,s,a) & = \begin{cases} 1 & \text{\ if\ } (h,s,a) = (h_k,s_k,a_k)
              \\ 0 & \text{\ otherwise}                    \\\end{cases} \,.
		% 
	\end{align}
	% 
	Using elementwise multiplication and division, the AQL updates in
	\Cref{eq:aql_update_u,eq:aql_update_q} can be written as
	% 
	\begin{align}
		% 
		U_{k+1} & \gets U_k + \alpha_k \chi_k (B_{g(Q_k)} U_k + w_k - U_k) \,,   \\
		% 
		Q_{k+1} & \gets Q_k + \alpha_k (E \chi_k) (E B_{g(Q_k)} U_k + v_k - Q_k)
		\,. \label{eq:aql_q_async_noncontraction}
		% 
	\end{align}
	% 
	We note that the noise processes $w_k$ and $v_k$ are not statistically
	independent because they are computed using a shared transition, which
	guarantees that $v_k = E w_k$. This allows us to prove that $U_k$ and $Q_k$
	remain mutually consistent after each update, i.e., $Q_k = EU_k$, which we
	show by induction. From the assumed initialization in the theorem, the base
	case $Q_0 = E U_0$ is satisfied. Assume that $Q_k = E U_k$ holds for the
	inductive hypothesis. It follows that
	%
	\begin{align}
		%
		E U_{k+1} & = E(U_k + \alpha_k \chi_k (B_{g(Q_k)} U_k + w_k - U_k))
		\nonumber                                                               \\
		%
		          & = E U_k + \alpha_k (E \chi_k) (E B_{g(Q_k)} U_k + E w_k - E
		U_k) \nonumber                                                          \\
		%
		          & = Q_k + \alpha_k (E \chi_k) (E B_{g(Q_k)} U_k + v_k - Q_k)
		\nonumber                                                               \\
		%
		          & = Q_{k+1} \,,
		%
	\end{align}
	%
	and therefore $U_k$ and $Q_k$ are mutually consistent for all $k \geq 0$. By
	\Cref{thm:asym-bellman-equivalence}, \Cref{eq:aql_q_async_noncontraction}
	reduces to
	%
	\begin{equation}
		%
		Q_{k+1} = Q_k + \alpha_k (E \chi_k) (B Q_k + v_k - Q_k) \,.
		%
	\end{equation}
	%
	Now let $p_k(h,s,a) \in [0,1]$ be the probability that $(h,s,a) =
		(h_k,s_k,a_k)$ conditioned on iteration $k-1$.
	% $\chi_k(h,s,a)=\psi_k(h,a)=1$ if $(h,s,a) = (h_k,s_k,a_k)$, and
	% $\chi_k(h,s,a)=\psi_k(h,a)=0$ otherwise.
	Additionally, let $\mathbf{1}$ denote vectors whose components are all equal
	to one.
	% $\forall (h,s,a) \in \hset \times \sset \times \aset$.
	We can equivalently express \Cref{eq:aql_update_u,eq:aql_update_q} in the
	form \begin{align}
		%
		%
		U_{k+1} & = (\mathbf{1} - \alpha_k p_k) U_k + \alpha_k p_k (B_{g(Q_k)} U_k
		+ w'_k) \,, \label{eq:aql_async_u}
		\\
		% 
		Q_{k+1} & = (\mathbf{1} - \alpha_k E p_k) Q_k + \alpha_k E p_k (B Q_k +
		v'_k) \,, \label{eq:aql_async_q}
		\\
		%
		\intertext{where}
		%
		w'_k    & \doteq w_k + \left(\frac{\chi_k}{p_k} - \mathbf{1}\right)
		(B_{g(Q_k)} U_k + w_k - U_k) \,,                                           \\
		%
		v'_k    & \doteq v_k + \left(\frac{E \chi_k}{E p_k} - \mathbf{1}\right) (B
		Q_k + v_k - Q_k) \,.
		%
	\end{align}
	%
	It can be verified that $\Exp\left[ w'_k \right] = \Exp\left[ v'_k \right] =
		0$ and that the conditional variances of $w'_k$ and $v'_k$ are bounded such
	that Proposition~4.4 of \citet{bertsekas_neuro-dynamic_1995} applies given
	the conditions on $\alpha_k$. It follows that $Q_k$ converges with
	probability 1 to $Q\opt$, the unique fixed point of the contraction mapping
	$B$ (\Cref{thm:b:q}).

	% Since the conditional variance is still bounded (need to show this), the result of \Cref{thm:aql:optimality} still holds, and we determine that AQL converges even when updates are performed in any arbitrary order, per \Cref{eq:aql_update_u,eq:aql_update_q}.

	% The remainder of the proof closely follows that of \Cref{thm:aavi:optimality}, with some additional effort to handle the stochastic updates.
	% We start with \Cref{eq:aql_update_q} and invoke \Cref{thm:asym-bellman-equivalence}:
	% \begin{align}
	%     Q_{k+1}
	%     &= (1-\alpha_k) Q_k + \alpha_k (E B_{g(Q_k)} U_k + v_k) \\
	%     &= (1-\alpha_k) Q_k + \alpha_k (B Q_k + v_k)
	%     % &= (1-\beta_k) Q_k + \beta_k (E ((1-\alpha_k) U_k + \alpha_k (B_{g(Q_k)} U_k + w_k)) + v_k) \\
	%     % &= (1-\beta_k) Q_k + (1-\alpha_k) \beta_k E U_k + \alpha_k \beta_k E B_{g(Q_k)} U_k + \alpha_k \beta_k E w_k + \beta_k v_k \\
	%     % &= (1-\beta_k) Q_k + (1-\alpha_k) \beta_k Q_k + \alpha_k \beta_k E B_{g(E U_k)} U_k + \alpha_k \beta_k E w_k + \beta_k v_k \\
	%     % &= (1-\alpha_k\beta_k) Q_k + \alpha_k \beta_k B E U_k + \alpha_k \beta_k E w_k + \beta_k v_k \\
	%     % &= (1-\alpha_k\beta_k) Q_k + \alpha_k \beta_k B Q_k + \alpha_k \beta_k E w_k + \beta_k v_k
	%     .
	% \end{align}
	% The operator $B$ is a contraction mapping with respect to the maximum norm .
	% Furthermore, $v_k$ is a zero-mean noise process with bounded variance.

	The fact that $Q_k \to Q\opt$ guarantees the existence of some iteration
	$k\opt$, with probability $1$, such that $g(Q_k) = \policy\opt$, for $k \geq
		k\opt$. Therefore,
	%
	\begin{equation}
		%
		U_{k+1} = (\mathbf{1} - \alpha_k p_k) U_k + \alpha_k p_k (B_{\policy\opt}
		U_k + w'_k) \,, \quad \forall k \geq k\opt \,.
		%
	\end{equation}
	%
	$B_{\policy\opt}$ is a contraction mapping that admits $U\opt$ as its unique
	fixed point (\Cref{thm:bpolicy:u}). Once again, Proposition~4.4 of
	\citet{bertsekas_neuro-dynamic_1995} applies, and we conclude that $U_k \to
		U\opt$ with probability $1$.

\end{proof}

\section{Model Architectures}\label{sec:architectures}

\begin{figure*}
	%
	\centering
	%
	\begin{subfigure}{.3\linewidth}
		%
		\centering
		%
		\includegraphics[width=\linewidth]{images/architecture.categorical.pdf}
		%
		\caption{Observation, state, and action models for
			\emph{categorical} POMDPs.}\label{fig:architecture:representations:categorical}
		%
	\end{subfigure}
	%
	\qquad\qquad
	%
	\begin{subfigure}{.5\linewidth}
		%
		\centering
		%
		\includegraphics[width=\linewidth]{images/architecture.gv.pdf}
		%
		\caption{Observation, state, and action models for \emph{gridverse} POMDPs.}\label{fig:architecture:representations:gridverse}
		%
	\end{subfigure}

	\begin{subfigure}{.8\linewidth}
		%
		\centering
		%
		\includegraphics[width=\linewidth]{images/architecture.dqn.pdf}
		%
		\caption{DQN and ADQN architectures.  Separate components are used for
			$\qmodel(h, a)$, $\umodel(h, s, a)$, and $\umodel(s, a)$.  In each case,
			the final layer returns an array of values, one for each action
			$a\in\aset$.}\label{fig:architecture:dqn}
		%
	\end{subfigure}
	%
	\caption{For \emph{categorical} and \emph{gridverse} environments, the
		observation, state, and action models $\phi(o)$, $\phi(s)$, and $\phi(a)$,
		are those respectively depicted in
		\Cref{fig:architecture:representations:categorical} and
		\Cref{fig:architecture:representations:gridverse}. For the
		\emph{feature-vector} environments, observation and state models $\phi(o)$
		and $\phi(s)$ are directly provided as feature-vectors by the
		environment itself, while action models $\phi(a)$ are implemented as
		one-hot encodings. \Cref{fig:architecture:dqn} shows the architecture for the DQN
		models $\qmodel(h, a)$, $\umodel(h, s, a)$, and $\umodel(s,
			a)$.}\label{fig:architecture}
	%
\end{figure*}

This section contains the model architectures used by each method when run on each environment of our evaluation (see \Cref{fig:architecture}).  Some components of the architectures are the same for all methods and environments, while some components are domain-specific, e.g., to accommodate the different structures of states and observations in the different environments.
%
In this section, we refer to the \heavenhellthree\ and \heavenhellfour\ environments as \emph{categorical} environments; to the \carflag\ and \cleaner\ environments as \emph{feature-vector} environments; and to the \gvmemoryfourrooms\ environment as a \emph{gridverse} environment. For a thorough description of each environment, refer to Appendix~C of \cite{baisero_unbiased_2022}.

\paragraph{General Architecture}

The architecture components are shown in \Cref{fig:architecture}. Action and observation features are concatenated to form the input to a $128$-dimensional single-layer \emph{gated recurrent unit} (GRU)~\citep{cho_properties_2014}, which acts as the history model $\phi(h)$.  The value NN is a feedforward model which varies in each type of environment.

\paragraph{Categorical POMDPs}

The action, observation, and state feature components are shown in
\Cref{fig:architecture:representations:categorical}.  Categorical environments
provide actions, observations, and states, as categorical indices, which we convert to parametric feature vectors using $64$-dimensional embedding models.
%
The value NN model is a $2$-layer feedforward network with $512$ and $256$ nodes,
and ReLU non-linearities.

\paragraph{Feature-Vector POMDPs}

The observations and states provided by the feature-vector environments already
come in a feature-vector form, which we do not process further.  Actions are
modeled as one-hot encodings.
%
The value NN model is a $2$-layer feedforward network with $512$ and $256$ nodes,
and ReLU non-linearities.

\paragraph{Gridverse POMDPs}

The gridverse environments provide observations and states in a dictionary
format containing different fields representing different aspects of the
environment, see Appendix~C of \cite{baisero_unbiased_2022}.  Because
observations and states already contain a lot of relevant information about the
past, we use scalar $1$-dimensional embedding models for the actions.
%
The $3\times 2\times 3$ observations are first processed using an
$8$-dimensional embedding layer, and then flattened, which produces a
$144$-dimensional observation feature $\phi(o)$.
%
The states contain relevant information in different forms, and require a more
complex model.  The \texttt{grid} component is processed using an embedding
layer, which is then stacked with the \texttt{agent\_id\_grid} component, and
processed by a $1$- or $2$-layer feedforward network (this is hyper-parameter
$L$ in \Cref{sec:hpsearch}) with ReLU non-linearities.  All outputs of the
\texttt{grid} and \texttt{agent\_id\_grid} components are then concatenated to form
the overall state feature $\phi(s)$.

\section{Training Details}\label{sec:training}

We perform \emph{fully episodic} training, by which we mean that various aspects of the training involve and are measured based on complete episodes:
%
\begin{itemize}
	%
	\item The replay buffer is more specifically an \emph{episode buffer} which contains full episodes.
	      %
	\item The episode buffer is pre-populated using episodes sampled from a random policy.  Episodes are sampled and inserted into the episode buffer until the episode buffer contains a total of $50k$ timesteps.
	      %
	\item The main training loop alternates between an environment interaction phase and an optimization phase.
	      %
	\item In the environment interaction phase, a full episode is sampled from the environment using an $\epsilon$-greedy policy based on the current $\qmodel$.  The sampled episode is then inserted into the episode buffer.
	      %
	\item Any time the episode buffer exceeds a total of $1M$ timesteps, old episodes are removed until that is no longer the case.
	      %
	\item In the optimization phase, a variable number of optimization steps are performed.  Optimization steps are performed until the total number of timesteps used for training exceeds the number of total timesteps sampled from the real environment by a given factor.  This factor is determined by hyper-parameter $F$ (usually $8$ or $16$).  In each optimization step, a number of full episodes are sampled from the episode buffer, and used fully to form the minibatch of transitions used for the optimization step.  Because each episode may contain a variable number of transitions, that means that the size of the minibatch of transitions used for optimization is variable.  The number of episodes which is sampled is determined by hyper-parameter $B$ (usually $1$).
	      %
	\item The whole process is repeated until a given total number of timesteps have been sampled from the environment.
	      %
\end{itemize}

Although this form of training introduces correlations between the transitions in a minibatch $\{ (h, s, a, r, s', o)_i \}_{i=1}^N$, it is also significantly more efficient than if transitions were completely i.i.d., due to the shared computation between the features of consecutive histories.

\section{Hyperparameters and Grid Search}\label{sec:hpsearch}

For each combination of control problem and method, we perform a separate grid-search over hyper-parameters, and find the combination of hyper-parameters which results in the best performance, in each case using statistics aggregated over $20$ independent runs.  The hyper-parameter grid is domain dependent.  For \heavenhellthree, \heavenhellfour, \carflag, and \cleaner, we search over the following:
%
\begin{itemize}
	%
	\item $\alpha\in\{0.0001, 0.0003\}$, the learning rate.
	      %
	\item $N_\epsilon\in \{1M, 2M\}$, the number of timesteps it takes for $\epsilon$ to decay linearly from its initial value of $1.0$ to its final value of $0.1$.
	      %
	\item $F\in \{8, 16\}$, the ratio between number of training timesteps and number of simulation timesteps, used to determine the frequency of optimization steps.
	      %
	\item $B\in \{1, 2\}$, the number of episodes sampled from the episode buffer for each optimization step.
	      %
\end{itemize}

For \gvmemoryfourrooms, we set $\alpha = 0.0001$, and search over the following:
%
\begin{itemize}
	%
	\item $N_\epsilon\in \{1M, 2M, 4M\}$, the number of timesteps it takes for $\epsilon$ to decay linearly from its initial value of $1.0$ to its final value of $0.1$.
	      %
	\item $F\in \{8, 16\}$, the ratio between number of training timesteps and number of simulation timesteps, used to determine the frequency of optimization steps.
	      %
	\item $B\in \{2, 4\}$, the number of episodes sampled from the episode buffer for each optimization step.
	      %
	\item $L\in \{1, 2\}$, the number of final linear layers in the state and observation representation models.
	      %
\end{itemize}

Factoring in the $4$ control problems with $2^4$ combinations of hyper-parameters, and $1$ control problem with $3\cdot 2^3$ combinations of hyper-parameters, $5$ methods for each, and $20$ independent runs for each combination, we obtain a total number of $8800$ independent runs necessary to present all the results of this work.
%
\Cref{tab:hparams} shows the hyper-parameters chosen from each grid search, which are the ones used for the results depicted in \Cref{fig:performance}.

\begin{table}
	%
	\centering
	%
	\caption{Hyper-parameter grid search results.}\label{tab:hparams}
	%
	\begin{tabular}{lllllll}
		%
		\toprule
		%
		Domain             & Method       & $\alpha$ & $N_\epsilon$ & $F$  & $B$ & $L$ \\
		%
		\midrule
		%
		\heavenhellthree   & \dqn         & $0.0003$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqn        & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnvr      & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnstate   & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnstatevr & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		\midrule
		%
		\heavenhellfour    & \dqn         & $0.0003$ & $2M$         & $16$ & $1$ & --  \\
		%
		                   & \adqn        & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnvr      & $0.0001$ & $2M$         & $16$ & $1$ & --  \\
		%
		                   & \adqnstate   & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnstatevr & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		\midrule
		%
		\carflag           & \dqn         & $0.0003$ & $1M$         & $16$ & $2$ & --  \\
		%
		                   & \adqn        & $0.0003$ & $1M$         & $16$ & $1$ & --  \\
		%
		                   & \adqnvr      & $0.0003$ & $1M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnstate   & $0.0003$ & $1M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnstatevr & $0.0003$ & $1M$         & $16$ & $1$ & --  \\
		%
		\midrule
		%
		\cleaner           & \dqn         & $0.0001$ & $2M$         & $8$  & $2$ & --  \\
		%
		                   & \adqn        & $0.0001$ & $1M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnvr      & $0.0001$ & $2M$         & $16$ & $2$ & --  \\
		%
		                   & \adqnstate   & $0.0001$ & $1M$         & $8$  & $2$ & --  \\
		%
		                   & \adqnstatevr & $0.0001$ & $2M$         & $8$  & $2$ & --  \\
		%
		\midrule
		%
		\gvmemoryfourrooms & \dqn         & $0.0001$ & $4M$         & $16$ & $2$ & $1$ \\
		%
		                   & \adqn        & $0.0001$ & $1M$         & $16$ & $2$ & $1$ \\
		%
		                   & \adqnvr      & $0.0001$ & $1M$         & $16$ & $2$ & $1$ \\
		%
		                   & \adqnstate   & $0.0001$ & $4M$         & $16$ & $2$ & $1$ \\
		%
		                   & \adqnstatevr & $0.0001$ & $4M$         & $16$ & $2$ & $1$ \\
		%
		%
		\bottomrule
		%
	\end{tabular}
	%
\end{table}

\bibliography{baisero_636}

\end{document}
