\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions

\usepackage{amsthm,amsmath,amssymb,amsfonts,exscale,latexsym,float,eucal}
\usepackage{xspace}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm,algpseudocode}
\usepackage{multirow}
\usepackage{wrapfig}

\usepackage{url}
\usepackage{hyperref}
\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    citecolor=cyan,
    filecolor=green,      
    urlcolor=magenta,
}

\usepackage{Definitions}

% Cross-reference shorthands: each expands to the reference word, a
% non-breaking space (~), and \ref for the given label.
% Algorithm reference, capital.
\def\Algref#1{Algorithm~\ref{#1}}
% Figure reference, lower-case. For use mid-sentence.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}


% \usepackage{algorithmic}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

% Bibliography spacing: two length registers that are applied inside
% thebibliography by the redefinition below.
\newlength{\bibitemsep}\setlength{\bibitemsep}{.2\baselineskip plus .05\baselineskip minus .05\baselineskip}
\newlength{\bibparskip}\setlength{\bibparskip}{0pt}
% Save the original environment start so the redefinition can delegate to it.
\let\oldthebibliography\thebibliography
% Start the bibliography as before, then override \parskip and \itemsep.
% NOTE(review): \parskip receives \bibitemsep and \itemsep receives
% \bibparskip; the names look crossed -- confirm this pairing is intended.
\renewcommand\thebibliography[1]{%
  \oldthebibliography{#1}%
  \setlength{\parskip}{\bibitemsep}%
  \setlength{\itemsep}{\bibparskip}%
}
\usepackage{xr} 

\makeatletter
% \addFileDependency{<file>}: writes "(<file>)" to the terminal/log, passes
% <file> to \@addtofilelist, and writes "No file <file>." when \IfFileExists
% does not find it.
% Presumably the "(<file>)" line lets build tools such as latexmk pick up the
% file as a dependency -- TODO confirm against the build setup.
% Every body line ends with % so that the macro expands to no stray space
% tokens, whatever mode it is called in.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<name>}: calls \externaldocument{<name>} (xr package,
% makes the labels of <name> referenceable here) and then registers both
% <name>.tex and <name>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{Zhang_322}


\title{Energy-based Predictive Representations for Partially Observed Reinforcement Learning (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, 2, $^\star$]{\href{mailto:<tianjunz@berkeley.edu>?Subject=Your UAI 2023 paper}{Tianjun Zhang}}
\author[1, 3, $^\star$]{\href{mailto:<tongzheng@utexas.edu>?Subject=Your UAI 2023 paper}{Tongzheng Ren}}
\author[4]{Chenjun Xiao}
\author[2]{Wenli Xiao}
\author[2]{\\Joseph E. Gonzalez}
\author[1, 4]{Dale Schuurmans}
\author[1, 5]{\href{mailto:<bodai@google.com>?Subject=Your UAI 2023 paper}{Bo Dai}}
% Add affiliations after the authors
\affil[1]{%
    Google Research, Brain Team
}
\affil[2]{%
    Department of EECS, UC Berkeley
}
\affil[3]{
    Department of Computer Science, UT Austin
  }
\affil[4]{Department of Computer Science, University of Alberta}
\affil[5]{School of Computational Science and Engineering, Georgia Tech}
  
\begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle

\appendix


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section{Derivation of the Random Feature in \eq{eq:random_expansion}}\label{appendix:rf_deriviation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5

We have that
\begin{align}
    P(o_{t+1}|x_t, a_t) = & p(o_{t+1}) \exp\left(f(x_t, a_t)^\top \left(g(o_{t+1})  + \lambda f(x_t, a_t)\right)\right)\\
    = & p(o_{t+1}) \exp\left(\left(\lambda-\frac{1}{2}\right) \|f(x_t, a_t)\|^2\right) \exp\left(-\frac{\|g(o_{t+1})\|^2}{2}\right)\exp\left(\frac{\|f(x_t, a_t) + g(o_{t+1})\|^2}{2}\right),
\end{align}
where we have that
\begin{align}
    & \exp\left(\frac{\|f(x_t, a_t) + g(o_{t+1})\|^2}{2}\right)\\
    = & (2\pi)^{-d/2}\exp\left(\frac{\|f(x_t, a_t) + g(o_{t+1})\|^2}{2}\right) 
    \int \exp\left(-\frac{\|\omega - (f(x_t, a_t) + g(o_{t+1}))\|^2}{2}\right) d \omega\\
    = & (2\pi)^{-d/2} \int \exp\left(-\frac{\|\omega\|^2}{2} + \omega^\top( f(x_t, a_t)+ g(o_{t+1}))\right) d\omega\\
    = & \mathbb{E}_{\omega\sim\mathcal{N}(0, I_d)} \left[\exp\left(\omega^\top f(x_t, a_t)\right) \exp\left(\omega^\top g(o_{t+1})\right)\right],
\end{align}
which concludes the proof for \eqref{eq:random_expansion}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section{Observable LQG as~\algabb}\label{appendix:lqg}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5

Following the standard notation, the dynamics of the Linear-Quadratic Gaussian (LQG) system are defined as
\begin{align}
    s_t = & As_{t-1} + Ba_{t} + w_{t-1},\\ 
    o_t = & Cs_{t-1} + z_t,
\end{align}
where $w_t$ and $z_t$ are Gaussian noise. Define the matrix 
\[
G_L = [C^\top, \rbr{CA}^\top,\ldots, \rbr{CA^{L-1}}^\top ]^\top,
\]
and reduced observation 
\[
\otil_t = o_t - z_t - C\sbr{\sum_{k=0}^{t-2} A^kBa_{t-k-1} + \sum_{k=0}^{t-2} A^kw_{t-k-2} }.
\]
By the observability condition of LQG, $G_L$ has full column rank, so one can identify $s_0$ by
\[
s_0 = \rbr{G_L^\top G_L}^{-1}\sum_{j=1}^L \rbr{A^\top}^{j-1} C^\top \otil_j.
\]
Therefore, we have
\begin{align}
    s_1 =& As_0 + Ba_{1} + w_0 = A\rbr{\rbr{G_L^\top G_L}^{-1}\sum_{j=1}^L \rbr{A^\top}^{j-1} C^\top \otil_j} + Ba_1 + w_0,\\
    s_2 =& As_1 + Ba_2 + w_1 = A^2\rbr{\rbr{G_L^\top G_L}^{-1}\sum_{j=1}^L \rbr{A^\top}^{j-1} C^\top \otil_j} + ABa_1 + Ba_2 + Aw_0 + w_1,\\
    s_{L+1} = &As_L + Ba_{L+1} + w_L = A^{L+1} \rbr{\rbr{G_L^\top G_L}^{-1}\sum_{j=1}^L \rbr{A^\top}^{j-1} C^\top \otil_j} + \sum_{j=0}^L A^{L-j}Ba_{j+1} + \sum_{j=0}^L A^{L-j} w_j,\\
    o_{L+1} =& Cs_{L+1} + z_{L+1} = CA^{L+1} \rbr{\rbr{G_L^\top G_L}^{-1}\sum_{j=1}^L \rbr{A^\top}^{j-1} C^\top \otil_j} + C\sum_{j=0}^L A^{L-j}Ba_{j+1} + C\sum_{j=0}^L A^{L-j} w_j + z_{L+1},
\end{align}
which means that $o_{L+1}$ follows a Gaussian distribution whose mean is a function of the history $x_L = \cbr{\rbr{o_{i-1}, a_i}_{i=1}^{L}}$ and the action $a_{L+1}$, and whose variance is a function of ${\sigma_w}$, $\sigma_z$, and $\rbr{A, B, C}$. Therefore, there exist functions $f_{A, B, C, \sigma_w, \sigma_z}$ and $g_{A, B, C,\sigma_w, \sigma_z}$ such that
\[
g_{A, B, C,\sigma_w, \sigma_z}\rbr{o_{L+1}} = f_{A, B, C,\sigma_w, \sigma_z}\rbr{x_L, a_{L+1}} + \xi,\quad \xi\sim\Ncal\rbr{0, \Ib}. 
\]

On the other hand, setting $\lambda = -\frac{1}{2}$ and $p(o) = \Ncal\rbr{0, \Ib}$ in~\eq{eq:energy_transition}, we obtain
\[
p(o_{L+1}|x_L, a_{L+1}) \propto \exp\rbr{-\frac{\nbr{g(o_{L+1}) - f(x_L, a_{L+1})}_2^2}{2}},
\]
which reproduces the observable LQG with specific $f_{A, B, C, \sigma_w, \sigma_z}$ and $g_{A, B, C,\sigma_w, \sigma_z}$. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5%%%%%%%%%
\section{Experiment Details}\label{appendix:experiments}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5%%%%%%%%%

\subsection{Online Setting}
In Table~\ref{tab:hyper_online}, we list all the hyperparameters and network architectures used in our experiments. Note that we do not use the additional exploration bonus term in the MuJoCo tasks, but it is very helpful in the DM Control Suite tasks, especially the sparse-reward ones.

For evaluation in MuJoCo, in each evaluation (every 5K steps) we test our algorithm for 10 episodes. We average the results over the last 4 evaluations and 4 random seeds. For Dreamer and Proto-RL, we change their network from a CNN to a 3-layer MLP and disable the image data augmentation (since we test on the state space). The transformer architecture follows that of the Trajectory Transformer~\citep{janner2021offline}, with causal attention.

\begin{table*}[h]
\caption{Hyperparameters used for \algabb in all the environments in MuJoCo and DM Control Suite.}
\setlength\tabcolsep{3.5pt}
\label{tab:hyper_online}
\centering
\begin{tabular}{p{6cm}p{3cm}p{5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
% {lcccccccccccc}
\toprule
& Hyperparameter Value \\ 
\midrule
Bonus Coefficient (MuJoCo) & 0.0 \\
Bonus Coefficient (DM Control) & 5.0 \\
Actor lr & 0.0003 \\
Model lr & 0.0003 \\
Actor Network Size (MuJoCo) & (256, 256) \\
Actor Network Size (DM Control) & (1024, 1024) \\
EPR Embedding Network Size (MuJoCo) & (1024, 1024, 1024) \\
EPR Embedding Network Size (DM Control) & (1024, 1024, 1024) \\
Critic Network Size (MuJoCo) & (1024, 1) \\
Critic Network Size (DM Control) & (1024, 1) \\
Discount & 0.99\\
Target Update Tau & 0.005 \\
Model Update Tau & 0.005 \\
Batch Size & 256 \\
\bottomrule 
\end{tabular}
\end{table*}

\subsection{Learning Curves}
We provide the performance curves for the online DM Control Suite experiments in~\Figref{fig:dm_control}. As the figures show, the proposed~\algabb converges faster and achieves state-of-the-art performance in most of the environments, demonstrating its sample efficiency and its ability to balance exploration and exploitation. We also provide additional curves for the POMDP setting in~\Figref{fig:pomdp}.

\begin{figure*}[h]
    \centering
    \includegraphics[width=\textwidth]{figures/dm_result.png}
    \caption{Performance Curves for online DM Control Suite.}
    \label{fig:dm_control}
\end{figure*}

\begin{figure*}[h]
    \centering
    \includegraphics[width=\textwidth]{figures/pomdp_curve.png}
    \caption{Performance Curves for online POMDP DM Control Suite.}
    \label{fig:pomdp}
\end{figure*}

\subsection{Image-Based Experiments}
\begin{figure*}[h]
    \centering
    \includegraphics[width=0.47\textwidth]{figures/reach.png}
    \vspace{-1em}
    \caption{\textbf{Reach environment:} Using a robot arm to reach a specific position.}
    \label{fig:reach_env}
\end{figure*}
We provide the details of the metaworld image-based experiments here. We first illustrate the reach environment in~\Figref{fig:reach_env}, and then list further experimental details in the tables below.

\begin{table*}[tb]
\caption{Settings of adapted OpenAI Fetch-Reach Environment.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:fetch_reach_env}
\centering
\begin{tabular}{p{6cm}p{3cm}p{5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Hyperparameter Value \\ 
\midrule
Maximum Episode Steps & 50 \\
Reward Type & 'sparse' \\
Observation Size & (3, 64, 64) \\
Fixed Goal Position & (1.27, 0.90, 0.66) \\
\bottomrule 
\end{tabular}
\end{table*}

\begin{table*}[tb]
\caption{Hyperparameters used for EPR in FetchReachImage.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:hyper_epr_image}
\centering
\begin{tabular}{p{6cm}p{3cm}p{5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Hyperparameter Value \\ 
\midrule
Bonus Coefficient (MuJoCo) & 0.0 \\
Bonus Coefficient (DM Control) & 5.0 \\
Actor lr & 0.0003 \\
Model lr & 0.0003 \\
Actor Network Size (MuJoCo) & (256, 256) \\
Actor Network Size (DM Control) & (1024, 1024) \\
EPR Embedding Network Size (MuJoCo) & (1024, 1024, 1024) \\
EPR Embedding Network Size (DM Control) & (1024, 1024, 1024) \\
Critic Network Size (MuJoCo) & (1024, 1) \\
Critic Network Size (DM Control) & (1024, 1) \\
Discount & 0.99\\
Target Update Tau & 0.005 \\
Model Update Tau & 0.005 \\
Batch Size & 256 \\
\bottomrule 
\end{tabular}
\end{table*}

\begin{table*}[h]
\caption{Hyperparameters used for SPR in FetchReachImage.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:hyper_spr_image}
\centering
\begin{tabular}{p{6cm}p{3cm}p{5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Hyperparameter Value \\ 
\midrule
lr & 0.0001 \\
Dropout & 0.5 \\
Discount & 0.99 \\
Batch Size & 32 \\
Augmentation & off \\
Target Update Tau & 0.005 \\
Model Update Tau & 0.005 \\
Batch Size & 256 \\
Update & Distributional Q \\
Dueling & True \\
Optimizer & Adam \\
Optimizer: learning rate & 0.0001 \\
Max gradient norm & 10 \\
Priority exponent & 0.5 \\
Noisy nets parameter & 0.5 \\
Min replay size for sampling & 2000 \\
Replay period every & 1 step \\
Updates per step & 2 \\
Multi-step return length & 10 \\
Q network: channels & 32, 64, 64 \\
Q network: filter size & $8 \times 8$, $4 \times 4$, $3 \times 3$ \\
Q network: stride & 4, 2, 1 \\
Q network: hidden units & 256 \\
Non-linearity & ReLU \\
Target network: update period & 1 \\
$\lambda$ (SPR loss coefficient) & 2 \\
K (Prediction Depth) & 5 \\

\bottomrule 
\end{tabular}
\end{table*}

\begin{table*}[h]
\caption{Hyperparameters used for SAC-AE in FetchReachImage.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:hyper_sac_ae_image}
\centering
\begin{tabular}{p{6cm}p{3cm}p{5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Hyperparameter Value \\ 
\midrule
Critic lr & 0.001 \\
Actor lr & 0.001 \\
Discount & 0.99 \\
Batch Size & 128 \\
Critic Q-function soft-update rate $\tau_Q$ & 0.01 \\
Critic encoder soft-update rate $\tau_{enc}$ & 0.05 \\ 
Critic target update frequency & 2 \\
Actor update frequency & 2 \\
Actor standard deviation bounds & $[-10, 2]$ \\
Autoencoder learning rate & 0.001 \\
Temperature learning rate & 0.0001 \\
Temperature Adam's $\beta_1$ & 0.5 \\
Init temperature & 0.1 \\


\bottomrule 
\end{tabular}
\end{table*}


\bibliography{ref}

\end{document}