% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{arriojas_611}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e. the two mandatory
% arguments in reversed order around the optional separator (default `-').
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\hypersetup{hidelinks}
\usepackage{multirow}
\usepackage{subcaption}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% \beginsupplement: restart float numbering for the supplementary material.
% Resets the table, figure and algorithm counters to zero and redefines
% their printed representation so floats are numbered S1, S2, ...
% Every line inside the definition ends with `%' so that the macro expands
% to no stray space tokens, even if it is ever invoked in horizontal mode.
\newcommand{\beginsupplement}{%
        \setcounter{table}{0}%
        \renewcommand{\thetable}{S\arabic{table}}%
        \setcounter{figure}{0}%
        \renewcommand{\thefigure}{S\arabic{figure}}%
        \setcounter{algorithm}{0}%
        \renewcommand{\thealgorithm}{S\arabic{algorithm}}%
     }

\title{Bayesian Inference Approach for Entropy Regularized Reinforcement Learning with Stochastic Dynamics (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:Argenis Arriojas <arriojasmaldonado001@umb.edu>?Subject=Your UAI 2023 paper}{Argenis Arriojas}{}}
\author[1]{Jacob Adamczyk}
\author[2]{Stas Tiomkin}
\author[1]{\href{mailto:Rahul Kulkarni <rahul.kulkarni@umb.edu>?Subject=Your UAI 2023 paper: Bayesian inference ...}{Rahul V Kulkarni}{}}
% Add affiliations after the authors
\affil[1]{%
    Department of Physics\\
    University of Massachusetts Boston\\
    Boston, Massachusetts, USA
}
\affil[2]{%
    Department of Computer Engineering\\
    San Jose State University\\
    San Jose, California, USA
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


\appendix
\beginsupplement
\section{Experimental details}\label{app: Experiment details}

\begin{table}[b!]
    \centering
    \caption{Number of iterations required to find the optimal mapping between original (unconstrained) and policy-equivalent (constrained) dynamics for a large set of arbitrary mazes. For each combination of maze size and number of traps, 1620 mazes were randomly generated, qualitatively resembling the one in Fig.~\ref{fig:stochastic_maze_solved}, with varying wind intensity and values for the inverse temperature $\beta~\in~\left\{2, 5, 20, 50\right\}$. For every maze a solution is found, taking between 2 and 126 iterations to find the corresponding biases. Figure~\ref{fig:random_maze_n_iters_violinplots} offers a detailed faceted visualization where the influence of $\beta$ and wind strength is also evaluated.}
\begin{tabular}{rrrrr}
\toprule
maze size & trap density &  number of replicas & mean iterations & 3 largest numbers of iterations  \\\midrule
\multirow[t]{4}{*}{10} & 0.10 & 1620 & 3.68 & [8, 8, 9] \\
 & 0.15 & 1620 & 3.82 & [8, 8, 9] \\
 & 0.20 & 1620 & 3.84 & [8, 9, 9] \\
 & 0.25 & 1620 & 3.83 & [8, 8, 10] \\\midrule
\multirow[t]{4}{*}{20} & 0.10 & 1620 & 3.59 & [8, 8, 8] \\
 & 0.15 & 1620 & 3.69 & [8, 9, 14] \\
 & 0.20 & 1620 & 3.73 & [8, 9, 13] \\
 & 0.25 & 1620 & 3.80 & [12, 13, 23] \\\midrule
\multirow[t]{4}{*}{30} & 0.10 & 1620 & 3.68 & [12, 13, 13] \\
 & 0.15 & 1620 & 3.80 & [22, 24, 39] \\
 & 0.20 & 1620 & 3.85 & [21, 26, 30] \\
 & 0.25 & 1620 & 4.04 & [21, 71, 126] \\
 \bottomrule
\end{tabular}
\label{tab:random_mazes_summary}
\end{table}


\subsection{Results on stochastic gridworld environment WindyFrozenLake}
First, a modified version of OpenAI Gym's FrozenLake was used \citep{brockman2016openai}, and named WindyFrozenLake. This is a grid world environment with discrete states and actions. There is uncertainty whenever a step is taken, with the wind affecting the transition probabilities between states after executing actions.


Table~\ref{tab:random_mazes_summary} summarizes results for several randomly generated mazes of various sizes and complexities. The proposed method consistently finds the constrained optimization solutions for these mazes. Figure~\ref{fig:random_maze_n_iters_violinplots} presents a more detailed view of the convergence properties of the algorithm, when several parameters are changed. Here we note that as the temperature decreases, more biasing iterations are required to attain the original dynamics. This trend becomes more evident for the more complex mazes with higher wind intensity and number of traps.


\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{FigS1.png}
    \caption{Number of biasing iterations required for solving random windy frozen lake environments, in terms of 4 different variables: size of the maze, wind strength level, trap density and the value of $\beta$ used for the solution. We observe that most of the mazes required between 2 and 6 biasing iterations to converge. In general, we can appreciate that for lower temperatures (higher $\beta$), more iterations are required. For higher temperatures, maze size and trap density have no major impact. For the cases where variables combine to produce difficult environments (strong wind, many traps), a higher number of biasing iterations is required.
    In this visualization we have removed outlier values above 50 iterations.}
    \label{fig:random_maze_n_iters_violinplots}
\end{figure}



\subsection{Experiments on discretized versions of environments with continuous state space}



To demonstrate the usefulness of the method on more complex examples, we proceeded to solve multiple reinforcement learning environments which fall under the classic control category. In this section we present a description of the four continuous Gym environments used. Table~\ref{tab:continuous_envs_experiments} presents a description of the state-space discretization parameters used for these environments, as well as the $\beta$ values used to solve them and the number of biasing iterations required for convergence to the original system transition dynamics.


For each environment, the following steps were followed:
\begin{itemize}
    \item Make rewards instantaneous. This is necessary for CartPole and MountainCar.
    \item Digitize/Discretize state space. In the case of Pendulum, actions also need to be discretized.
    \item Sample transition dynamics and reward function. This is done exhaustively by manually setting the environment state, taking all available actions and observing the resulting state and reward. We sampled 500 repetitions of each combination of state and action.
    \item Choose a $\beta$ parameter and compute the constrained optimization solution by using the estimated dynamics and rewards, and Algorithm~\ref{alg:cap}. At each iteration, the intermediate policies are stored for later evaluation.
    \item Perform evaluation of intermediate policies and optimal policy.
\end{itemize}



A detailed description of each of these four environments is presented in the following subsections.
\begin{figure}[t]
    \centering
    \begin{subfigure}[t]{0.192\textwidth}
    \centering
    \frame{\includegraphics[width=\textwidth]{FigS2a.png}}
    \caption{Pendulum-v0}
    \label{fig:pendulum_env}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.29\textwidth}
    \centering
    \frame{\includegraphics[width=\textwidth]{FigS2b.png}}
    \caption{MountainCar-v0 \citep{MooreAW1990}}
    \label{fig:mountaincar_env}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.29\textwidth}
    \centering
    \frame{\includegraphics[width=\textwidth]{FigS2c.png}}
    \caption{CartPole-v1 \citep{6313077}}
    \label{fig:cartpole_env}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.193\textwidth}
    \centering
    \frame{\includegraphics[width=\textwidth]{FigS2d.png}}
    \caption{Acrobot-v1 \citep{NIPS1995_8f1d4362,JMLR:v16:geramifard15a}}
    \label{fig:acrobot_env}
    \end{subfigure}
    \caption{Reinforcement learning environments with continuous observations, made available by OpenAI under the MIT License \citep{brockman2016openai}. For these environments, the observation space was discretized. See Appendix~\ref{app: Experiment details} and Table~\ref{tab:continuous_envs_experiments}.}
    \label{fig:my_label}
\end{figure}

\begin{table}
    \centering
    \caption{Experiments on environments with continuous observation spaces.
    $|\mathcal{S}|$ denotes the size of the observation state-space after discretization, and $\mathcal{A}$ the corresponding action-space. $|f_m|$ and $|\tau_m|$ are the maximal force and torque available to the agent, respectively. For Pendulum, the continuous action space was discretized to 3 actions. Each environment was solved with a given $\beta$ hyper-parameter as specified. Each of these problems was solved in $k$ iterations for the biasing process.}
\begin{tabular}{lrcrr}
    \toprule
    Environment  & $|\mathcal{S}|$&  $\mathcal{A}$                             &  $\beta$  & $k$     \\\midrule
    Pendulum     & $16^2$     &  $ \{-|\tau_{m}|, 0, +|\tau_{m}| \} $  &   1       &   28    \\
    MountainCar  & $12^2$     &  $ \{-|f_{m}|, 0, +|f_{m}| \} $        &   2       &   38    \\
    CartPole     & $8^4$      &  $ \{-|f_{m}|, +|f_{m}| \} $           &   10      &   29    \\
    Acrobot      & $12^4$     &  $ \{-|\tau_{m}|, 0, +|\tau_{m}| \} $  &   25      &   58    \\
    \bottomrule
\end{tabular}
    \label{tab:continuous_envs_experiments}
\end{table}

\subsubsection{Pendulum}
Shown in Figure~\ref{fig:pendulum_env}. This is a direct control environment. The goal is to swing up a pendulum by using a torque as input control. The 2-dimensional observation space is continuous, consisting of the angular position and the angular velocity of the pendulum. A continuous-valued torque $\tau$ can be applied with $\tau \in [-2, 2]$. The reward function is proportional to the angle of the pendulum, with the lowest reward given in the lower position.

\subsubsection{MountainCar}
Shown in Figure~\ref{fig:mountaincar_env}. In this environment the goal is to reach the flag at the top of a hill. The car power is not enough to climb the hill in one shot, and instead needs to swing back and forth to gain enough energy. The continuous state space is 2-dimensional and consists of the linear position and speed of the car. There are 3 discrete actions available: accelerate left, no acceleration, and accelerate right. Each step has reward $-1$ until the flag is reached, where a reward of $0$ is given and the episode terminates.

\subsubsection{CartPole}
Shown in Figure~\ref{fig:cartpole_env}. A cart can move horizontally on a track, while balancing a pole in a vertical position. The goal is to continue balancing the pole for as long as possible, without moving beyond the edges of the environment. The 4-dimensional observation consists of the linear position and velocity of the cart, and the angular position and velocity of the pole.  A reward of $+1$ is given at each step while the pole is in the upright position. The episode is terminated when the pole reaches a threshold angle, and $0$ reward is given.

\subsubsection{Acrobot}
Shown in Figure~\ref{fig:acrobot_env}. This is a double pendulum system, which is a chaotic system. The goal is to apply torque to the joint between the two pendulums, and make the second pendulum go above a vertical threshold. The observation space has 4 dimensions, with the angular positions and velocities of each joint. All rewards are $-1$, until the goal is attained and $0$ reward is given.

\section{Learning biases in a model-free setting}
\label{app:model-free}

In the main paper we have shown how to obtain the biasing functions for the transition dynamics and rewards to solve the constrained optimization problem. The results presented rely on knowing both the dynamics and rewards beforehand. In this section, we present an algorithm for learning the biasing functions from experience in a model-free setting.


We first note the connections between the optimal soft-value functions $Q(s,a)$ and $V(s)$ discussed in the main text and the dominant eigenvalue ($\rho = e^{-\theta}$), together with the corresponding left eigenvector ($u(s,a)$), of the tilted matrix $\widetilde{P}$. As previously established by \citet{arriojas2021closed}, these connections are given by the following equations:
\begin{align}
Q(s,a)&=-\theta N + \frac{1}{\beta} \log u(s,a)\\
V(s)&= -\theta N + \frac{1}{\beta}\log \sum_a \pi^0(a|s)u(s,a),
\end{align}
where $\theta$ is the so-called bulk free energy and $N$ is the number of steps in the trajectory. In the following, we will work with the dominant eigenvalue ($\rho=e^{-\theta}$) and the corresponding left eigenvector ($u(s,a)$) and derived quantities. The results obtained can be expressed in terms of the optimal value functions using the preceding equations.


We start by considering the eigenvalue equation for the tilted matrix $\widetilde{P}$ and its left eigenvector $u$
\begin{equation}
    \sum_{(s',a')}e^{\beta r(s,a)}\pi^0(a'|s')p(s'|s,a)u(s',a') = \rho u(s,a)
\end{equation}
and after applying biases to $r$ and $p$ we obtain
\begin{equation}
    \sum_{(s',a')}e^{\beta (r(s,a) + \delta(s,a))}\pi^0(a'|s')p(s'|s,a)b(s'|s,a)u_b(s',a') = \rho_b u_b(s,a)
\end{equation}
Then, for some fixed biasing functions $b(s'|s,a)$ and $\delta(s,a)$, and writing $r_b(s,a) = r(s,a) + \delta(s,a)$ for the biased reward, we adapt the update equations from \citet{arriojas2021closed}, resulting in the following update equations that can be used to estimate the corresponding $u_b$ and $\rho_b$:
\begin{align}
u_b(s, a) &\leftarrow (1-\alpha) u_b(s,a) + \alpha \frac{ e^{\beta r_b(s, a)} }{\rho_b} u_b(s', a')b(s'|s,a)\\
\rho_b &\leftarrow (1-\alpha_{\rho}) \rho_b + \alpha_{\rho} e^{\beta r_b(s, a)} \frac{ u_b(s', a')}{u_b(s, a)}b(s'|s,a).
\end{align}

Now let
\begin{equation}
\psi(s) = e^{-\beta V(s)} = \frac{1}{\sum_a \pi^0(a|s) u(s,a)}
\end{equation}

From Eqn.~\eqref{eq:constraint_1} and the normalization condition, we can write the dynamics bias function
\begin{equation}
    b(s'|s,a)= \frac{\psi(s')}{\sum_{s''}\psi(s'')p(s''|s,a)}.
\end{equation}
Using this form to rewrite Eqn.~\eqref{eq:constraint_2}, we obtain the reward bias function
\begin{equation}
    \beta \delta(s,a) = 
    \log \left[\sum_{s'}\psi(s')p(s'|s,a)\right]
    - \sum_{s'}\log\left[\psi(s')\right]p(s'|s,a).
\end{equation}
After rewriting the biasing functions $b$ and $\delta$ in terms of $\psi$, we see that it suffices to find estimates for the expectations $\mathbb{E}_{s'|s,a}\left[\psi(s')\right]$ and $\mathbb{E}_{s'|s,a}\left[\log\left(\psi(s')\right)\right]$,
which can be achieved through the following update equations
\begin{align}
\left<\psi\right>_{(s,a)} &\leftarrow (1-\alpha_{\psi}) \left<\psi\right>_{(s,a)} + \alpha_{\psi}\psi(s')\\
\left<\log\psi\right>_{(s,a)} &\leftarrow (1-\alpha_{\psi}) \left<\log\psi\right>_{(s,a)} + \alpha_{\psi}\log\psi(s').
\end{align}
With all four update equations, we can construct an algorithm that is capable of learning the biases $b(s'|s,a)$ and $\delta(s,a)$, which enables the learning of the left eigenvector $u_b(s,a)$ for the biased problem. The corresponding pseudocode for this method is presented in Algorithm~\ref{algo:model_free_biased}. We have tested this algorithm on the windy cliff environment presented in Figure~\ref{fig:optimistic_vs_optimal}. Results for this experiment are presented in Figure~\ref{fig:model_free_rewards} of the main text.

In conclusion, we have developed an off-policy, model-free algorithm that successfully finds the maximum entropy optimal policy for a stochastic RL environment, while avoiding the optimistic agent issue. The ideas presented here may serve as inspiration to find an equivalent solution for the case of continuous state-action spaces.



\begin{algorithm}[h!]
    \caption{Find optimal policy from samples}\label{algo:model_free_biased}
\begin{algorithmic}
    \STATE {\bfseries Parameters:} inverse temperature $\beta$; update rates $\alpha$,  $\alpha_{\rho}$,  $\alpha_\psi$; prior policy $\pi^0$; $N_{\text{epochs}}$; $N_{\text{bias}}$
    \STATE {\bfseries Input:} experience samples of tuples $(s, a, r, s', a')$
    \STATE {\bfseries Output:} optimal policy $\pi^*$
    \STATE \;
    \STATE 1. Initialize $\psi(s) \gets 1$; $\left<\psi\right>_{(s,a)} \gets 1$ and $\left<\log\psi\right>_{(s,a)} \gets 0$\;
    \REPEAT
        \REPEAT
            \FOR{$s, a, r, s', a'$ {\bfseries in} experience}
                \STATE 2. $\left<\psi\right>_{(s,a)} \gets (1-\alpha_{\psi}) \left<\psi\right>_{(s,a)} + \alpha_{\psi}\psi(s')$
                \STATE 3. $\left<\log\psi\right>_{(s,a)} \gets (1-\alpha_{\psi}) \left<\log\psi\right>_{(s,a)} + \alpha_{\psi}\log\psi(s')$
            \ENDFOR
        \UNTIL{relative error between epochs $<0.01\%$ }
        \STATE \;
        \STATE 4. $b(s'|s, a) \gets \psi(s') / \left<\psi\right>_{(s,a)}$
        \STATE 5. $\delta(s,a) \gets \frac{1}{\beta} (\log\left<\psi\right>_{(s,a)} - \left<\log\psi\right>_{(s,a)})$
        \STATE 6. $\Delta \gets \max_{s,a} \delta(s,a)$
        \STATE 7. $u(s, a) \gets 1$
        \STATE 8. $\rho \gets e^{-\beta}$
        \REPEAT
            \FOR{$s, a, r, s', a'$ {\bfseries in} experience}
                \STATE 9. $r_b \gets r + \delta(s,a) - \Delta$
                \STATE 10. $u(s, a) \gets (1-\alpha) u(s,a) + \alpha \frac{ e^{\beta r_b} }{\rho} u(s', a')b(s'|s,a)$\;
                \STATE 11. $\rho \gets (1-\alpha_{\rho}) \rho + \alpha_{\rho} e^{\beta r_b} \frac{ u(s', a')}{u(s, a)}b(s'|s,a)$
            \ENDFOR
        \UNTIL{completed $N_{\text{epochs}}$}
        \STATE \;
        \STATE 12. $\psi(s) \gets \left(\sum_a \pi^0(a|s) u(s,a)\right)^{-1}$
    \UNTIL{completed $N_{\text{bias}}$ iterations}
    \STATE \;
    \STATE 13. $\pi^*(a|s) \gets \pi^0(a|s)u(s,a)$
    \STATE 14. normalize $\pi^*$
    \STATE {\bfseries return: $\pi^*$}  
\end{algorithmic}
\end{algorithm}



\bibliography{arriojas_611}

\end{document}
