\documentclass[accepted]{uai2024}
                        

\usepackage[british]{babel}
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions% \documentclass{article}

% \usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT

% \usepackage{algorithm}

% \usepackage{newfloat}pp
% \usepackage{listings}

% \usepackage[utf8]{inputenc} % allow utf-8 input
% \usepackage[T1]{fontenc}    % use 8-bit T1 fonts
% \usepackage{hyperref}       % hyperlinks
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols

% \usepackage[ruled]{algorithm2e}

% \usepackage{breqn}
% \usepackage{graphicx} % DO NOT CHANGE THIS
% \usepackage{minted}
\usepackage{amsthm}
% \usepackage[acronym]{glossaries}
% \usepackage{todonotes}
% \usepackage{algorithm}
% \usepackage{algpseudocode}
\usepackage[ruled,linesnumbered]{algorithm2e}

% \usepackage{multirow}
% \usepackage{multicol}


\newcommand{\R}{\mathbb{R}}
\newcommand{\tb}{\mathbf{t}}
\newcommand{\fb}{\mathbf{f}}
\newcommand{\N}{\mathbb{N}}
% \newcommand{\minp}[1]{\mintinline{Python}{#1}}
\newcommand{\prob}{\mathbb{P}} % probability
\newcommand{\E}{\mathbb{E}} % expectationBayes

\newcommand{\D}{\mathcal{D}}
\newcommand{\X}{\mathcal{X}}

\newcommand{\sts}{\mathcal{S}} % State space
\newcommand{\as}{\mathcal{A}}  % Action space
\newcommand{\mdp}{\mathcal{M}}
\newcommand{\tmax}{t_{\max}}
\newcommand{\mdpfull}{\mdp=\left(\sts, \as, p, r, \gamma, \tmax, \rho_0\right)}
\newcommand{\svisit}{\eta}
\newcommand{\seval}{\sts_{\text{eval}}}
\newcommand{\ssucc}{\sts_{\text{succ}}}

\newcommand{\bq}{\mathbf{q}}
\newcommand{\ba}{\mathbf{a}}
\newcommand{\bba}{\bar{\mathbf{a}}}
\newcommand{\bs}{\mathbf{s}}

\newcommand{\bpi}{\boldsymbol{\pi}}
\newcommand{\Cov}{\text{Cov} }
\newcommand{\argmax}{\text{argmax} }

% \newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
% \newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{corollary}[theorem]{Corollary}


% \newacronym{irl}{IRL}{inverse reinforcement learning}
% \newacronym{mcmc}{MCMC}{Markov chain Monte Carlo}
% \newacronym{mdp}{MDP}{Markov decision process}


\title{Walking the Values in Bayesian Inverse Reinforcement Learning}

\author[1]{\href{mailto:<ondrej@bajgar.org>?Subject=Your UAI 2024 paper}{Ondrej Bajgar}{}}
\author[1]{Alessandro Abate}
\author[2]{Konstantinos Gatsis}
\author[1]{Michael A. Osborne}

\affil[1]{%
    University of Oxford
}
\affil[2]{University of Southampton}


\begin{document}

\maketitle


\begin{abstract}
    The goal of Bayesian inverse reinforcement learning (IRL) is recovering a posterior distribution over reward functions using a set of demonstrations from an expert optimizing for a reward unknown to the learner. The resulting posterior over rewards can then be used to synthesize an apprentice policy that performs well on the same or a similar task.
    A key challenge in Bayesian IRL is bridging the computational gap between the hypothesis space of possible rewards and the likelihood, often defined in terms of Q-values: vanilla Bayesian IRL needs to solve the costly forward planning problem -- going from rewards to the Q-values -- at every step of the algorithm, which may need to be done thousands of times. We propose to solve this by a simple change: instead of primarily sampling in the space of rewards, we can primarily work in the space of Q-values, since the computation required to go from Q-values to reward is radically cheaper. Furthermore, this reversal of the computation makes it easy to compute the gradient, allowing efficient sampling using Hamiltonian Monte Carlo. We propose ValueWalk -- a new Markov chain Monte Carlo method based on this insight -- and illustrate its advantages on several tasks.
\end{abstract}


\section{Introduction}
Reinforcement learning (RL) has shown impressive performance across a wide variety of tasks, ranging from robotics to game playing. However, one of the main challenges in applying RL to real-world problems is specifying an appropriate reward function by hand, which is often difficult and can result in reward functions that are only imperfect proxies for designers' intentions. Inverse reinforcement learning (IRL) addresses this issue by instead learning the underlying reward function from expert demonstrations.

A key challenge in IRL is that the reward function is often underdetermined by the available demonstrations, as multiple reward functions can lead to the same optimal behaviour. This can be solved by picking a criterion for choosing among the reward functions compatible with the demonstrations -- maximum margin~\citep{ng2000,ratliff2006} and maximum entropy \citep{ziebart2008} are the most prominent examples. As an alternative, Bayesian IRL explicitly tracks the uncertainty in the reward using a probability distribution. This not only accounts for the issue of underdeterminacy but also provides principled uncertainty estimates to any downstream tasks, which can be used, for instance, for the synthesis of safe policies or for active learning.

Despite these attractive properties, Bayesian IRL is computationally challenging. While inference is done over the space of reward functions (in terms of which the prior is also expressed), the likelihood is usually formulated in terms of Q values (or is otherwise linked to the distribution of trajectories), and going from the former to the latter may require solving the whole forward planning problem at each iteration (as is the case in the original Bayesian IRL algorithm \citep{ramachandran2007}), which is expensive in itself and may further need to be done thousands of times during IRL inference. To avoid this, we propose to use a simple insight: while going from rewards to Q-values is expensive, the inverse calculation can be much simpler. Thus, we propose to perform the inference as if it were done primarily over the space of Q-values, computing reward estimates beside it, resulting in a much cheaper algorithm. A related formulation appeared already in the variational method of \cite{chan2021}, which, however, learned only a point estimate of the Q-function, thus sacrificing Bayesianism from the centre of the algorithm.

We instead propose a new method that provides a full Bayesian treatment of the Q values, along with the rewards, and is able to provide samples from the true posterior, being based on Markov chain Monte Carlo (MCMC) as opposed to variational inference, which needs to pre-specify a family of distributions within which to approximate the posterior. Furthermore, since the computation required at each step is much simpler than in prior MCMC-based methods \citep{ramachandran2007, michini2012}, which in itself makes our method more efficient, we can also easily calculate the gradient, which allows us to use Hamiltonian Monte Carlo \citep{duane1987} granting further gains in efficiency. 

The contributions of this paper are the following: (1) we provide the first MCMC-based (and thus agnostic to the shape of the posterior) algorithm for continuous-space Bayesian inverse reinforcement learning; (2) we show that it scales better on discrete-space cases than the MCMC-based baseline, PolicyWalk; and (3) we show that we outperform the previous state-of-the-art algorithm for continuous state spaces, AVRIL, better capturing the posterior over rewards and performing better on imitation learning tasks.

The paper is organized as follows: Section 2 provides background on inverse reinforcement learning and Hamiltonian Monte Carlo and summarizes related work. Section 3 introduces our proposed algorithm called ValueWalk. Section~\ref{sec:experiments} compares our approach to an MCMC-based predecessor, PolicyWalk \citep{ramachandran2007}, the previous state-of-the-art scalable method for Bayesian IRL, AVRIL \citep{chan2021}, and 2 imitation learning baselines on several control tasks. %, as well as on clinical data from the MIMIC-III~\cite{johnson2016} dataset.


\section{Background}
\subsection{Bayesian inverse reinforcement learning}

The goal of Bayesian inverse reinforcement learning is recovering a posterior distribution over reward functions based on observing a set of demonstrations $\D=\{(\phi(s_1),a_1),\dots,(\phi(s_n),a_n)\}$ from an expert acting in a Markov decision process (MDP) $\mdpfull$ where 
 $\sts, \as$ are the state and action spaces respectively,
$\phi: \sts \to \Phi$ is a feature function representing states in a feature space $\Phi$,
$p: \sts \times \as \to \mathcal P(\sts)$ is the transition function where $\mathcal P(\sts)$ is a set of probability measures over $\sts$,
$r: \Phi\times \as \to  \mathbb{R}$ is a reward function,
    % \item $\phi:\sts\to\R^n$ is a feature representation of the states, also interpretable as an observation function (we will sometimes use $\phi_i$ to denote $\phi(s_i)$); 
$ \gamma\in (0,1) $ is a discount rate,
$\tmax\in\N\cup\{\infty\}$ is the time horizon, and
 $\rho_0\in\mathcal P(\sts)$ is the initial state distribution.

In IRL, we know all elements of the MDP except for the reward function and, possibly, the transition function (the setting without the knowledge of transition dynamics -- or other form of access to the environment or its simulator -- is sometimes called \emph{strictly batch} \citep{jarrett2020}; our method is applicable in both this setting and the one including an environment simulator, though most of the experiments are run in the former setting following the main baseline method, AVRIL).
Instead, we have a model of how the expert policy is linked to the reward and, in the case of Bayesian IRL, also a prior distribution over reward functions, $p_R$ (which is, in general, a multi-dimensional stochastic process, that for any set of state-action pairs returns a joint probability distribution over the corresponding set of real-valued rewards). Commonly used expert models include Boltzmann rationality models such as
\begin{equation}
\label{eq:boltzmann-rat}
\prob [a_i | \phi(s_t)] = \frac{e^{\alpha Q^*(\phi(s_t),a_i)}}{\sum_{a'\in\as} e^{\alpha Q^*(\phi(s_t),a')}}
\end{equation}
\citep{ramachandran2007,chan2021} where
$Q^*(s,a)$ is the expected (discounted) return if action $a$ is taken in state $s$, and the optimal policy is subsequently followed, and $\alpha$ is a rationality coefficient; the maximum entropy approach \citep{ziebart2008}, where the probability of each trajectory is assumed to be proportional to the exponential of the trajectory's return; or sparse behaviour noise models \citep{zheng2014}, where the expert is assumed to behave rationally except for sparse deviations. Beside these approximately rational models, various models of irrationality can also be considered \citep{evans2015}. The Bayesian IRL framework is flexible with respect to the choice of expert model, each such model just resulting in a different likelihood function, and can also be extended to the case where the model is not fully known. 

In this article, we adopt the Boltzmann rationality model (\ref{eq:boltzmann-rat}). We will assume that conditional on the Q values, the actions chosen by the expert are independent, yielding the likelihood 
\begin{equation}
\label{eq:boltzmann-likelihood}
p(\D|r) = \prod_{s_t,a_t,s_{t+1}\in\D} \frac{e^{\alpha Q^*(\phi(s_t),a_t)}}{\sum_{a'\in\as} e^{\alpha Q^*(\phi(s_t),a')}} p(s_{t+1}|s_t,a_t)
\end{equation}
for a discrete action space $\as$ (the expression can readily be adapted to a continuous setting by replacing the sum by an integral).
Given this likelihood together with the prior over rewards $p_R$, we can calculate the posterior using the Bayes Theorem as $p(r|\D) = p(\D|r)p_R(r)/p(\D)$. Generally, we cannot calculate this posterior analytically, so in practice, we need to resort to approximate methods. In this article, we use Markov chain Monte Carlo sampling.

When performing Bayesian inference over the reward, the transition probabilities will be considered fixed (except for Appendix~\ref{app:unknown-transitions}, which discusses the extension of Bayesian inference also to transition probabilities). Thus looking at the likelihood as a function of the reward, we can write 
\begin{equation}
    p(\D|r)=c \prod_{s_t,a_t\in\D} \frac{e^{\alpha Q^*(\phi(s_t),a_t)}}{\sum_{a'\in\as} e^{\alpha Q^*(\phi(s_t),a')}} =: c \mathcal{L}(\D|r).
\end{equation}
Since $p(\D) = \int p(\D|r) d p_R(r) = c\int \mathcal{L}(\D|r) d p_R(r)$, the constant transition term cancels out in the posterior, and, going forward, we can use the partial likelihood $\mathcal{L}$ in reward posterior inference. Furthermore, MCMC algorithms generally depend only on the unnormalized distribution, so we can also drop the remainder of the marginal $p(\D)$ from our calculation.

\subsection{Markov-chain Monte Carlo (MCMC)}
Markov chain Monte Carlo (MCMC) methods form a class of algorithms widely used for sampling from complex probability distributions. MCMC methods rely on constructing Markov chains whose stationary distribution is the distribution of interest. Usually, a new candidate sample in the chain is proposed and then accepted or rejected with a probability that depends on the ratio of the candidate's density to that of the current sample under the target distribution -- in our case the posterior over rewards.

In simpler MCMC methods, such as Metropolis-Hastings~\citep{metropolis1953,hastings1970}, which were also used in some previous articles on Bayesian IRL \citep{ramachandran2007,michini2012}, the new step is proposed as a random jump in the sampling space. However, this often leads to a high rejection rate, if the jumps are large, or tightly correlated samples, if the jump is small, both of which can make the algorithm inefficient. 

Thus, we instead use the popular Hamiltonian (or hybrid) Monte Carlo (HMC; \cite{duane1987}) with the no-U-turn (NUTS) sampler \citep{hoffman2014}, which uses the gradient of the posterior density and Hamiltonian-like dynamics to propose samples that are far apart but still likely under the posterior, keeping a high acceptance rate, thus improving the efficiency of the algorithm.


% \subsection{Gaussian processes and deep kernels}

% Gaussian processes (GPs) are a powerful class of non-parametric probabilistic models that define a distribution over functions $f: \mathcal{X}
% \to \mathcal{Y}$, where generally $\mathcal{X} =\R^{d_x},\mathcal{Y}=\R^{d_y}$. A Gaussian process is specified by a mean function $m: \mathcal{X}\to\mathcal{Y}$ and a covariance function, also known as a kernel, $k: \mathcal{X}\times \mathcal{X} \to \R^{d_y\times d_y}$. For any finite set of input points $\mathbf{X} = \left(\mathbf{x}_1, \dots, \mathbf{x}_n\right)$, the GP defines a multivariate Gaussian distribution over the function values $\mathbf{f} = \left(f(\mathbf{x}_1), \dots, f(\mathbf{x}_n)\right)$: 
% $f\sim\mathcal{N}(m,K)$ 
% where $\mathbf{m} = [m(\mathbf{x}_1), \dots, m(\mathbf{x}_n)]^T$ is the mean vector and $\mathbf{K}$ is the covariance matrix with elements $K_{ij} = k(\mathbf{x}_i, \mathbf{x}_j)$ (where for $d_y>1$, $k(\mathbf{x}_i, \mathbf{x}_j)$ are submatrices of $K$). Given a set of input-output pairs $\mathcal{D}={(\mathbf{x}_i, y_i)}_{i=1}^n$, a Gaussian process can be used to make predictions for new input points by conditioning the GP on the observed data. 

% The kernel function captures the relationship between input points and is crucial for the expressiveness and flexibility of GPs. Typically, one or several standard stationary kernels are used. In this article, we will make use of the Matérn 5/2 kernel defined as
% \begin{equation}
% k(\mathbf{x}, \mathbf{x}') = \sigma^2\left(1 + \sqrt{5d^2} + \frac{5d^2}{3}\right)e^{-\sqrt{5d^2}},
% \end{equation}
% where $d^2 = (\mathbf{x} - \mathbf{x}')^T \boldsymbol{\Lambda}^{-1} (\mathbf{x} - \mathbf{x}')$ and $\boldsymbol{\Lambda}$ is a diagonal matrix of length scales, and $\sigma^2$ is the signal variance. 

% Deep kernel learning (DKL) \citep{calandra2016, wilson2016} combines the advantages of GPs and deep learning, where the kernel of a GP is parameterized using a deep neural network. Let $\phi(\mathbf{x})$ be the output of a deep neural network for input $\mathbf{x}$, and let $k_{\text{base}}(\mathbf{u}, \mathbf{v})$ be a base kernel function. Then, the deep kernel function $k(\mathbf{x}, \mathbf{x}')$ is defined as:
% $$k(x,x'):=k_{\text{base}}(\phi(x),\phi(x'))$$
% This allows the model to capture complex relationships in the data while retaining the benefits of GPs, such as uncertainty quantification and data efficiency. DKL has been successfully applied to various domains, including image classification, time-series forecasting, and reinforcement learning \citep{wilson2016}.


\subsection{Related Work}

Inverse reinforcement learning is most often used as a component in imitation learning: the more general task of learning an apprentice policy from expert demonstrations (see \cite{zare2023} for a good recent survey). Beside IRL, the other major family of methods within imitation learning is behavioural cloning \citep{pomerleau1991,ross2011}, which, in its vanilla form, aims to learn the policy via supervised learning directly from the expert's observation-action pairs. The supervised learning approach has an advantage of lower computational cost, but faces the challenge of covariate-shift, since the training states are distributed according to the expert policy, not that of the learner agent, though multiple methods try to mitigate this by encouraging the learner policy to stay close to the expert one \citep{dadashi2020,reddy2019,brantley2019}.

Inverse reinforcement learning represents an alternative which, instead of directly learning the observation-action mapping, first learns an estimate of the reward function, which can then be used to synthesize a policy. This can offer better generalization, but usually requires a model of the environment or access to it in order to run reinforcement learning, and generally incurs a higher computational cost.

We build on the paradigm of Bayesian IRL introduced by \cite{ramachandran2007}. While the Bayesian approach is attractive thanks to its principled treatment of uncertainty in light of the limited demonstration data, the key downside relative to other methods has been its scalability to higher-dimensional settings. \cite{michini2012} try to improve efficiency upon Ramachandran by focusing computation into regions of the state space close to the expert demonstration, still using MCMC, while \cite{chan2021} try to improve efficiency by using an approximate variational distribution to model the posterior, as well as an additional neural network that tracks the Q function, which avoids the need for a costly inner-loop solver. \cite{mandyam2023} has recently used kernel density estimation as an alternative method for approximate Bayesian inference.\footnote{The evaluation in this paper focuses on an offline setting without access to environment dynamics, while the last mentioned method fundamentally depends on having access to the environment dynamics so we omit it from the comparison in this paper.}

As opposed to recent work experimenting with other approximation techniques, we return to MCMC, with its greater expressivity, while at the same time adapting it to be used with continuous state spaces, which would not be feasible with prior MCMC-based methods.


\section{Method}
% The key innovation in our method concerns the calculation of the posterior probability. The fact that the calculation required at every step is much simpler has two main advantages: (1) it directly reduces the computational cost, and (2) it allows for easy gradient calculation, enabling the use of gradient-based methods for proposing new samples.

Similarly to early work in Bayesian IRL \citep{ramachandran2007, michini2012}, we use Markov chain Monte Carlo sampling to produce samples from the posterior distribution over rewards given a prior and expert demonstrations. Our key innovation is in the way we calculate the posterior. At each step of the Markov chain, these previous methods generally (1) proposed a new reward (2) used some method of forward planning, such as policy iteration, to deduce the corresponding optimal Q function and then (3) used the Q function to evaluate the likelihood and the reward to evaluate the prior.

We suggest proceeding the other way round: our method proposes a set of new parameters of the Q function and then uses them to deduce the corresponding rewards, which is generally a much easier calculation than going from rewards to Q functions. The method then uses the reward to calculate the prior and the Q value to evaluate the likelihood, and combines the two to calculate the unnormalized posterior density. This value can then be used for calculating the acceptance probability in an MCMC algorithm. Moreover, thanks to the calculation being simple (rather than involving an RL-like inner-loop problem) and differentiable, we can also calculate the gradient, which we can use for efficient proposals using HMC+NUTS. Since we construct the random chain in the space of Q values instead of the space of rewards, used by previous methods, we call our new method ValueWalk.


\subsection{Finite state and action spaces}
\label{ssec:method-finite}
Let us first outline the algorithm for the case of finite state and action spaces since the calculation can be performed exactly in this case, and the later continuous algorithm builds on this base case. We concentrate here on the calculation of the posterior probability corresponding to a single proposed set of Q values (which is performed at each step of the HMC trajectory) and otherwise employ standard HMC. Note that here, we assume the knowledge of the environment dynamics $P$, since this finite setting is close to that of PolicyWalk~\citep{ramachandran2007}, which also assumes this knowledge. However, the method can easily be extended to the \textit{strictly batch} setting using steps analogous to the ones taken in the next subsection on continuous spaces.

In this finite case, we maintain a vector $Q\in\R^{|\sts||\as|}$ representing the Q-value for each state-action pair. The first thing to notice is that given such a vector, we can calculate the corresponding reward vector of the same dimensionality as $Q$ using the Bellman equation as
\begin{equation}
\label{eq:reward-bellman-finite}
R(s,a) = Q(s,a) - \gamma \sum_{s'\in\sts} p(s'|s,a) \sum_{a'\in\as} \pi^Q(a'|s') Q(s',a')
\end{equation}
with either $\pi^Q(a'|s')=\mathbb{I}[a'=\argmax_{a''}Q(s',a'')]$ or a softmax approximation thereof, with an inverse temperature coefficient $\bar \alpha$ regulating the softness of the approximation (we use the latter since it has the advantage of being differentiable). Equation (\ref{eq:reward-bellman-finite}) can also be written in vector form as
$R = (I - \gamma \bar P) Q$ where $\bar P$ is a $|\sts||\as|\times|\sts||\as|$ matrix whose values are defined as $\bar P(s,a;s',a')=p(s'|s,a)\pi^Q(a'|s')$. In that case, given a prior $p_R$ over rewards, we can calculate the prior of $Q$ as
$$p_Q(Q) = p_R((I - \gamma \bar P)Q) \det(I-\gamma \bar P),$$
where $p_Q$ and $p_R$ are the prior probability densities of $Q$ and $R$ respectively. Since $\bar P$ is a stochastic matrix and $0<\gamma<1$, the determinant is always strictly positive.

This can be combined with the likelihood
$$\mathcal{L}(\D|Q) = \prod_{(s,a)\in\D} \exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$$
to calculate the unnormalized posterior density $p(Q|\D) \propto p_Q(Q) \mathcal{L}(\D|Q)$, which we use in the standard HMC+NUTS algorithm to produce samples from the posterior. Note that the algorithm takes the form of sampling Q-values, but produces samples of rewards as a byproduct, which is what we are primarily interested in. Algorithm~\ref{alg:posterior-finite} summarizes this calculation. Note that Q here corresponds to the optimal Q value (as opposed to the one corresponding to the expert policy).

\begin{algorithm}[t]
% \RestyleAlgo{ruled}
\KwData{a candidate matrix of Q values, set of expert demonstrations $\D$, prior over rewards $p_R$}%, expert rationality coefficient $\alpha$, optimality approximation coefficient $\bar \alpha$}
\For{$s,s'\in \sts, a,a'\in \as$}{
    $\pi^Q(a'|s')=\mathbb{I}[a'=\argmax_{a''}Q(s',a'')]$ \;
    % $\pi(a|s) = \exp(\bar\alpha Q(s,a))/\sum_{a'\in \as}\exp(\bar\alpha Q(s,a'))$\;
    $\bar P(s,a;s',a') = p(s'|s,a)\pi^Q(a'|s')$ \; 
    }
$\bar R = (I - \gamma \bar P) \bar Q$ where $\bar R,\bar Q$ are flattened vector versions of the reward and Q-value matrices \;
$p_Q(Q) = p_R(\bar R) \det(I-\gamma \bar P)$ \;
$\mathcal{L}(\D|Q) = \prod_{(s,a)\in\D} \exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$ \;
\KwResult{$p(Q|\D) \propto p_Q(Q) \mathcal{L}(\D|Q)$; candidate sample $\bar R$}
\caption{Calculation  of the unnormalized posterior for finite $\sts$ and $\as$ and known transition probabilities $P$ (performed in each step of HMC). The resulting candidate reward sample $\bar R$ is then accepted/rejected together with the corresponding Q.}
\label{alg:posterior-finite}
\end{algorithm}

Theorem~\ref{thm:detailed-balance} in Appendix~\ref{app:proofs} formally proves that even though the algorithm primarily performs MCMC sampling over Q values, the secondary Markov chain over rewards produced by the algorithm also satisfies the detailed balance condition with respect to the posterior over rewards and thus constitutes a valid MCMC algorithm for sampling from the reward posterior.

Note that the determinant needs to be recalculated only if the optimal policy changes. Furthermore, we found that in practice, the recovered samples do not differ significantly if the determinant term is omitted.

See Section~\ref{ss:exp-gridworld} for an example of this finite-case algorithm applied to a gridworld environment. Note that if the reward is known to depend only on the state, the sampling can instead be performed over state-values $V$. Similarly, if it depends on the full state, action, next state triple, it should be performed over state-action-state values to maintain a match in the dimensionality of the reward and value spaces. 

The algorithm (and the Q-space trick) extends to the case of unknown transition probabilities. See Appendix~\ref{app:unknown-transitions} for more details on this.


\subsection{Continuous state representations}
\label{ssec:method-cts}
For continuous or large discrete spaces, it is generally no longer possible or practical to maintain a separate Q-function parameter for each state, so we need to resort to approximation. Thus, from now on, our inference will centre around parameters $\theta_Q\in\R^{n_Q}$ of a Q function approximator $Q_{\theta_Q}:\Phi\times\as\to\R$ where $\Phi$ is the space of feature representations of the states. While the method is again centred around the Q function, the algorithm can also produce samples from the \emph{reward} posterior at any set of evaluation points of interest, $\D_{\text{eval}}$. Furthermore, a method such as warped Gaussian processes \citep{snelson2003} can then be used to generalize the reward posterior from $\D_{\text{eval}}$ to new parts of the state-action space.

The likelihood calculation remains very similar to the discrete case:
\begin{equation}
\label{eq:boltzmann-rat-cts}
\mathcal{L}(\D|\theta_Q) = \prod_{(s,a)\in\D} \frac{\exp(\alpha Q_{\theta_Q}(\phi(s),a))}{\sum_{a'\in \as}\exp(\alpha Q_{\theta_Q}(\phi(s),a'))}
\end{equation} (assuming $\as$ to be bounded).
As for the evaluation of the prior, the reward corresponding to given Q-function parameters can be expressed using the continuous Bellman equation as
\begin{equation*}
    R(s,a) = Q_{\theta_Q}\bigl(\phi(s),a\bigr) - \gamma\E_{s',a'|s,a}\Bigl[Q_{\theta_Q}\bigl(\phi(s'),a'\bigr)\Bigr]
    % \gamma \int_{s'\in\sts} p(s'|s,a) \int_{a'\in\as} \pi^Q(a'|\phi(s')) Q_{\theta_Q}(\phi(s'),a'),\;\forall s\in \sts, a\in\as.
\end{equation*}
on any subset of states and actions. 

In general, the integral in $\E_{s',a'|s,a}[Q_{\theta_Q}(\phi(s'),a')]=\int_{s'\in\sts} p(s'|s,a) \max_{a'\in\as}Q_{\theta_Q}(\phi(s'),a')\,ds'$ needs to be approximated, for which any of a number of numerical methods can be used, from grid sampling to Monte Carlo methods, to more sophisticated techniques like probabilistic numerics \citep{hennig2022}. For most of these methods, we use a discrete set of candidate successor states $\ssucc(s,a)=\bigl\{s'\sim q(\cdot|s,a) \bigr\}$ sampled from some proposal distribution $q$ and then approximate the integral by
\begin{equation}
\label{eq:reward-importance-sampling}
% \int_{s'\in\sts} p(s'|s,a) \max_{a'\in\as}Q_{\theta_Q}(\phi(s'),a')\approx 
\frac{1}{|\ssucc|}\sum_{s'\in \ssucc} \frac{p(s'|s,a)}{q(s'|s,a)} \max_{a'\in\as}Q_{\theta_Q}(\phi(s'),a').
\end{equation}

The variant of the approximation we choose depends on what information we have at our disposal:
\begin{itemize}
    \item If we have access to a probabilistic model $\hat p$ of the environment (which can either represent the true environment dynamics, if we know them, or our best inferred model of the dynamics including any epistemic uncertainty) that we can sample from, we can simply sample $\ssucc(s,a)=\{ s'\sim \hat p (\cdot|s,a) \}$ and drop the importance weight.
    \item If we can evaluate the density $\hat p$, we can directly use the importance sampling equation~\eqref{eq:reward-importance-sampling} with $q$ being a proposal distribution ideally close to $\hat p$.
    \item If all we have is a static set of trajectories $\D_+$ -- either just the expert ones $\D$, or also additional ones sampled from another, possibly random, policy -- we can crudely approximate the reward for a transition $s,a,s'\in\D_+$ using a singleton $\ssucc(s,a)=\{s'\}$. This is an approximation made by the baseline AVRIL algorithm, so to match, we use it for the experiments in Section~\ref{ssec:exp-classic-control}. In that case we require that $\D_{\text{eval}} \subseteq \D_+$, and for $s,a,s'\in\D_+$ we can define an empirical transition model $\hat p(s''|s,a)=\delta_{s'}(s'')$ to be used within the algorithm.
\end{itemize}



% \subsubsection{Model-based setting}
% First, assume we have access to a probabilistic model $\hat p$ of the environment dynamics, which, for any state-action pair $s,a$ allows us to sample $s'\sim \hat p (\cdot|s',a')$.

% If we know the environment dynamics or its simulator (which in turn can be given to us, or we can learn it from the demonstrations or from environment interactions), we can generate simulated trajectories and deduce reward estimates using the temporal difference equation as is outlined in Algorithm~\ref{alg:posterior-sim}.

% If we can further simulate arbitrary chosen transitions in the environment, we can, more generally, choose a subset  $\sts_{\text{eval}}$ of states for which we can calculate the reward implied by the current value of the parameter $\theta_Q$ similarly to Equation~\ref{eq:reward-bellman-finite}, which we can choose to be the demonstration states augmented with states easily reachable from the demonstration states, or states which are otherwise of interest for downstream applications. To better approximate the Bellman equation, we can also further use a set of successor states $\sts_{\text{succ}}$, which we can form as a sample of 1-step successors of the states in $\sts_{\text{eval}}$. The size of both sets trades off approximation accuracy against the computational budget. The resulting approximation of the integral over states is then
% \begin{multline}
%     \label{eq:reward-bellman-approx}
%     R(s,a) = Q_{\theta_Q}(\phi(s),a) - \gamma \frac{1}{|\sts_{\text{succ}}|} \bigg( \\
%     \sum_{s'\in\sts_{\text{succ}}} p(s'|s,a) \int_{a'\in\as} \pi^Q(a'|s') Q_{\theta_Q}(\phi(s'),a')\bigg)
% \end{multline}
% for all $s\in \sts_{\text{eval}}$. In the first case, when we can generate only whole trajectories, $\sts_{\text{succ}}$ is a singleton. The above equation assumes that states in $\sts_{\text{succ}}$ have been sampled with their respective probabilities under the transition dynamics. If this is not the case, their weights should be corrected by importance sampling. The action space is often lower dimensional and smaller, and the integral may thus be easier to approximate using standard quadrature techniques.

The corresponding continuous version of the algorithm is presented in Algorithm~\ref{alg:posterior-sim}. 

\begin{algorithm}[t]
    \KwData{candidate parameters of the Q-function $\theta_Q$, a set of expert demonstrations $\D$, a set of evaluation locations $\D_{\text{eval}}$, prior over rewards $p_R$}
    Initialize empty sequence $\mathcal{R}_{\text{cand}}$ of candidate reward samples \;
    \For{$(s,a)\in\D_{\text{eval}}$}{
        % Subsample a set of counterfactual actions $ \as_{\text{cf}} \subset \as$\;
        % \eIf{a model $\hat p$ of the environment is available}{
            Sample a set of successor states $\ssucc=\{s'\sim\hat p(\cdot|s,a)\}$\;
            $ R(s,a) = Q_{\theta_Q}(\phi(s),a) - \gamma \frac{1}{|\ssucc|} \sum_{s'\in\ssucc}\max_{a'\in \as} Q_{\theta_Q}(\phi(s'),a')$\;
        % }{
            % $ R(s,a) = Q_{\theta_Q}(\phi(s),a) - \gamma \max_{a'\in \as} Q_{\theta_Q}(s',a')$\;
        % }
        
        Append $R(s,a)$ to $\mathcal{R}_{\text{cand}}$\;
    }
   Use samples to evaluate the prior $p_R(\D_{\text{eval}}, \mathcal{R}_{\text{cand}})$ \;
   Use demonstrations to evaluate the likelihood $p(\D|\theta_Q)$ per Equation~\eqref{eq:boltzmann-rat-cts} \;
    \KwResult{unnormalized approximate posterior $p(\theta_Q|\D)\propto p_R(\D_{\text{eval}}, \mathcal{R}_{\text{cand}}) p(\D|\theta_Q)$; candidate reward samples $\mathcal{R}_{\text{cand}}$.}
\caption{Calculation of the unnormalized posterior probability with continuous state representations for a single proposed parameter value $\theta_Q$ (performed in each step of MCMC). The returned candidate reward samples are accepted or rejected by the outer MCMC algorithm together with the candidate parameters $\theta_Q$.}
\label{alg:posterior-sim}
\end{algorithm}

We can store both the Q function parameters $\theta_Q$ and the corresponding reward samples depending on downstream needs. We can then fit a warped Gaussian process to the posterior reward samples to get a posterior reward distribution over the whole state space. This can then be used together with an algorithm for RL (or \emph{safe} RL in particular) to find an apprentice policy from the reward. Alternatively, as a shortcut, the posterior over Q-functions can be used to define an apprentice policy directly.


\subsection{Continuous actions}
The algorithm can be extended to continuous actions by replacing the sum in the Boltzmann likelihood~\eqref{eq:boltzmann-rat-cts} with an integral and, in turn, approximating it by a discrete set of samples from the action space. Simple discretizations (such as uniform sampling) can work well for low-dimensional action spaces (as we illustrate in our safe navigation experiment in the next section) but suffer from the curse of dimensionality, so a more sophisticated scheme would be needed for higher-dimensional action spaces. We leave that for future work. 


% The two choices that need to be made are (1) the set over which we decide to evaluate our prior and (2) the set of successor states $\sts_{\text{succ}}$ which we use to perform the Bellman update. The second choice may be simple in environments where the number of successor states is small (which is the case for the gridworld examples) but may need to be approximated by a subsample of likely successors if their number is large. 
% A more nuanced choice needs to made in the case of $\sts_{\text{eval}}$. Factors influencing this choice are 
% \begin{itemize}
%     \item where we possess prior information on the value of the reward (we may have information about some states and not others),
%     \item where the reward prior most influences the Q values (e.g. if the Q values of two neighbouring states may be very different due to the likelihood, a prior over the reward associated with the transition from one of those states to the other may be important to keep them close)
%     \item ultimately, if we are focused on synthesizing an apprentice policy, we are interested in maximizing the associated objective — in our case, this means maximizing the expected return subject to a safety constraint, as detailed below 
% \end{itemize}

% TODO: formulate a clear objective we are trying to optimize here (where do we evaluate the reward to maximize the reward / minimize the safety risk of the apprentice policy?).

% \subsection{Update policies}

% \subsubsection{Updates focused around demonstrations}
% In the case of discrete state spaces where each state has a limited number of successor states, we can the set of evaluation points as follows:
% $$\sts_{\text{eval}}=\sts_\D:=\{s|\exists a \text{ such that } s,a\in\D\}$$
% and the set of successors for a given $\epsilon>0$ as
% $$\sts_{\text{succ}} = \{s'|\exists s\in \sts_{\text{eval}} , a \in \as \text{ such that } p(s'|s,a)>\epsilon\}.$$



% \subsubsection{Optimizing for an apprenticeship objective}
% The whole problem can also be framed in terms of an objective through which we evaluate an apprentice policy, that is a policy sythesized based on the knowledge of the reward recovered in the IRL phase. The objective we will be focusing on here is maximizing the expected return of the apprentice policy subject to keeping the reward above some threshold $r_\min$ at all time steps — an auxilliary safety objective, which can be written as 
% \begin{multline}
%     \max_\pi\  \E_{\tau\sim(\rho_0,\pi,p)} J(\tau) \\ 
%     \text{ subject to } \prob(R_t>r_\min)>p_{\text{safe}},\; \forall t\in\N_{\leq T}.
% \end{multline}

% Now, we want to include also the choice of the evaluation points.




% \begin{algorithm}
% \label{alg:dbk}
% \caption{Bayesian IRL using ValueWalk (single chain)}
% \begin{algorithmic}[1]
% % \Input mass matrix $M\in\R^{|\D_+}$
% % \State Draw a random reward $R_0$ from the prior at the demonstration points and their neighbours.
% \State Randomly initialize a vector of Q-values $Q_0$ at all state-action pairs.
% \Repeat
%     \State Hamiltonian Monte Carlo with NUTS calculating the posterior as follows:
%     \State Initialize momentum term $\phi\sim\mathcal{N}(0,M)$
%     \State 
%     \State $\pi(a|s) = \exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$
%     \State $P(s,a;s',a')=p(s'|s,a)\pi(a'|s')$
%     \State $\bar R = (I - \gamma \bar P) \bar Q$ where $\bar R,\bar Q$ are flattened vector versions of the reward and Q-value matrices
%     \State $p(Q) = p(R)  |\det(I-\gamma P)|$
%     \State $p(\D|Q)=\prod_{(s,a)\in\D} \exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$
%     \State $p(Q|\D) \propto p(Q) p(\D|Q)$
%     \State Use this unnormalized posterior in the accept/reject step of the MCMC algorithm
% \Until{ $ ESS(\{R^{\D_+}_i\}_{i=n_{\text{burnin}}}^N) \geq n_{\text{samples}}$ where $N$ is the total number of samples.}

% \Return{$\{R^{\D_+}_i\}_{i=n_{\text{burnin}}}^N$ }

% \end{algorithmic}
% \end{algorithm}

% \begin{algorithm}
% \label{alg:dbk}
% \caption{Bayesian IRL using ValueWalk (single chain)}
% \begin{algorithmic}[1]
% % \Input mass matrix $M\in\R^{|\D_+}$
% % \State Draw a random reward $R_0$ from the prior at the demonstration points and their neighbours.
% \State Randomly initialize a vector of Q-values $Q^{\D_+}_0$ at the augmented demonstration set $\D_+$
% \Repeat
%     \State Hamiltonian Monte Carlo with NUTS calculating the posterior as follows:
%     \State Propose new Q
%     \State $\pi(a|s) = \exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$
%     \State $P(s,a;s',a')=p(s'|s,a)\pi(a'|s')$
%     \State $\bar R = (I - \gamma \bar P) \bar Q$ where $\bar R,\bar Q$ are flattened vector versions of the reward and Q-value matrices
%     \State $p(Q) = p(R)  |\det(I-\gamma P)|$
%     \State $p(\D|Q)=\prod_{(s,a)\in\D} \exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$
%     \State $p(Q|\D) \propto p(Q) p(\D|Q)$
%     \State Use this unnormalized posterior in the accept/reject step of the MCMC algorithm
% \Until{ $ ESS(\{R^{\D_+}_i\}_{i=n_{\text{burnin}}}^N) \geq n_{\text{samples}}$ where $N$ is the total number of samples.}

% \Return{$\{R^{\D_+}_i\}_{i=n_{\text{burnin}}}^N$ }

% \end{algorithmic}
% \end{algorithm}

\section{Experiments}
\label{sec:experiments}

We tested our method on a small gridworld (for illustration and to compare the speed to PolicyWalk~\citep{ramachandran2007}, which our method builds upon but which is restricted to such small finite-space settings) and on 4 simulated control tasks with continuous states.


\subsection{Gridworld}
\label{ss:exp-gridworld}

\begin{figure*}
    \centering
    \includegraphics[width=0.35\textwidth]{figs/3x3_obstacle_gridworld_pen20.pdf} \includegraphics[width=0.30\textwidth]{figs/gw_posterior_reward_samples_nonterminal.pdf}
    \includegraphics[width=0.30\textwidth]{figs/avril_3x3.pdf}
    \caption{\textbf{Left}: Illustrative 3x3 gridworld. The agent always starts in the top left corner. The top right corner yields a reward of 10 and is terminal. The top centre tile represents an unsafe state that should be avoided and yields a reward of $-20$. \textbf{Centre}: Histograms of the samples from the posterior over rewards recovered by ValueWalk (our method) corresponding to the 9 states of the gridworld. \textbf{Right}: Density functions of the posterior over rewards recovered by AVRIL.}
    \label{fig:3x3_gw}
\end{figure*}

\begin{table}
    \caption{\textbf{Speed comparison.} Samples per second produced by PolicyWalk and ValueWalk on a 3x3, 6x6, and 12x12 gridworld respectively.}
    \centering
    \begin{tabular}{lcc}
        \toprule
        Num states & PolicyWalk & ValueWalk \\
        \midrule
        9 & 5.84 & 28.83 \\
        36 & 0.46 & 13.75 \\
        144 & 0.26 & 4.85 \\
        \bottomrule
    \end{tabular}
    \label{tab:time-on-gridworlds}
\end{table}


For an illustration of the method with easily interpretable and visualizable features, we first test it on a simple gridworld environment shown in Figure~\ref{fig:3x3_gw}. We generated a fixed set of 50 demonstration steps in the environment and used our method, ValueWalk (including the environment dynamics), the original PolicyWalk~\citep{ramachandran2007}, and AVRIL~\citep{chan2021} (which does not use environment dynamics, making the comparison unfair but illustrative of inherent limitations of such model-free methods) to recover a posterior over rewards from an independent normal prior with mean 0 and standard deviation of 10. With the two MCMC methods, we took a total of 10,000 MCMC samples spread across 5 parallel chains using HMC+NUTS with 1000 warm-up steps per chain, which led to $\hat R \leq 1.01$ on each dimension (where $\hat R$ is the potential scale reduction factor~\citep{gelman1992}, a commonly used indicator that the chains have mixed well). We also ran the methods on a 6x6 and a 12x12 version of the gridworld to examine how the compute times scale.


\subsubsection{Results}
Both PolicyWalk and ValueWalk (our algorithm) resulted in matching posterior reward samples, as expected (confirmed by a two-sample Kolmogorov--Smirnov test at significance level $0.001$; a comparison of their essentially identical CDFs can be found in the supplement). The speed comparison of the two methods can be found in Table~\ref{tab:time-on-gridworlds}, showing that ValueWalk indeed runs faster than the baseline PolicyWalk algorithm.

We also ran AVRIL on this simple gridworld (which took 43s to converge). In terms of the resulting posterior, there are 3 things to note (see Figure~\ref{fig:3x3_gw} centre and right). Firstly, the posteriors are much tighter -- the x-axis is zoomed in about 5x relative to the ValueWalk histograms. This is due to the fact that AVRIL does not model the uncertainty in the Q-function, instead learning only a point estimate. The reward posterior is then pegged to this Q-function point estimate, thus significantly reducing its variance. As a result, both the reward of the obstacle and of the goal are extremely unlikely under the posterior.

Secondly, we can observe that the posterior reward for the obstacle is not any lower than that for most other states. This is because this state is never visited in demonstrations, and AVRIL -- not taking the environment dynamics into account -- consequently does not update this value. This illustrates an important downside faced by methods without an environment model. (Note that the model-free version of ValueWalk would face the same issue.)

Finally, we can see that while the true posterior differs considerably from normal (see especially the strong skew of the negative-reward top middle cell), AVRIL is limited by its normal variational distribution. While in theory, AVRIL could be used with any variational family, we first need to determine which family may be suitable, for which an MCMC-based method such as ours is a useful instrument.

% \subsection{1D Linear Track environment}
% (This subsection is a hot candidate for removal.)

% As a first simple test of the continuous version of our algorithm, we ran a linear version of it on linear track environment, where the state consists of a 1D position of the agent (bounded to [-10, 10]), and the action takes in the interval [-1, 1]. The agent receives a reward of 1 if their step ends in the interval [9,10] and a zero reward otherwise.

% We generated 4 Boltzmann-rational demonstrations with $\alpha=10$. In each step, 10 candidate actions were chosen randomly uniformly from [-1,1], and then a Boltzmann-rational choice was made between them (based on the optimal Q-function calculated in closed form). We ran both the continuous version of ValueWalk and a continuous-action adaptation of AVRIL\footnote{We closely followed the authors' original implementation at https://github.com/XanderJC/scalable-birl, making just minimal necessary changes for our comparison. The adapted code will be made available on Github upon publication.} (see the supplement for details of the adaptation). As approximate models, we used a 1-hidden-layer neural network of 8 units for both methods.

% The posteriors produced by the two methods were then tested on a set of test demonstrations distinct from the training ones (but generated using the same process). We then evaluated (1) the posterior log-density of the true reward under the reward posterior produced by the two methods and (2) the average density of the actions taken in the test demonstrations under the two models. In AVRIL, the action probabilities were calculated using the learnt Q-model. In ValueWalk, these probabilities were calculated by also averaging over all sampled Q-function parameters.

% \subsubsection{Results}
% As shown by the results in Table~\ref{tab:results}, in this simple environment, ValueWalk outperforms AVRIL on both these metrics. By examining the two posterior distributions, we find that our method makes use of its expressivity to model also the covariance relationships between parameters, which allows it to get a tighter posterior which includes the ground-truth values.

% \begin{table*}
%   \centering
%   \begin{tabular}{ccccc}\toprule
%      & \multicolumn{2}{c}{Gridworld} & \multicolumn{2}{c}{2D Safe Navigation} \\
%     & Action prob & Reward log-density & Action prob & Reward log-density \\
%     \midrule 
%     % AVRIL (linear) & 0.53  &  -0.23 & - & - \\
%     % VW (linear) & 0.55 & 0.27 & - & - \\
%     AVRIL (MLP) & 0.62 & -0.08 & 0.68 & -0.32 \\
%     VW (MLP) & 0.62 & 0.22 & 0.61 & -0.24
%   \end{tabular}
%     \caption{Results of experiments with AVRIL and ValueWalk (our method). The probabilities listed have been evaluated over state-action pairs of a test set of demonstrations.}
%   \label{tab:results}
% \end{table*}




% \subsection{Safety Gymnasium}
% The seconds set of experiments was run on a Mujoco-based environment for safe navigation, the Safety Gymnasium~\citep{Safety-Gymnasium}. The agent has 2 continuous actions corresponding to acceleration and rotation, and its task is to navigate to a goal region while avoiding hazard regions. When the goal is reached, it is reset to a new random location, and the episode continues for a total of 1000 timesteps. The environment is procedurally generated so the locations of both the goals and the hazards are different in each episode (we set a fixed seed for evaluation). 

% The observations are formed from the current velocity and direction of the agent as well as a lidar observation indicating the proximity of an obstacle or goal in 16 direction sectors around the agent.

% We first trained an expert agent using proximal policy optimization and then used it to collect 10 demonstration trajectories. These demonstrations were then fed into our IRL algorithm, for which we also assumed access to a simulator of the environment (without the reward).

\subsection{2D safe navigation environment}

\begin{figure*}
    \centering
    \includegraphics[width=0.40\textwidth]{figs/2d_obstacle_demos.pdf}\;\;\;\;\;
    \includegraphics[width=0.37\textwidth]{figs/2d_obstacle_reward.pdf}
    \caption{\textbf{Left}: The 10 demonstrations used in the continuous 2D environment. \textbf{Right}: Trajectories of policies derived from AVRIL and ValueWalk using an argmax of the inferred Q-values in each state. For AVRIL the Q-function point estimate is used. For ValueWalk, median and 0.1 quantile of the posterior distribution over optimal Q-values are used.}% The background colours and associated numbers indicate rewards received for ending a step in each state.}
    \label{fig:2d_cts}
\end{figure*}



To illustrate the potential of the full posterior over Q-values for synthesizing safe policies in a continuous-space environment, we also test our method on a simple 2D safe navigation environment. The state consists of a 2D position within the $[-10, 10]^2$ box, and the agent has a 2D continuous action at its disposal within an action space of $[-1, 1]^2$, which moves it by the given vector perturbed by white noise with standard deviation of 0.1. The region $[9,10]^2$ is a terminal goal region with a reward of 1; however, there is also a hazardous obstacle $[4,6]^2$ with a penalty of $-10$. We collected 10 demonstration trajectories with $\alpha=20$ and then ran ValueWalk and AVRIL on these demonstrations. We then used the two methods' estimates of the optimal Q-function to synthesize an apprentice policy. In AVRIL, we use the point estimate that the method learns. For ValueWalk, we tried policies optimizing the mean or the median of the Q-value estimates, but also a conservative policy maximizing the 0.1 quantile intended to have lower risk of low rewards.


\subsubsection{Results}

\begin{figure*}
    \centering
    \includegraphics[width=0.32\textwidth]{figs/acrobot_num_trajs_new.pdf}
    \includegraphics[width=0.32\textwidth]{figs/carpole_num_trajs_new.pdf}
    \includegraphics[width=0.32\textwidth]{figs/lander_num_trajs_new.pdf}

    \caption{The test performance of an apprentice agent for ValueWalk and 3 baseline methods for different numbers of demonstration trajectories. The ValueWalk apprentice agent takes the action that maximizes the median of the posterior Q-value samples. The line shows mean performance across 10 runs with different sets of expert demonstrations; the shaded region shows mean$\pm$std.}
    % \textcolor{red}{[can you explain acronyms in labels, here in caption or in main text?]}% The background colours and associated numbers indicate rewards received for ending a step in each state.}
    \label{fig:classic_env_results}
\end{figure*}



Figure~\ref{fig:2d_cts} illustrates the demonstrations used and the learnt apprentice policies. Firstly, note that while the demonstrations are highly stochastic, the methods learn estimates of the \emph{optimal} Q-function, thus possibly allowing them to produce apprentice policies with performance superior to that of the expert. Secondly, even to a human eye, the demonstrations leave it ambiguous whether there may be an unsafe region that the expert is avoiding, or whether the said area was missed by chance. While the AVRIL apprentice policy and the ValueWalk policies maximizing the mean and median of the Q-value distribution tend to go straight to the goal region (hitting the hazardous obstacle in between 68\% and 81\% of cases), the 0.1-quantile-maximizing policy tends to avoid the region (hitting it in only 13\% of cases across 100 sampled trajectories). This illustrates an important benefit of recovering a full posterior -- it allows producing similar conservative policies based on statistics of the posterior distribution other than the usual mean.

\subsection{Classic Control Environments}
\label{ssec:exp-classic-control}




To allow for direct comparison, we also evaluated ValueWalk on three classic control environments that were used to evaluate AVRIL by its authors: CartPole, where the goal is to balance an inverted pendulum by controlling a cart underneath it, Acrobot, where the goal is to swing up a double pendulum using an actuated joint, and LunarLander, where the goal is to safely land a simulated lander on the surface of the moon. We used the same setup as was used for AVRIL to study the performance of an apprentice agent as a function of the number of demonstration trajectories for 1, 3, 7, 10, and 15 trajectories. The apprentice agent was evaluated on 300 test episodes and the mean reward is reported. We also compare against energy-based distribution matching (EDM; \citealp{jarrett2020}) -- a successful method for strictly batch imitation learning -- and plain behavioural cloning (BC) as a simple baseline. Baseline results were taken from \citet{chan2021}.



% \subsection{MIMIC-III clinical data}
% Following \cite{chan2021}, we also evaluated the methods on real-world intensive-care unit data from the MIMIC-III dataset~\cite{johnson2016}, where the state space observations are clinical measurements, and the possible actions are putting the patient on a ventilator (2 action setting) or on a ventilator and/or antibiotics (4 action setting). Since no ground-truth is available, the methods were evaluated using action matching on held-out set of expert trajectories. See \cite{chan2021} for further details.

% \subsection{Results}

% \begin{table*}
% \centering
% \caption{\textbf{Clinical dataset performance.} Comparison of ValueWalk against baseline methods on the MIMIC-III dataset. Performance of the policy is evaluated on the quality of \emph{action matching} against a held out test set of demonstrations. We report the accuracy (ACC), area under the receiving operator characteristic curve (AUC) and average precision score (APS).}
% \vspace{1mm}
% \label{tab:mimic_results}
% \footnotesize
% \setlength{\tabcolsep}{1pt}
% \begin{tabular}{c|ccc|ccc}\toprule
%     & \multicolumn{3}{c}{\textbf{Ventilator}} & \multicolumn{3}{c}{\textbf{Ventilator + Antibiotics}}\\ \midrule
%     Metric & ACC & AUC & APS & ACC & AUC & APS\\ \midrule
%     BC & $0.873 \pm 0.007$ & $0.916 \pm 0.002$ & $0.904 \pm 0.003$ & $0.700 \pm 0.009$ & $0.864 \pm 0.003$ & $0.665 \pm 0.009$ \\
%     EDM & $0.882 \pm 0.011$ & $\mathbf{0.920 \pm 0.002}$ & $0.909 \pm 0.003$ & $0.716 \pm 0.008$ & $0.873 \pm 0.002$ & $0.682 \pm 0.004$\\ 
%     AVRIL & $\mathbf{0.891 \pm 0.002}$ & $0.917 \pm 0.001$ & $\mathbf{0.940 \pm 0.001}$ & $\mathbf{0.754 \pm 0.001}$ & $\mathbf{0.884 \pm 0.000}$ & $\mathbf{0.708 \pm 0.002}$\\
%     \midrule
%     ValueWalk & $\mathbf{0.913 \pm 0.005}$ & $0.919 \pm 0.002$ & $\mathbf{0.940 \pm 0.001}$ & $\mathbf{0.754 \pm 0.001}$ & $\mathbf{0.884 \pm 0.000}$ & $\mathbf{0.708 \pm 0.002}$\\
%    \bottomrule
% \end{tabular}
% \vspace{-\baselineskip}
% \label{tab:mimic-results}
% \end{table*}

% Table~\ref{tab:mimic-results} shows the results. 

\subsubsection{Results}

\begin{figure*}
    \centering
    \includegraphics[width=0.32\textwidth]{figs/acrobot_action_logprobs_and_entropy.pdf}
    \includegraphics[width=0.32\textwidth]{figs/cartpole_action_logprobs_and_entropy.pdf}
    \includegraphics[width=0.32\textwidth]{figs/lunar_lander_action_logprobs_and_entropy.pdf}

    \caption{The log likelihood on a hold-out set of 100 test demonstrations and the entropy of the action predictions produced by ValueWalk and AVRIL.}
    % \textcolor{red}{[can you explain acronyms in labels, here in caption or in main text?]}% The background colours and associated numbers indicate rewards received for ending a step in each state.}
    \label{fig:classic_env_logprobs}
\end{figure*}

The results are plotted in Figure~\ref{fig:classic_env_results}. While both agents perform close to expert level when provided with 15 expert trajectories, our agent reaches this level with far fewer expert demonstrations. We hypothesize that this is due to treating the Q-function in a Bayesian way, as opposed to a point estimate in AVRIL, leveraging the advantages of a fully Bayesian treatment in the low-data regime. 

To support this, we can look at the log likelihoods of the action predictions on a hold-out set of 100 test trajectories and the entropies of the predictive posterior shown in Figure~\ref{fig:classic_env_logprobs}. For ValueWalk, the log likelihood increases as the method is given more trajectories, while the prediction entropy either decreases or stays about level, as we would expect from a Bayesian method given increasing amounts of information. On the other hand, we do not consistently see similar behaviour in AVRIL. The test log likelihood consistently increases only in the case of the LunarLander environment, where it, however, starts from extremely low levels (the initial \emph{mean} log probability of $-18.0$ would correspond to a probability of about $10^{-8}$, suggesting the method has been putting practically 0 probability on actions taken by the expert among only 4 possible actions). Also, the prediction entropy of AVRIL tends to increase with seeing more trajectories. That suggests that AVRIL may be exhibiting overfitting behaviour in the low-data regime, which Bayesian methods should generally avoid.

The ValueWalk experiments were run until we obtained a well-mixed chain, which can take between 4 and 38 hours of wall time on a single Nvidia RTX 3090 GPU\footnote{Experiments with fewer trajectories were run on a CPU.}, whereas AVRIL takes 1--5 minutes to converge.

\section{Discussion}
We presented a method that allows us to apply MCMC-based Bayesian inverse reinforcement learning to continuous environments. The method maintains the attractive properties of MCMC methods: it is agnostic to the shape of the posterior (where variational methods assume a particular parameterized distribution family) and, given enough compute, produces samples from the true posterior. This comes at a large computational cost relative to cheaper methods, such as variational inference. However, we still think MCMC methods do have a role to play in the Bayesian IRL ecosystem. 

Firstly, we have shown that staying true to the Bayesian posterior does bring benefits in terms of superior performance on imitation learning tasks. Furthermore, the computational cost is paid in the learning phase, with inference at deployment being fast (sub-millisecond per step in all cases, which would be sufficient for real-time control in most possible use cases and could be further optimized).

Secondly, we think that having a method that can draw samples from the true posterior can be extremely important in the process of developing other, faster or easier to scale methods, since it allows us to assess how their approximation deviates from the true posterior and how it impacts their performance. Also, variational methods in particular require a pre-specified family of distributions over which the optimization is subsequently run. ValueWalk can be used in an exploratory phase to determine what family of distributions may be appropriate for the problem at hand, before possibly using the advantages of variational methods to scale up. 

Thus, despite their steep computational cost, we think MCMC methods have their place in Bayesian inverse reinforcement learning, and our method is a sizable step in extending them to a wider range of settings.


\bibliography{2305_zotero}

\newpage







\onecolumn

\title{Walking the Values in Bayesian Inverse Reinforcement Learning\\(Supplementary Material)}
\maketitle

\appendix

% \bibliographystyle{iclr2024_conference}

% \section*{Appendix}

% \section{Finite Space ValueWalk with Unknown Dynamics}
% [TODO]

\section{Unknown transition probabilities}
\label{app:unknown-transitions}
Section~\ref{ssec:method-finite} presents a version of the ValueWalk algorithm for finite state and action spaces that assumes known transition probabilities. However, the key trick used in ValueWalk extends to unknown transition probabilities as well. 

One simplified option to handle unknown transitions, also employed in the continuous-state case in Section~\ref{ssec:method-cts} matching the setting used by AVRIL, is replacing the transition probabilities with their empirical estimate $\hat p(s'|s,a) = \xi(s,a,s')/\xi(s,a)$, where $\xi(s,a,s')$ and $\xi(s,a)$ are the numbers of occurrences in the demonstration set of the transition $(s,a,s')$ and of the state-action pair $(s,a)$, respectively. In the finite-state case, this would mean limiting the evaluation of the prior in Algorithm~\ref{alg:posterior-unknown-transitions} to only those state-action pairs that do occur in the data (i.e.\ replacing vectors and matrices on lines 3--6 by the appropriate sub-vectors and sub-matrices).

A more principled Bayesian alternative is of course using full Bayesian inference also over transitions -- in that case, we can perform the MCMC sampling jointly over both the transitions and the Q function parameters, recovering samples from the full joint posterior. The changes needed are (1) treating parameters of the transition model as inputs in the algorithm, (2) adding a prior over those parameters (so the joint prior will be a product of the Q-parameter prior and the transition-parameter prior), and (3) including transition probabilities in the likelihood. Here is the adaptation of the finite-space algorithm to this case of unknown probabilities:
\begin{algorithm}
% \RestyleAlgo{ruled}
\KwData{a candidate matrix of Q values, a candidate transition matrix $P$, set of expert demonstrations $\D$, prior over rewards $p_R$, prior over transitions $p_P$}%, expert rationality coefficient $\alpha$, optimality approximation coefficient $\bar \alpha$}
\For{$s,s'\in \sts, a,a'\in \as$}{
    $\pi(a|s) = \exp(\bar\alpha Q(s,a))/\sum_{a'\in \as}\exp(\bar\alpha Q(s,a'))$\;
    $\bar P(s,a;s',a') = P(s'|s,a)\pi(a'|s')$ \; 
    }
$\bar R = (I - \gamma \bar P) \bar Q$ where $\bar R,\bar Q$ are flattened vector versions of the reward and Q-value matrices \;
$p_Q(Q) = p_R(\bar R) \det(I-\gamma \bar P)$ \;
$p(\D|Q,P) = \prod_{(s,a,s')\in\D} P(s'|s,a)\exp(\alpha Q(s,a))/\sum_{a'\in \as}\exp(\alpha Q(s,a'))$ \;
\KwResult{$p(Q,P|\D) \propto p_P(P) p_Q(Q) p(\D|Q,P)$; candidate sample $\bar R$}
\caption{Calculation  of the unnormalized posterior for finite $\sts$ and $\as$ with unknown transition probabilities (performed in each step of HMC). The resulting candidate reward sample $\bar R$ is then accepted/rejected together with the corresponding Q and P.}
\label{alg:posterior-unknown-transitions}
\end{algorithm}


\section{Proof of soundness of the algorithm}
\label{app:proofs}

\begin{theorem}
\label{thm:detailed-balance}
Assume that the transition kernel $q_Q$ satisfies the detailed balance condition
\[\frac{q_Q(Q'|Q)}{q_Q(Q|Q')} = \frac{p_Q(Q'|\D)}{p_Q(Q|\D)}\]
with respect to the posterior over Q values defined in Algorithm 1. Then the associated implicit Markov chain over rewards also satisfies the detailed balance condition with respect to the posterior $p_R(R|\D)$.
\end{theorem}

\begin{proof}
Let $q_Q$ be the transition kernel over Q-values that satisfies the detailed balance condition with respect to the posterior $p_Q(Q|\D)$ as assumed in the theorem statement.

The implicit transition kernel $q_R$ over rewards induced by $q_Q$ can be expressed as
\begin{equation}
q_R(R'|R) = q_Q(Q(R')|Q(R)) \left|\det\left(\frac{\partial Q(R')}{\partial R'}\right)\right|
\end{equation}
where $Q(R)=(I-\gamma \bar{P})^{-1}R$ is the Q-value corresponding to reward $R$ as used in Algorithm 1. The determinant term accounts for the change of variables from $Q$ to $R$.

The posterior over rewards can be expressed in terms of the posterior over Q-values as
\begin{equation}
p_R(R|\D) = p_Q(Q(R)|\D) \left|\det\left(\frac{\partial Q(R)}{\partial R}\right)\right|
= p_Q(Q(R)|\D) \left|\det\left((I-\gamma \bar{P})^{-1}\right)\right|.
\end{equation}

Now consider the ratio of the implicit transition kernel:
\begin{multline}
\frac{q_R(R'|R)}{q_R(R|R')} = \frac{q_Q(Q(R')|Q(R))}{q_Q(Q(R)|Q(R'))}  \frac{\left|\det\left(\frac{\partial Q(R')}{\partial R'}\right)\right|}{\left|\det\left(\frac{\partial Q(R)}{\partial R}\right)\right|} \\
= \frac{p_Q(Q(R')|\D)}{p_Q(Q(R)|\D)}  \frac{\left|\det\left(\frac{\partial Q(R')}{\partial R'}\right)\right|}{\left|\det\left(\frac{\partial Q(R)}{\partial R}\right)\right|} \\
= \frac{p_R(R'|\D) \left|\det(I-\gamma\bar P')\right|}{p_R(R|\D) \left|\det(I-\gamma\bar P)\right|}\frac{\left|\det((I-\gamma\bar P')^{-1})\right|}{\left|\det((I-\gamma\bar P)^{-1})\right|}
= \frac{p_R(R'|\D)}{p_R(R|\D)}
\end{multline}
where the second equality follows from the assumed detailed balance condition on $q_Q$, the third equality follows from the expression for $p_R(R|\D)$ derived above, the last equality follows from the determinant factors cancelling, and $\bar P'$ are the joint state-action transitions corresponding to $Q'$.
Thus, the implicit Markov chain over rewards induced by the transition kernel $q_Q$ satisfies detailed balance with respect to the posterior $p_R(R|\D)$, as claimed.
\end{proof}

The theorem establishes an important property of the ValueWalk method, namely that the implicit Markov chain over rewards induced by the HMC-based sampling of Q-values satisfies detailed balance with respect to the true posterior over rewards given the demonstrations, $p_R(R|\D)$. This property is crucial for the soundness of the method.

Detailed balance is a sufficient condition for the Markov chain to have a stationary distribution equal to the target distribution, in this case $p_R(R|\D)$. This means that, assuming the chain is ergodic, the samples of rewards obtained from the ValueWalk method will asymptotically follow the true posterior distribution, regardless of the initial distribution. In other words, the theorem guarantees that, given enough samples, ValueWalk will correctly characterize the posterior uncertainty over rewards, which is a key goal of Bayesian inverse reinforcement learning.

\section{Continuous AVRIL}
We are comparing ValueWalk with AVRIL~\cite{chan2021}, which was originally designed to work with discrete actions. When we are comparing our method to AVRIL on continuous-action environments, we use the following continuous extension of AVRIL:
\begin{enumerate}
    \item The original Boltzmann likelihood~\eqref{eq:boltzmann-rat} is replaced by its continuous version~\eqref{eq:boltzmann-rat-cts}, which, in practice, gets calculated using the same approximation as our method.
    \item Instead of taking the state as input and producing an output for each of the discrete actions, the Q function and the variational distribution for the reward now take in a state-action pair (or a batch of those) and produce a single Q-value for those or a single set of variational distribution parameters. 
\end{enumerate}

\section{Experiment details}
For the gridworld experiments, we used a version of AVRIL learning a Q-value for each state-action pair and a mean and variance value for the reward in each state. We use a matching setup for both ValueWalk and PolicyWalk.

In the continuous state space environments, for the 3 continuous baseline methods, we match the setup from \cite{chan2021} and use neural network models with 2 hidden layers of 64 units and an ELU activation function. For our experiments, we scale up the network size with the complexity of the problem: we use one hidden layer with 8 units for the 2D safe navigation task and Cartpole, 1 layer of 16 units for Acrobot, and 2 layers of 32 units for LunarLander. In each case, we also tried running AVRIL with a matching network size but in each case it performed similarly or usually worse than the default 2x64 setup for which results are reported.

With ValueWalk, we use the Pyro \citep{bingham2018} implementation of HMC+NUTS, which we ran with 2,000 warm-up steps and 20,000 inference steps. We automatically tune the step size during warm-up but do not tune the mass matrix. 

In the continuous environments, we use a Gaussian process prior with an RBF kernel with fixed scale of 1 and fixed lengthscale of 0.2 for Cartpole and Acrobot and 0.05 for Lunar Lander (chosen manually based on the distribution of features in each environment). 

In Cartpole, Acrobot, and Lunar Lander, we reuse the demonstration sets provided by the authors of AVRIL. Each contains 1000 demonstration trajectories, from which we randomly chose a set of 100 test trajectories and then randomly sampled the reported numbers of train trajectories. We reran most experiments 10 times with different random sets of training trajectories and different random initializations.

Unless otherwise stated, we use a Boltzmann rationality coefficient of 1. 


\section{Additional details of results}
\subsection{Gridworld experiments}
Figure~\ref{fig:3x3_cdfs} shows the empirical cumulative distribution functions of the 10,000 posterior reward samples collected by PolicyWalk and ValueWalk and confirms both methods track the same posterior.
\begin{figure}
    \centering
    \includegraphics[width=0.8\textwidth]{figs/3x3_cdfs.pdf}
    \caption{Cumulative distribution functions of the posterior distributions over rewards recovered by PolicyWalk and ValueWalk in the 3x3 gridworld, illustrating that the two methods recover the same posterior.}
    \label{fig:3x3_cdfs}
\end{figure}

Figure~\ref{fig:3x3_2d_hists} shows 2-D histograms of pairwise joint posteriors over rewards of the 9 states of the gridworld. Two aspects of the expert's behaviour are captured by this plot and may not be obvious from the simple histograms in Figure~\ref{fig:3x3_gw}. Firstly, the agent heading to the terminal top right corner can be explained either by the reward there being positive, or by the reward in other states (especially the initial state) being negative and thus the agent using the terminal state as a way to escape incurring further negative rewards. Secondly, note that practically all of the probability mass is placed on the reward of the obstacle tile being lower than that of the two tiles below, thus explaining the expert avoiding the obstacle tile.

The plot also clearly shows that the posterior is non-Gaussian (note especially the sharp edge expressing high confidence that the ratio of the two values does not cross a certain threshold) and thus could not be captured by the Gaussian-assuming variational posterior.

Note that this plot was produced with a prior standard deviation of 33 and an obstacle reward of -100. 

\begin{figure}
    \centering
    \includegraphics[width=0.99\textwidth]{figs/pairwise_joint_distributions.pdf}
    \caption{2-D histograms representing the joint posteriors of the rewards associated with the 9 states of the gridworld (enumerated left-to-right, top-to-bottom, so state 3 is the goal state in the top right corner).}
    \label{fig:3x3_2d_hists}
\end{figure}

% \subsection{Variance of posterior probability estimate}
% The numerical estimate of the posterior probability as outlined in Algorithm~\ref{alg:posterior-sim} can produce significant variance as displayed in Figure~\ref{fig:prior_variance}. The estimates vary over several orders of magnitude, which can be crippling if used to calculate the acceptance probability in the MCMC algorithm.

% \begin{figure}
%     \centering
%     \includegraphics[width=0.99\textwidth]{figs/prior_prob_samples.pdf}
%     \caption{Histograms of the naive estimates of the prior probability for a simple 1D track environment. }
%     \label{fig:prior_variance}
% \end{figure}

\end{document}
