\documentclass[accepted]{uai2024} % camera-ready (accepted, non-anonymous) version; drop [accepted] for the anonymous initial submission

% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2023
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}   
\usepackage{graphicx}
%\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{caption}%, subcaption}
%\usepackage[justification=centering]{subfig}
\usepackage{subcaption}
\usepackage{natbib}


%\usepackage[colorinlistoftodos]{todonotes}



\let\vec\mathbf

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2021} with \usepackage[nohyperref]{icml2021} above.
\usepackage{hyperref}
\usepackage{algorithm}
\usepackage{algpseudocode}

% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

% ready for submission


% to compile a preprint version, e.g., for submission to arXiv, add add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2023}


% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2023}


% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2023}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors


\title{Offline Bayesian Aleatoric and Epistemic Uncertainty Quantification and Posterior Value Optimisation in Finite-State MDPs}


% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.


\author[1]{\href{mailto:<filippo.valdettaro20@imperial.ac.uk>}{Filippo Valdettaro}}
\author[1,2,3]{\href{mailto:<a.faisal@imperial.ac.uk>}{A. Aldo Faisal}}
% Add affiliations after the authors
\affil[1]{%
    Brain \& Behaviour Lab\\
    Dept. of Computing\\
    Imperial College London, UK
}
\affil[2]{%
    Brain \& Behaviour Lab\\
    Dept. of Bioengineering\\
    Imperial College London, UK
}
\affil[3]{%
    Chair in Digital Health \& Data Science\\
    University of Bayreuth, Germany
}


\begin{document}


\maketitle

\begin{abstract}
We address the challenge of quantifying Bayesian uncertainty and incorporating it in offline use cases of finite-state Markov Decision Processes (MDPs) with unknown dynamics.
Our approach provides a principled method to disentangle epistemic and aleatoric uncertainty, and a novel technique to find policies that optimise Bayesian posterior expected value without relying on strong assumptions about the MDP’s posterior distribution.
First, we utilise standard Bayesian reinforcement learning methods to capture the posterior uncertainty in MDP parameters based on available data.
We then analytically compute the first two moments of the return distribution across posterior samples and apply the law of total variance to disentangle aleatoric and epistemic uncertainties. 
To find policies that maximise posterior expected value, we leverage the closed-form expression for value as a function of policy. This allows us to propose a stochastic gradient-based approach for solving the problem.
We illustrate the uncertainty quantification and Bayesian posterior value optimisation performance of our agent in simple, interpretable gridworlds and validate it through ground-truth evaluations on synthetic MDPs.
Finally, we highlight the real-world impact and computational scalability of our method by applying it to the AI Clinician problem, which recommends treatment for patients in intensive care units and has emerged as a key use case of finite-state MDPs with offline data.
We discuss the challenges that arise with Bayesian modelling of larger scale MDPs while demonstrating the potential to apply our methods rooted in Bayesian decision theory into the real world.
We make our code available at \url{https://github.com/filippovaldettaro/finite-state-mdps}.
\end{abstract}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{intro2.tex}
\input{relatedwork2.tex}

\section{Background}
\paragraph{Dynamic Programming}
A Markov Decision Process $\mathcal{M}$ (MDP) \citep{mdpbook} is given by a tuple $(\mathcal{S}, \mathcal{A}, r, P, \gamma, \rho)$, where $\mathcal{S}$ and $\mathcal{A}$ are the (assumed finite) state and action spaces respectively, $r:\mathcal{S}\to\mathbb{R}$ is the reward function, $P:\mathcal{S}\times\mathcal{A}\to\mathcal{P}(\mathcal{S})$ the transition kernel ($\mathcal{P}$ denoting a probability distribution over the corresponding set), $\gamma \in [0,1)$ a discount factor and $\rho$ the distribution over initial states.
Given a policy $\pi:\mathcal{S}\to\mathcal{P}(\mathcal{A})$, the \textit{return} of an episode starting from state $s$ is a random variable given by
%\begin{equation}
%\label{eq:returndefinition}
    $G^\pi(s) = \sum_{t=0}^{\infty} \gamma^t R_t$,
%\end{equation}
where $R_t=r(s_t)$, $a_t\sim \pi(\cdot|s_t)$, $s_t\sim P(\cdot|s_{t-1}, a_{t-1})$ given that $s_0 = s$.
For simplicity, we assume reward as known and only dependent on state. This is a natural modelling step for MDPs where a certain state is associated with a particular reward and in practice is common when constructing MDPs. Nonetheless our methods extend naturally to more general formulations.

The expected value of $G$ is called the value function $V^\pi(s) = \mathbb{E}G^\pi(s)$, and it can be shown that with this definition, $V$ satisfies the Bellman equation
\begin{equation}
\label{eq:bellman}
    V^\pi(s) = r(s)+ \gamma\sum_{a,s'}P(s'|s,a)\pi(a|s)V^\pi(s').
\end{equation}
Dynamic programming methods, such as value iteration, can evaluate $V$ and provide the policy that optimises $V$ \citep{suttonbartobook}.
It can be shown that the value of any arbitrary policy $\pi$ is
\begin{equation}
\label{eq:bellmansolution}
    \vec{v}(\pi) = (\vec{I} - \gamma \vec{T}(\pi))^{-1}\vec{r},
\end{equation}
with $\vec{v}$ and $\vec{r}$ being $|\mathcal{S}|$-dimensional vectors with $i^\text{th}$ element being $V^\pi(s_i)$ and $r(s_i)$ respectively (for $s_i$ the $i^\text{th}$ state in $\mathcal{S}$) and $\vec{T}(\pi)$ the policy-dependent transition matrix with element $i, j$ given by 
\begin{equation}
    \vec{T}_{i,j} = \sum_{a}\pi(a|s_i)P(s_j|s_i,a).
\end{equation}
The term $(\vec{I} - \gamma \vec{T}(\pi))^{-1}$ can be interpreted as successor features, in terms of which the analytic solution for value has a simple form \citep{successor}. For clarity we have highlighted here the dependence of $\vec{T}$ on $\pi$ and note that $\vec{r}$ does not depend on $\pi$ as we assumed state-dependent rewards.


\paragraph{Return Distribution}
Unlike traditional distributional RL methods \citep{bellemare2017, distributionalrlbook}, we focus solely on the first two moments of the return distribution. This allows us to bypass the full distributional RL framework, as closed-form solutions for these moments can be obtained analytically for a given finite-state MDP.

Methods that solve the Bellman value equation (Eq.~\ref{eq:bellman}) can be extended to determine moments of the return distribution.
For example, it can be shown that the variances of the return random variable $G^\pi(s)$ satisfy an analogous set of linear Bellman equations, with solution given in vector form by \cite{sobel}:
\begin{equation}
    \label{eq:bellmanvarsolution}
    \vec{var}(\pi) = (\vec{I} - \gamma^2 \vec{T}(\pi))^{-1}\vec{r}^\text{(var)}(\pi),
\end{equation}
where the vector of variances $\vec{var}$ has element $i$ corresponding to the variance at state $s_i$ and $\vec{r}^{\text{(var)}}$ is the vector with element $i$ being
\begin{equation}
    \vec{r}^{\text{(var)}}_i (\pi) = 
    \sum_{j} P^\pi(s_j|s_i)(r(s_i)+\gamma V^\pi(s_j))^2-V^\pi(s_i)^2,
\end{equation}
where $P^\pi(s'|s) = \sum_{a} \pi(a|s) P(s'|s,a)$.

\paragraph{Bayesian Dynamics Model}
\label{sec:bayesdyanmics}

The dynamics model we employ is standard in Bayesian RL, and is equivalent to the one used in BAMDPs \citep{bayesianrlreview, beetle} with an unchanging belief and similar to the one proposed in \citet{learningtodefer}, but stationary.
By modelling the belief over the MDP's dynamics parameters, this line of work effectively captures the uncertainty due to not being able to fully narrow down the true underlying MDP: with a finite number of transitions, there may be several potential MDPs that could have generated the observations, to which we can assign posterior probabilities by using Bayes' rule.
For our purposes, we take the reward function of the MDP as known (and deterministic), ultimately because in our applications we will define reward directly as a deterministic function of state, but treat the dynamics of the world as unknown.

Let $\theta_{s,a}^{s'}$ be a parameter representing the probability of transitioning to state $s'$ given action $a$ at state $s$, and consider a dataset of observed transitions $(s,a,r,s')\in \mathcal{D}$.
The probability of transitioning to some next-state follows a multinomial distribution with parameters given by $\theta$, and we can specify a conjugate Dirichlet prior on these so that for each state-action the resulting posterior probability is also Dirichlet.
Assuming a symmetric Dirichlet prior (independent across different state-actions) with parameter $\alpha_p$, the posterior distribution satisfies
\begin{equation}
    p(\{\theta_{s,a}^{s_i}|s_i\in\mathcal{S}\}|\mathcal{D}) \propto \prod_j (\theta_{s,a}^{s_j})^{n_j+\alpha_p-1},
\end{equation}
where $n_{j}$ is the number of observed transitions from the state-action pair $(s,a)$ to state $s_j$, and the normalisation constant is given (in closed form) by the reciprocal of the multivariate Beta function \citep{distributionsbook}.

When the number of possible outcomes, in this case next states, is large then inference on the Dirichlet parameters can be very data-inefficient: if a generic maximum-entropy prior parameter is employed it can assign a disproportionate amount of posterior probability to unobserved outcomes.
To mitigate this, one may scale the prior parameter inversely to the number of outcomes, as done in a BAMDP context in \cite{dirichletbamdpprior}, or induce sparsity in the possible outcomes by modelling the belief of feasible next states through a hierarchical Bayesian model \citep{dirichlethierarchical}.
We will address this same issue in Section~\ref{sec:clinicalresults} by employing a sparse Dirichlet model. 


\paragraph{Aleatoric and Epistemic Uncertainty}

In order to quantify and distinguish between epistemic uncertainty due to ambiguity in MDPs $\mathcal{M}$ given limited data and aleatoric uncertainty in the return $G$, we use the common decomposition formula that arises after applying the law of total variance \citep{disentanglinguncertainties, learningtodefer} to the return $G$:
\begin{equation}
\label{eq:disentangling}
    \mathop{\text{Var}}{G(s)} = 
    \underbrace{{\text{Var}}_{\mathcal{M}}{\mathbb{E}{\, G_{\mathcal{M}}(s)}}}_\text{epistemic} + \underbrace{{\mathbb{E}}_{\mathcal{M}}{\mathop{\text{Var}}{G_{\mathcal{M}}(s)}}}_\text{aleatoric},
\end{equation}
where the subscript $\mathcal{M}$ makes explicit that the return random variable $G$ is conditioned on the MDP $\mathcal{M}$, so that the inner expectations and variances marginalise over returns for a given MDP and the outer expectations and variances marginalise over the posterior distribution of MDPs.
The epistemic variance term captures the overall variance in the expected returns due to ambiguity in the MDPs and the aleatoric variance term is an estimate of the intrinsic variance averaged over the posterior MDP distribution. 
Eqs.~\ref{eq:bellmansolution} and \ref{eq:bellmanvarsolution} allow us to determine $\mathbb{E}{\, G_{\mathcal{M}}(s)} = V_\mathcal{M}(s)$ and $\mathop{\text{Var}}{G_{\mathcal{M}}(s)}$ exactly, while averages and variances over the MDPs can be approximated through Monte Carlo sampling of the posterior over MDPs.
In the limit of infinite data, the epistemic variance will tend to $0$ as the probability mass of the posterior focuses in on a specific $\mathcal{M}$, but the aleatoric term will not necessarily behave similarly.

\paragraph{Bayesian Objective}

Beyond evaluating uncertainty, having a belief over the possible range of dynamics that an MDP can exhibit can allow us to account for this uncertain belief when carrying out control.
Bayesian decision theory dictates that the optimal decision rule for a given prior belief and observed data is the one that maximises posterior expected value \citep{bayesianchoice}.
Thus, we seek to find a policy that maximises the posterior expected value objective
\begin{equation}
    \label{eq:objective}
    \max_{\pi} \sum_s \rho(s)\mathbb{E}_{\mathcal{M}\sim p(\cdot|\mathcal{D})} V_\mathcal{M}^\pi(s),
\end{equation}
where the value of each state $\mathbb{E}_{\mathcal{M}\sim p(\cdot|\mathcal{D})} V_\mathcal{M}^\pi(s)$ has been marginalised with respect to the initial state distribution $\rho$.
This approach is consistent with previous literature that establishes the benefits of optimising this objective for decision-making in uncertain MDPs \citep{robustadversarial, robustmdp, robusttradeoff}.
Thus, this objective will be one of the performance metrics we will use to evaluate different algorithms.
\citet{robustmdppercentile} addresses finding a policy that performs well on this objective, but their approach relies on a second-order expansion of the value in terms of the MDP parameters' posterior distribution moments, and must thus assume small posterior uncertainty to be successful.

\section{Methods}

\subsection{Uncertainty quantification}
\label{sec:uncertaintymethod}

Some proposed approaches for jointly estimating aleatoric and epistemic uncertainty in discrete-space MDPs either overlook uncertainty in the transition model \citep{paul} or rely on extensive Monte Carlo sampling \citep{learningtodefer}.
As a consequence, the former does not scale consistently with additional data (see Appendix \ref{app:paul} for empirical evidence for this claim) and we can improve on the latter in some regimes for the infinite-horizon MDP case by using closed-form expressions for the first two moments of the return distribution.

We present in Algorithm~\ref{alg:uncertainty} a way to estimate posterior value, aleatoric and epistemic variances in Eq.~\ref{eq:disentangling}, that exploits the finite-state stationary nature of the MDPs considered here.
Its computational complexity scales as $O(N_M|\mathcal{S}|^3)$ due to requiring an $|\mathcal{S}| \times |\mathcal{S}|$ matrix inversion for each of the $N_M$ dynamics samples.
In contrast, methods that rely on Monte Carlo return samples to estimate aleatoric and epistemic uncertainty will require a larger number of Dirichlet samples and large simulation trajectory lengths to achieve comparable accuracy, but no matrix inversion.

We investigate this trade-off quantitatively in Appendix \ref{app:numsamples} and conclude that the larger number of samples required for a full Monte Carlo-style evaluation (similar to \citet{learningtodefer}) is not worth the additional sampling overhead for the MDPs we are considering ($\gamma=0.999, |\mathcal{S}|<1000$).
In particular, we show that for large $\gamma$, finding exact solutions for values using analytic forms will be more computationally efficient as longer rollouts become necessary to have accurate return samples and more posterior samples become necessary to decrease the error from Monte Carlo sampling.
In principle one could also use some iterative policy evaluation scheme \citep{suttonbartobook} to solve for the first and second moments of the return distribution, sacrificing a small amount of accuracy but avoiding a matrix inverse calculation. 
\begin{algorithm}
    \caption{Bayesian Value, Epistemic and Aleatoric Uncertainty Evaluation}\label{alg:uncertainty}
    \begin{algorithmic}
    \Require Policy $\pi$, state $s_i$, posterior distribution over transition parameters $p(\mathcal{M}|\mathcal{D})$
    \State ${\theta^{s'}_{sa}}_{\{1:N_M\}} \gets N_M$ matrix samples from $p(\mathcal{M}|\mathcal{D})$
    \For{$s\in \mathcal{S}, s' \in \mathcal{S}$}
        \State $\{\vec{T}_{s s'}\}_{\{1:N_M\}}\gets \sum_a \pi(a|s) \theta_{sa \,\{1:N_M\}}^{s'}$ \Comment{$N_M$ action-marginalised transition matrices}
    \EndFor
    \For{$t = 1$ to $N_M$}
        \State $\vec{v}_{t} \gets  (\vec{I} - \gamma \vec{T}_t)^{-1}\vec{r}$ \Comment{Eq. \ref{eq:bellmansolution} for samples}
        \For{$s_k \in \mathcal{S}$}
            \State $V_k \gets$ element $k$ of $\vec{v}_{t}$
        \EndFor
        \For{$s_k \in \mathcal{S}$}
            \State $\vec{r}^{\text{(var)}}_k \gets
        \sum_{j} \{\vec{T}_{s_k s_j}\}_{t}(r(s_k)+\gamma V_j)^2-V_k^2$
        \EndFor
        \State $\vec{var}_{t} \gets (\vec{I} - \gamma^2 \vec{T}_t)^{-1}\vec{r}^\text{(var)}$ \Comment{Equation \ref{eq:bellmanvarsolution}}
        \State $v_t \gets$ element $i$ of $\vec{v}_{t}$
        \State $var_t \gets$ element $i$ of $\vec{var}_{t}$
    \EndFor
    \State bayes\_value $\gets \frac{1}{N_M}\sum_{t=1}^{N_M} v_t$
    \State aleatoric\_var $\gets \frac{1}{N_M}\sum_{t=1}^{N_M} var_t$
    \State epistemic\_var $\gets \frac{1}{N_M-1}\sum_{t=1}^{N_M} (v_t - \text{bayes\_value})^2$
    
    \State \Return bayes\_value, aleatoric\_var, epistemic\_var
    \end{algorithmic}
\end{algorithm}

\subsection{Policy improvement}
\label{sec:improvement}

Unlike with a single MDP, the objective in Eq.~\ref{eq:objective} does not always admit a deterministic optimal policy (we provide an example in Appendix \ref{app:casino} where the optimal policy is stochastic).
For this reason, approaches analogous to classical dynamic programming cannot find an optimal policy.
We suggest a gradient-based approach to optimise this objective in Algorithm \ref{alg:improvement}.
We approach the optimisation by taking stochastic gradient steps of the value objective with respect to a parametrised stochastic policy, which is made possible by the analytic form for value for given parameter samples.
This approach is qualitatively distinct from the classic policy gradient in RL, which estimates policy gradients from rolled-out trajectories \citep{suttonbartobook}.
In contrast to other methods \citep{aiclinician, robustvalueiter} this does not introduce bias due to optimising only with respect to a finite number of transition samples:
by re-sampling from the posterior every gradient step, we remove the bias that would occur by picking a smaller finite sample, and we note that all standard stochastic gradient optimisation guarantees regarding computational complexity or convergence to a local optimum will apply.
For example, one can show that with appropriate learning rate scheduling, this convergence is guaranteed almost surely \citep{sgdproofs} (although we empirically found that convergence was also achieved with a constant learning rate).
Note that since $\gamma<1$, all quantities (values, variances) are bounded, continuous and differentiable functions of policy parameters.
We also remark that Algorithm \ref{alg:improvement} can be implemented faster computationally by reducing the batch size or by resampling the posterior periodically rather than at every gradient step (see Appendix \ref{app:rebuttal_comp} for computational benchmarks).
\begin{algorithm}
    \caption{Stochastic Gradient Policy Optimisation}\label{alg:improvement}
    \begin{algorithmic}
    \Require Initial deterministic $\pi$, posterior distribution over transition parameters $p(\mathcal{M}|\mathcal{D})$, initial policy softness parameter $\eta$, learning rate $\alpha$
    \State $\forall s\in \mathcal{S}, a\in \mathcal{A}, \, z_{sa} \gets \log(\eta/(|\mathcal{A}|-1))$
    \State $\forall s\in \mathcal{S}, \, z_{s\pi(s)} \gets \log(1-\eta)$ \Comment{Set initial $\pi$ params}
    \While{not converged}
    \State $\forall s\in \mathcal{S}, a\in \mathcal{A}, \, \text{let} \, \pi(a|s) \gets \frac{\exp(z_{sa})}{\sum_{a'} \exp({z_{sa'}})}$
    \State ${\theta^{s'}_{sa}}_{\{1:n\}} \gets n$ minibatch samples from $p(\mathcal{M}|\mathcal{D})$
    \For{$s\in \mathcal{S}, s' \in \mathcal{S}$}
        \State $\{\vec{T}_{s s'}\}_{\{1:n\}}\gets \sum_a \pi(a|s) \theta_{sa \,\{1:n\}}^{s'}$ \Comment{$n$ action-marginalised transition matrices}
    \EndFor
    \State $\vec{v}_{1:n} \gets  (\vec{I} - \gamma \vec{T}_{1:n})^{-1}\vec{r}$ \Comment{Eq. \ref{eq:bellmansolution} for samples}
    \State $\mathcal{L} \gets - \sum_{i=1}^{n} \vec{\rho} \cdot \vec{v}_i$ \Comment{Posterior and state marginalised}
    \State $\forall s\in \mathcal{S}, a\in \mathcal{A}, \, z_{sa} \gets z_{sa} - \alpha \frac{\partial \mathcal{L}}{\partial z_{sa}}$ \Comment{Gradient step}
    \EndWhile
    \State $\forall s\in \mathcal{S}, a\in \mathcal{A}, \, \pi(a|s) \gets \frac{\exp(z_{sa})}{\sum_{a'} \exp({z_{sa'}})}$
    
    \State \Return $\pi$
    \end{algorithmic}
\end{algorithm}
\section{Experiments}

Here we apply the proposed method on toy environments and a real-world clinical dataset.
The toy environments demonstrate the salient features of our methods where ground-truth MDPs can be easily generated and interpreted, while the application to clinical data confirms its scalability to MDPs with practical use.
We first examine uncertainty evaluation on interpretable gridworlds for a specific policy and then consider policy optimisation on gridworlds and synthetic MDPs.
Finally we apply the same methods to the MIMIC-III dataset \citep{mimic}, and present results on the impact that carrying out our approach has on this dataset's posterior expected value.

\subsection{Gridworld}
\label{sec:gridworld}

We consider a gridworld with stochastic transitions: at each step there is a probability $p_{\text{rand}}$ of being pushed down regardless of action taken.
Otherwise, the agent moves up, down, left or right by one square determined by the action.
The observed transitions dataset $\mathcal{D}$ is generated by repeatedly spawning an agent in a non-terminal random state and carrying out a random action.
Experiments are run on the gridworld visualised in Fig.~\ref{fig:gridworld}.
The results presented here are for datasets of varying sizes, where smaller ones are always subsets of any larger ones to ensure that the latter are strictly more informative.

\label{sec:uncertaintyquantification}
\paragraph{Uncertainty Quantification}
We consider the policy uncertainty \emph{evaluation} problem, comparing how results from our Bayesian approach differ from others when evaluating aleatoric and epistemic uncertainty for the policy that is optimal under the MLE dynamics parameter estimates.
We see in Fig.~\ref{fig:stochasticity} that the uncertainty quantification results from applying Algorithm \ref{alg:uncertainty} scale consistently with varying dataset size (epistemic uncertainty always becomes small with more data) and intrinsic stochasticity (higher $p_\text{rand}$ corresponds to higher aleatoric uncertainty).
In contrast, we find that the approach in \citet{paul} always leads to low epistemic uncertainty at the end of training, as the lack of knowledge of the underlying MDP is not modelled, and thus does not scale consistently with data.
In Appendix \ref{app:paul} we visualise how epistemic uncertainty evolves during training with different datasets.
We adapt their algorithm to carry out SARSA policy evaluation on the same, fixed policy and observe that the resulting epistemic uncertainty estimate always tends to be small by the end of training, regardless of how informative the dataset is.
Additionally, as discussed in Section~\ref{sec:uncertaintymethod}, the computation of aleatoric and epistemic uncertainty through closed-form moments as in Algorithm \ref{alg:uncertainty} does not require averaging samples over episodic rollouts as in \citet{learningtodefer}.

\begin{figure}[!b]
    \begin{subfigure}{0.15\textwidth}
    \raisebox{0.75cm}{
    %\vspace*{-2cm}
    %\centering
    \includegraphics[width=\columnwidth, trim = 0cm -2cm 0cm 0cm]{figures/figplain_grid_star.png}}
    \caption{Gridworld}
    \label{fig:gridworld}
    \end{subfigure}
    \centering
    \begin{subfigure}{0.32\textwidth}
    \centering
    \includegraphics[width=\textwidth]
    {figures/uncertainty_quantification.pdf}
    \caption{Aleatoric and epistemic uncertainty}
    \label{fig:stochasticity}
    \end{subfigure}
    \caption{Fig.~\ref{fig:gridworld} shows the gridworld used in the experiments. The terminal states are the failure \textbf{F} states (cliff) and the goal \textbf{G} state. The agent can move up, down, left, or right (or remain stationary if it hits the boundary of the grid). The transition dynamics have intrinsic stochasticity controlled by the probability $p_{\text{rand}}$, which is the probability of pushing the agent down regardless of action taken. Offline training datasets were generated by randomly sampling actions at random non-terminal states. State $\bigstar$ is chosen as an exemplar state to plot state-dependent uncertainties. In Fig.~\ref{fig:stochasticity}, the plot shows the epistemic (blue) and aleatoric (red) standard deviations as a function of training dataset size, with different levels of intrinsic stochasticity indicated by solid, dashed, and dotted lines.}
    
\end{figure}

\paragraph{Bayesian Policy Optimisation}

An optimal memoryless policy that accounts for the model uncertainty maximises the posterior expected value given in Eq.~\ref{eq:objective}.
We compare the performance on this objective of four policies: the optimal policy when transition probabilities are modelled as naive visitation frequencies (MLE-optimal policy), the optimal policy for the \textit{expected} or marginalised (referred to as nominal) MDP (Nominal policy)  \citep{robustvalueiter, robustmdppercentile}, the policy derived from the second-order approximation of value in terms of posterior moments proposed in \cite{robustmdppercentile} (Second order policy) and ours, described in Algorithm~\ref{alg:improvement} (Gradient policy).
In Algorithm \ref{alg:improvement} and the second order policy, we choose the initial policies to be a softened version of the Nominal policy $(\eta=0.1)$.
Just as for the MLE-optimal policy, computing the Nominal policy used for this initialisation requires one round of value iteration \citep{suttonbartobook}, and it is therefore a computationally negligible addition to the algorithm.

A further method that optimises a similar objective is  the Multi-Sample Backwards Induction (MSBI policy) algorithm suggested in \cite{robustvalueiter}. However, this algorithm was originally devised to find a policy that achieves a near-optimal lower bound on the utility with respect to the Bayes-\emph{adaptive} policy, which is a different task to the one considered here.
Nonetheless, for completeness we report the corresponding results for this method in Appendix~\ref{app:msbi}, and found that it did not outperform ours on any of the metrics considered.

\begin{figure*}
    \centering
    \begin{subfigure}[b]{0.24\textwidth}
        \includegraphics[width=\columnwidth]{figures/value_comparison.pdf}
        \caption{Single run}
        \label{fig:gridbayesianvalues}
    \end{subfigure}
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/relative_comparison_mle.pdf}
         \caption{Gradient vs MLE-optimal}
         \label{fig:gradmle}
     \end{subfigure}
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/relative_comparison_nominal.pdf}
         \caption{Gradient vs Nominal}
         \label{fig:gradnominal}
     \end{subfigure}
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/relative_comparison_second_order.pdf}
         \caption{Gradient vs Second order}
         \label{fig:gradsecondorder}
    \end{subfigure}
    \caption{Fig.~\ref{fig:gridbayesianvalues} shows the average posterior expected return (`Value') as a function of dataset size for a single set of generated datasets, as in the objective in Eq.~\ref{eq:objective}. The example gridworld has $p_{\text{rand}}=0.25$. As value will be dataset-dependent, we show the average and standard deviation of the pairwise difference in posterior values between our method and each of the other methods in Figs.~\ref{fig:gradmle},~\ref{fig:gradnominal} and \ref{fig:gradsecondorder}, where values above the red dashed line signify an improvement. These plots report the average and standard deviation across 50 generated datasets for each dataset size.}
    \label{fig:manystatecomparison}
\end{figure*}

We empirically find that the gradient-optimised policy consistently outperforms the other methods on the Bayesian posterior value objective, especially in lower data regimes. Results from a sample run are presented in Fig.~\ref{fig:gridbayesianvalues} for different dataset sizes.
The corresponding relative performances of our method against the MLE-optimal, Nominal and Second order policies on the posterior value objective over 50 sets of generated datasets are presented in Figs.~\ref{fig:gradmle}, \ref{fig:gradnominal} and \ref{fig:gradsecondorder} with error bars (standard deviations), confirming that our method consistently outperforms the other three in terms of posterior value maximisation over a larger number of randomly-generated datasets.

\subsection{Synthetic MDPs}
\label{sec:syntheticmdps}
While gridworlds are convenient to interpret results relating to uncertainty disentanglement, they are not adequate for repeated experimentation and evaluation on multiple ground truth environments.
Therefore, we present here results on unstructured, synthetic MDPs that allow us to meaningfully evaluate the ground-truth performance of learned policies on a large number of MDPs.

The MDPs we consider have 5 states, 5 actions and are generated by sampling the ground-truth transition probabilities independently for each state-action from a flat Dirichlet prior (with any state being a valid next state).
To break the symmetry between states, we sample the state-dependent reward from a normal Gaussian and keep these constant throughout all experiments.
The datasets are generated by sampling the outcome of visiting each state-action between 1 and 10 times, resulting in different dataset sizes.
For each dataset size, we generate 250 different MDPs and datasets and train the various policies on these datasets.
Finally, we roll out the policies and evaluate them on the ground-truth MDP that generated the data (for 1k steps and with $\eta=0.5$).
Similarly to the previous section, the intrinsic `luck' associated with MDPs generated can affect the maximum value that each policy is able to achieve, so we focus on the difference in performance between methods for each MDP.
In Fig.~\ref{fig:syntheticgroundtruth}, we display the ground-truth relative performance of the various policies compared to ours.
For all ground-truth results, we display standard error of the mean rather than standard deviation as we are interested in average performance across prior MDP samples rather than the variability over each individual sample.
We also report the performance on the posterior expected value, as with the gridworlds, in Fig.~\ref{fig:syntheticbayes}.
In Appendix~\ref{app:rebuttal_perf}, we present the same results as in Figs.~\ref{fig:syntheticgroundtruth} and~\ref{fig:syntheticbayes}, with the $y$-values rescaled to reflect fractional improvements rather than absolute values.

\begin{figure*}
    \centering
    \begin{subfigure}[b]{0.3\textwidth}
        \includegraphics[width=\columnwidth]{synthetic_figures/mle_ground_truth.pdf}
        \caption{Gradient vs MLE-optimal}
        \label{fig:syntheticmleground}
    \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
        \includegraphics[width=\columnwidth]{synthetic_figures/nominal_ground_truth.pdf}
        \caption{Gradient vs Nominal}
        \label{fig:syntheticnominalground}
    \end{subfigure}
    \begin{subfigure}[b]{0.3\textwidth}
        \includegraphics[width=\columnwidth]{synthetic_figures/second_order_ground_truth.pdf}
        \caption{Gradient vs Second order}
        \label{fig:syntheticsecondground}
    \end{subfigure}
    \caption{Ground truth pairwise difference in average performance (and shaded standard error of the mean) on the policies found by each method and rolled out on the ground-truth synthetic MDP. Regions above the red line correspond to improved performance with our method.}
    \label{fig:syntheticgroundtruth}
\end{figure*}

\begin{figure*}
    \centering
    \begin{subfigure}[b]{0.3\textwidth}
        \includegraphics[width=\columnwidth]{synthetic_figures/mle_bayes.pdf}
        \caption{Gradient vs MLE-optimal}
        \label{fig:syntheticmlebayes}
    \end{subfigure}
     \begin{subfigure}[b]{0.3\textwidth}
        \includegraphics[width=\columnwidth]{synthetic_figures/nominal_bayes.pdf}
        \caption{Gradient vs Nominal}
        \label{fig:syntheticnominalbayes}
    \end{subfigure}
    \begin{subfigure}[b]{0.3\textwidth}
        \includegraphics[width=\columnwidth]{synthetic_figures/second_order_bayes.pdf}
        \caption{Gradient vs Second order}
        \label{fig:syntheticsecondbayes}
    \end{subfigure}
    \caption{Average and standard deviation (shaded) of posterior expected value difference achieved by our method. Regions above the red line correspond to improved objective optimisation with our method.}
    \label{fig:syntheticbayes}
\end{figure*}
We notice that our stochastic gradient-based method consistently outperforms the others in the low data regimes, in terms of both ground-truth performance and optimisation performance.
MLE and Nominal policies can be very sub-optimal in the low data regime and then significantly improve with more data, suggesting that with more data the gradient-based optimisation may not be necessary, whereas the second order policy is consistently slightly sub-optimal compared to ours. 

\subsection{Clinical Data}
\label{sec:clinicalresults}
We apply Algorithm~\ref{alg:improvement} to the MIMIC-III dataset, as in \cite{aiclinician} and \cite{paul}, using the same clustering of 752 states and 25 actions.
Two terminal states represent patient recovery and death, with reward 1 for a patient's recovery and 0 for death. Thus value corresponds to probability of survival when $\gamma\approx1$ ($\gamma=0.999$).
As in \cite{aiclinician}, actions at any state with fewer than 5 visits in the dataset are excluded.
We address here two main points.
First we confirm that our method can scale computationally to real-world MDPs and datasets.
Secondly, we investigate the impact on the posterior expected value when employing our policy compared to the MLE-optimal one as in the original work.

Fig.~\ref{fig:statevalues} shows the posterior expected value of the two policies under two different choices of dynamics prior. Fig.~\ref{fig:statevalues}a corresponds to a symmetric Dirichlet prior chosen via Bayesian model selection. The posterior probability mass over transition parameters still has a high entropy causing the agent to believe transitions are essentially random.
In Fig.~\ref{fig:statevalues}b, we employ a conservative sparse dynamics model that only includes the death state and any observed next states in the dataset as possible next-state outcomes for each state-action.
Here, we notice that the posterior expected value can be significantly increased by using our policy optimisation algorithm, suggesting that we are in a data regime where the choice of algorithm for policy selection is important.
We defer more detailed discussion and a visualisation of resulting uncertainties to Appendix~\ref{app:clinical}.

\begin{figure}
    \centering
    \includegraphics[width=0.5\textwidth]{figures/priorcomparison.pdf}\caption{Posterior expected value of each state (blue dots) under our policy and the MLE-optimal policy in the clinical MDP. Points above the diagonal indicate superior performance of our policy on the posterior expected value. The left plot (a) demonstrates the impact of policy choice on performance when employing Bayesian model selection with an optimal parameter of $\alpha_p=0.072$. The right plot (b) shows the same result when using a prior selected through a conservative sparse dynamics model.}
    \label{fig:statevalues}
\end{figure}


\section{Limitations}
Our methods are investigated for a specific category of Markov Decision Processes (MDPs) with finite states and known reward structures.
While our method is well-suited for the lower data regimes, we empirically observe that it can be slightly suboptimal compared to the classical dynamic programming baselines, in particular the ``Nominal'' policy, in higher data regimes where uncertainty in the underlying transitions is low. Nonetheless, in practice this can be detected by comparing the Bayesian objective of the nominal policy to the posterior expected value achieved by the policy after our stochastic gradient optimisation and choosing the one with the better posterior expected value.
We have shown our approach can handle moderately-sized MDPs that carry practical real-world application possibilities (Section~\ref{sec:clinicalresults}).
However, it relies on matrix inversion ($\mathcal{O}\left(|\mathcal{S}|^3\right)$ complexity) so it cannot directly scale to much larger MDPs.
In Appendix~\ref{app:rebuttal_comp} we show empirical results on how our method scales to larger state-spaces on the computational side.
One key limitation of our proposed methods towards real-world application is the sensitivity of the resulting policy and inferred values to the dynamics model prior used, especially when data is inadequate for effective inference across all dynamics priors.
For example, we observe that the effects of having a sparse or evidence-optimised model can be significant on both the inferred policy and the associated posterior values (see Fig.~\ref{fig:statevalues}) and exactly how to best include or combine these elements to select a prior that achieves consistently good performance on real-world MDPs is an important question and one that we defer to future work.

\section{Conclusion}
We have proposed methods to estimate Bayesian aleatoric and epistemic uncertainty in the outcome of finite-state space policies and to maximise posterior expected value. We offer a real-world example application of our method in a prominent case of discrete-state offline RL \citep{aiclinician} in clinical decision support systems.
In contrast to previous approaches that estimate such uncertainties in MDPs with finite states, we directly exploit the tractability of stationary MDPs to avoid potentially computationally expensive or inaccurate episodic rollouts (necessary in non-stationary MDPs \citep{learningtodefer}) or employing ensembles of model-free approaches that may overlook dynamics uncertainty \citep{uadqn, paul}.

On the control side, we introduced a stochastic gradient-based method to optimise posterior expected value \citep{robusttradeoff} that, unlike previous approaches \citep{robustmdppercentile}, does not make strong assumptions on the posterior's distribution and does not introduce bias from having a finite number of posterior samples \citep{robustvalueiter}.
Through numerical simulations, we have shown that our method consistently improves on the posterior value objective as well as performance on ground-truth MDPs,  particularly in low data regimes, when these are unknown and sampled from a given prior.
Our method can be extended to optimise value over any distribution of MDPs that can be sampled from, including those with uncertain or more expressive rewards (of the form $R(s,a,s')$) as these also have differentiable closed-form expressions for value in terms of policy \citep{sobel}.
We apply our method to a clinical dataset, confirming its computational scalability, and notice that the resulting policy significantly improves the posterior expected values compared to that in the original approach \citep{aiclinician}.
We suggested domain-specific conservatism in the dynamics model as a potential solution to new challenges that arise in this task and a starting point for further work towards finding offline policies with robust ground-truth performance in finite-state MDPs.

\begin{acknowledgements}
FV was supported by a Department of Computing PhD scholarship and AAF was supported by a UKRI Turing AI Fellowship (EP/V025449/1).
\end{acknowledgements}

\bibliography{main}

\bibliographystyle{plainnat}
\newpage
\include{appendix}

\end{document}
