\documentclass[accepted]{uai2022}

\usepackage[american]{babel}
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{siunitx}

\newtheorem{thmthm}{Theorem}
\def\cD{\mathcal{D}}
\def\cH{\mathcal{H}}
\def\th{\tilde{h}}
\def\tH{\tilde{H}}
\def\tz{\tilde{z}}
\def\tZ{\tilde{Z}}
\def\E{\mathbb{E}}

% Cross references
\newcommand{\secProtomodel}{3.1}
\newcommand{\secProtovalues}{3.3}
\newcommand{\figSepsisBias}{8}
\newcommand{\eqRBFsim}{(3)}
\newcommand{\eqSoftmax}{(4)}
\newcommand{\eqProjection}{(6)}
\newcommand{\eqProtoProb}{(8)}

\title{Case-Based Off-Policy Evaluation Using Prototype Learning\\(Supplementary material)}

\author[1]{Anton~Matsson}
\author[1]{Fredrik~D.~Johansson}

\affil[1]{Chalmers University of Technology}

\begin{document}

\maketitle

% ----------------------------------------------------------
% -- APPENDIX ----------------------------------------------
% ----------------------------------------------------------

\appendix

\section{The Prototype Model}
\label{app:reg}

In Section \secProtomodel, we only briefly described the objective function
\begin{equation*}
    J(\Theta) = \mathrm{NLL}(\cD; \Theta) + \lambda_{d}R_{d}(\Theta) + \lambda_{c}R_{c}(\Theta) + \lambda_{e}R_{e}(\Theta),
\end{equation*}
where $\mathrm{NLL}(\cD; \Theta)$ is the negative log-likelihood, $R_{d}(\Theta)$, $R_{c}(\Theta)$ and $R_{e}(\Theta)$ are regularization terms and $\lambda_{d}$, $\lambda_{c}$ and $\lambda_{e}$ are regularization parameters. $\Theta$ denotes the set of model parameters, i.e., the parameters of the encoding network $e$, the weights $B, c$ and the prototypes $\tH$. For a given dataset $\cD = ((h^1_{t_1}, a^1_{t_1}), \ldots, (h^m_{t_m}, a^m_{t_m}))$, drawn according to a distribution $p_{\mu}$, the NLL loss of the estimate $\hat{p}_{\mu}$, parameterized in $\Theta$, is defined as
\begin{equation*}
    \mathrm{NLL}(\cD; \Theta) = -\frac{1}{m} \sum_{i=1}^{m}
    \log{\left(\hat{p}_{\mu}(A_{t}=a_{t_{i}}^{i} \mid H_{t}=h_{t_{i}}^{i})\right)}.
\end{equation*}
Furthermore, the regularization terms are defined as follows (see \cite{ming2019prototypes} for further details):
\begin{itemize}
    \item The \textbf{diversity} regularization 
    \begin{equation*}
        R_{d}(\Theta) = \sum_{i=1}^{n}\sum_{j=i+1}^{n}\max\left(0, d_{\mathrm{min}}-d(\tz_{i}, \tz_{j})\right)^{2},
    \end{equation*}
    where $d(z, z^{\prime}) = \|z-z^{\prime}\|_{2}$, penalizes latent prototypes that are too close to each other. The parameter $d_{\mathrm{min}}$ is a tunable hyperparameter in our experiments.
    
    \item The \textbf{clustering} regularization 
    \begin{equation*}
        R_{c}(\Theta) = \sum_{h\in \cD} \min_{i} d(\tz_{i}, e(h))^2
    \end{equation*}
    encourages the encoded histories to approach the most similar latent prototypes, which creates a clustering structure in the latent space.
    
    \item The \textbf{evidence} regularization 
    \begin{equation*}
        R_{e}(\Theta) = \sum_{i=1}^{n}\min_{h\in \cD} d(\tz_{i}, e(h))^{2}
    \end{equation*}
    encourages the latent prototypes to approach the encodings that are most similar to them.
\end{itemize}

\subsection{Prototype Value}
\label{app:protvalue}

In Section \secProtovalues, we showed how the prototypes can be used to compute the value of a policy $\pi$ for prototype $j$ at time $t$, $V_{j, t}(\pi)$. Below, we derive a statistical estimator for $V_{j, t}(\pi)$ using observations under $\mu$. First, we have
\begin{align*}
    V_{j,t}(\pi) & \coloneqq  \E_{\pi} \bigg[\sum_{t' \geq t} R_{t'} \Bigm\vert J_t = j \bigg] \\
    & = \E_{\pi} \bigg[ \frac{p(J_t = j \mid H_t)}{p_\pi(J_t=j)}  \sum_{t' \geq t} R_{t'} \bigg].
\end{align*}
This equation follows from the fact that $J_t$ is conditionally independent of all other variables given $H_t$. Now, with $W$ importance weights for $\pi$ and $\mu$, 
\begin{equation*}
    V_{j,t}(\pi) = \E_{\mu} \bigg[ \frac{p(J_t = j \mid H_t)}{p_\pi(J_t=j)} W \sum_{t' \geq t} R_{t'}  \bigg].
\end{equation*}
Following standard definitions, 
\begin{equation*}
    p_\pi(J_t=j) = \E_\pi[p(J_t=j \mid H_t)],
\end{equation*}
which may be identified using importance sampling, 
\begin{equation*}
    p_\pi(J_t=j) = \E_\pi[p(J_t=j \mid H_t)] = \E_\mu[p(J_t=j \mid H_t)W_t],
\end{equation*}
with $W_t = \prod_{t^{\prime}=0}^t \frac{p_\pi(A_{t'} \mid H_{t'})}{p_\mu(A_{t'} \mid H_{t'})}$. Hence, we may estimate
\begin{equation*}
    \hat{p}_\pi(J_t=j) = \frac{1}{m}\sum_{i=1}^m  p(J_t=j \mid H_t = h_{t}^{i})w_{t}^{i}.
\end{equation*}
For trajectories $i$ which end before $t$, we let $\hat{p}(J_t=j \mid H_t = h_{t}^{i}) = 0$.

\subsection{Is There a Good Prototype Model?}
\label{app:ignorability}

In Section~\secProtomodel, we asked the question: Assuming that adjusting for the history $H_t$ is sufficient for unbiased policy evaluation, do there exist prototype histories $\tilde{H}$, an encoding $e$ and a similarity function $s$ such that evaluation using the prototype model is accurate? Here follows an example where this is provably the case.

Consider the history at the first time step, $H_0 = X_0 \in \mathbb{R}^d$. Assume that for each action $a$, the distribution of histories in which action $a$ is taken is isotropic Gaussian, $(H_0 \mid A_0=a) \sim \mathcal{N}(\mu_a, \gamma^2)$. Then, the joint distribution of $(H_0, A_0)$ is a Gaussian mixture model (GMM) with components identified by the actions, $a=1, \ldots, k$. We have by the definition of the GMM that 
\begin{equation*}
    p(A_0=a \mid H_0=h) \propto e^{-\frac{\|h - \mu_a\|^2_2}{\gamma^2}} = s(\mu_a, h),
\end{equation*}
where $s$ is defined as in \eqRBFsim. As a result, with the component means $\tilde{X} = [\mu_1, \ldots, \mu_k]$ as prototypes and $S(\tilde{X}, h) = [s(\mu_1, h), \ldots, s(\mu_k, h)]^\intercal$, the behavior policy is given by $p(A_0 \mid H_0 = h) = S(\tilde{X}, h)/(\sum_a s(\tilde{x}_a, h))$, which matches \eqSoftmax with $B = 1/(\sum_a s(\tilde{x}_a, h))$ and $c=0$, up to the application of the softmax function. Furthermore, the prototype assignment probability in \eqProtoProb is equal to the behavior policy. We state a generalization below.
%
\begin{thmthm}
    \label{thm:gmm}
    Assume that there exists a bijective, differentiable encoding function $e : \cH \rightarrow \mathcal{Z}$ such that $\forall t : (e(H_t), A_t) \sim \mathrm{GMM}$ with stationary component means $\{\mu_a\}_{a=1}^k$ and  variance $\gamma^2$. Then, with prototypes $\tilde{H} = [e^{-1}(\mu_1), \ldots, e^{-1}(\mu_k)]^\intercal$, and $S$ as defined above, 
    \begin{equation*}
        \forall t : p(A_t \mid H_t=h)\propto S(e(\tilde{H}), e(h)).
    \end{equation*}
\end{thmthm}
\begin{proof}
    Due to bijectivity, $p(A_t \mid H_t=h) = p(A_t \mid e(H_t)=e(h))$. The final result follows from the same argument as for the special case of $H_0$ above.
\end{proof}
%
Theorem~\ref{thm:gmm} shows that there are indeed problems for which a prototype model that exactly describes the behavior policy \emph{exists}. This also implies that there exists a prototype estimate of the value $V(\pi)$ which is unbiased. However, it does not give guarantees for recovering such a model from data, or that the training set contains samples which act well as prototypes. Learning encoding functions $e$ which satisfy the conditions of Theorem~\ref{thm:gmm} has been studied in the context of normalizing flows~\citep{kong2020expressive,rezende2015variational}.

\section{Experimental Details}
\label{app:exp_details}

The prototype approach for off-policy evaluation was evaluated on real-world sepsis data extracted from the MIMIC-III database~\citep{mimiciii}. In addition, a synthetic environment for sepsis management, provided by~\cite{oberst2019counterfactual}, was used to study the bias induced by prototypes as a function of the trajectory length. In this section, we give further details about the experiments. To produce the results presented in this paper, we needed about 750~core-hours of computational time. The neural networks were implemented in PyTorch~\citep{pytorch} and trained on GPU (Nvidia Tesla T4) using the skorch framework~\citep{skorch}. Other models were implemented using scikit-learn~\citep{scikit-learn}.

\subsection{Using Data from MIMIC-III}
\label{app:mimic}

We extracted the dataset of sepsis patients from the MIMIC-III database using the code provided by \cite{komorowski2018artificial}.\footnote{The original code is available at \url{https://github.com/matthieukomorowski/AI_Clinician}.} This dataset contains the features listed in Supplementary Table 2 in \cite{komorowski2018artificial} as well as the total fluid intake and the total urine output for each patient. We also built the AI Clinician using the code provided by \cite{komorowski2018artificial}. To evaluate the 500 candidate policies, we used only the MIMIC-III test data and not data from the eICU Research Institute Database.

We used the train-test split associated with the best performing candidate policy in our experiments. We trained and evaluated the estimators of the behavior policy using a subset of the available features: heart rate, systolic blood pressure, diastolic blood pressure, mean blood pressure, shock index, hemoglobin, BUN, creatinine, urine output over 4 hours, pH, base excess, bicarbonate, lactate, $\text{PaO}_{\text{2}}/\text{FiO}_{\text{2}}$ ratio, age, Elixhauser index and SOFA score. In addition, we included the treatment dose of vasopressors and IV fluids, respectively, over the previous 4 hours. These values were set to 0 at the initial time steps.

For ProNet and ProSeNet, we selected parameters of the diversity regularization $(d_{\textrm{min}}, \lambda_{d})$ by performing 3-fold cross-validation over a grid of points in the parameter space $\{1, 2, 3, 4, 5\}\times \{0.00001, 0.0001, 0.001, 0.01, 0.1\}$. These parameters were optimized for each combination of prototypes $n$ and prediction prototypes $q$ in our experiments. The parameters $\lambda_{c}$ and $\lambda_{e}$ were set to 0.001, and we performed the projection step, see \eqProjection, every fifth epoch.

For LR and RF, we searched for optimal models using 3-fold cross-validation, considering the following parameter values:
\begin{itemize}
    \item LR: regularization: $\{\mathrm{L1}, \mathrm{L2}\}$; regularization strength: 10 values spaced evenly on a log-scale from \num{1e-4} to \num{1e4};
    \item RF: maximum tree depth: $\{5, 10, 15, 20, \mathrm{None}\}$.
\end{itemize}

To create the model based on post-hoc clustering of encodings, we performed K-means clustering to cluster the encodings of the converged RNN model into 10 clusters. We used the encodings that had the shortest Euclidean distance to the cluster centroids as ``prototypes'' and trained a logistic regression model on the similarity vector between encodings and ``prototypes''.
 
We estimated the value of the target policies---the AI Clinician and the zero-drug policy---by performing weighted importance sampling of the test set trajectories. In accordance with \cite{komorowski2018artificial}, we used a final reward $r^{i}=\pm 100$ based on the survival of the patient. For all estimators of the behavior policy, we excluded a small number of sequences whose WIS weight exceeded 100.
 
\subsection{Using the Sepsis Simulator}
\label{app:sepsis_sim}

To estimate the bias induced by prototypes as a function of trajectory length, we utilized the sepsis simulator provided by \cite{oberst2019counterfactual}.\footnote{The simulator is publicly available at \url{https://github.com/clinicalml/gumbel-max-scm/tree/sim-v2}.} We used the full state representation consisting of five discrete variables---a binary diabetes indicator and four ordinal-valued vital signs (heart rate, systolic blood pressure, blood glucose level and blood oxygen level)---as well as the previously administered treatments. There is a probability of 0.2 that a randomly initialized patient has diabetes. Furthermore, there are three binary treatment variables---antibiotics, vasopressors and ventilation---resulting in an action space of 8 possible treatment combinations. A simulated patient is discharged if the patient has normal vitals and is not given any treatments; discharge results in a positive reward. Death is associated with a negative reward and occurs if at least three vitals are abnormal. If neither discharge nor death has occurred at the end of the sequence, the reward is zero. We refer to the code for details about, for example, the levels of the vitals and the transition probabilities of the Markov decision process (MDP).

For the experiment presented in Figure~\figSepsisBias, we used the notebook \verb|learn_mdp_parameters| provided by \cite{oberst2019counterfactual} to estimate the true parameters of the MDP. We then learned an optimal behavior policy using policy iteration. The policy was softened so that all actions had a nonzero probability of being chosen in each state. We generated trajectories of varying length from this policy to estimate the effect of bias. Some trajectories ended prematurely due to discharge or death and we therefore ensured that the number of collected state-action pairs was roughly the same for all sequence lengths. Specifically, for each sequence length, we generated \num{20000} state-action pairs for training and sigmoid calibration of the estimators $\hat{\mu}$, respectively.

Since we used the full state representation, we made the Markov assumption and estimated the behavior policy using a vanilla FNN and FNN-based prototype models. We trained all models over 30 epochs, using a batch size of 128. Otherwise we used the same architectures and settings as in the main experiment with the MIMIC-III data.

The collected training set was also used to estimate the MDP parameters and learn a target policy $\pi$ using policy iteration. Again, this policy was softened to avoid zero probabilities. We generated an additional evaluation set of the same size as the training and calibration sets. We repeated the process of data collection and learning 100 times. To produce Figure \figSepsisBias, we removed a small fraction of samples for which the weight ratio exceeded $10^{3}$.

% ----------------------------------------------------------
% -- BIBLIOGRAPHY ------------------------------------------
% ----------------------------------------------------------

\bibliography{matsson_289-supp}

\end{document}