%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 

\makeatletter
% \addFileDependency{<file name with extension>}
%
% Announces <file> as a build dependency of this document:
%   * \typeout{(<file>)} writes the name to the log; latexmk will find this
%     if $recorder=0 (however, in that case it will ignore <file> if it is
%     a .aux or .pdf file etc. and it exists; if it does not exist, it will
%     appear in the list of dependents regardless).
%   * \@addtofilelist makes the file show up in \listfiles output
%     (not strictly necessary; latexmk does not use this).
%   * If the file does not exist (yet), a "No file <file>." message is
%     written, which latexmk will find.
%
% Every line ends with % so that the macro expands to no stray space tokens,
% wherever it is called.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Passes <basename> to \externaldocument and then registers both
% <basename>.tex and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{su_214}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
%% --- Packages -------------------------------------------------------------
%% amsthm is loaded before any \newtheorem so that all theorem-like
%% environments are created by the same mechanism and honour \theoremstyle.
%% cleveref is loaded last among the packages here, and before the
%% \newtheorem declarations, as its documentation requires.
%% (tikz and booktabs are also requested earlier in the preamble; repeating
%% a \usepackage with the same options is a no-op.)
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{tikz}
\usetikzlibrary{arrows.meta,automata}
\usepackage{nicefrac}
\usepackage{booktabs}
\usepackage[capitalize,noabbrev]{cleveref}

%% --- Theorem-like environments ---------------------------------------------
\theoremstyle{plain}
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}   % shares the theorem counter
\newtheorem{prop}{Proposition}
\newtheorem{corollary}{Corollary}
\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]
\theoremstyle{remark}
\newtheorem{remark}{Remark}

%% --- Operators and notation ------------------------------------------------
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\newcommand{\Real}{\mathbb{R}}

%% --- Citations and links -----------------------------------------------------
% Make \cite behave as \citep. Defined without an argument so that optional
% arguments, e.g. \cite[p.~3]{key}, are passed through to \citep.
\renewcommand{\cite}{\citep}
\hypersetup{colorlinks=true,citecolor=darkgray,linkcolor=black,urlcolor=black}
\crefname{equation}{Eq.}{Eqs.}
%% Self-defined macros

\title{Solving Multi-Model MDPs by Coordinate Ascent and Dynamic Programming\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<xihong.su@unh.edu>?Subject=UAI2023}{Xihong Su}{}}
\author[1]{Marek Petrik}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science \\
    University of New Hampshire \\
    Durham, NH, USA
}

  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 
% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 
\appendix

\section{Proof of Theorem~\ref{thm:policy gradient}}

\begin{proof}[Proof of \cref{thm:policy gradient}]\label{proof:policy gradient}
% We start by recalling the definitions of terms that we need for the proof. The values $q_{t,m}^\pi (s, a) $ is defined for each $s\in \mathcal{S}$ and $a\in \mathcal{A}$ as
% \begin{equation}\label{p:q}
%     q_{t,m}^\pi (s, a) = r_t^{m}(s,a)  + \sum_{s' \in \mathcal{S}} p_t^m(s' | s,a) \cdot v_{t+1,m}^{\pi}(s').
% \end{equation}
% In addition, the value $b_{t,m}^{\pi}(s)$ is defined for $t=1$ as
% \begin{equation}%\label{eq:belief-initial}
%   b_{1,m}^{\pi}(s) =   \lambda_{m} \cdot \mu(s), \quad \forall m \in \mathcal{M}, s \in \mathcal{S}, \pi \in \Pi,
% \end{equation}
% and for each $t = 1, \dots, T-1$  defined as 
% \begin{equation}%\label{eq:belief-update}
%   b_{t+1,m}^{\pi}(s') = \sum_{s_{t},a \in \mathcal{S} \times \mathcal{A}}
%   p_t^{m}(s' |  s, a)  \pi_{t}(s, a)  b_{t,m}^{\pi}(s), \quad \forall s' \in \mathcal{S}\,.
% \end{equation}
% Finally, the Bellman equation for a random policy $\pi$ can be stated as in \cref{eq:bellman_random}:
%  \begin{equation}\label{eq:bellman_random}
%  \begin{aligned}
%         v_{t,m}^{\pi}(s) &= \sum_{a_t\in \mathcal{A} } \pi_t(s,a_t) \cdot q_{t,m}^{\pi}(s,a_t), 
%    &\qquad s\in \mathcal{S}, t\in \mathcal{T}, m\in \mathcal{M}.
%  \end{aligned}
% \end{equation} 

% Then, define $D^m(s_1, k)$ for each $s_1 \in \mathcal{S}, m \in \mathcal{M}$ in \cref{proof:D} to simpliify the derived equations. We assume that $D^m(s_1,0) = 1, m \in \mathcal{M}, s_1 \in \mathcal{S}$.

% \begin{equation}\label{proof:D}
%     D^m(s_1,k) = \prod_{i=1}^{k} \sum_{a_i,s_{i+1}  \in \mathcal{A} \times \mathcal{S}}\pi_i( s_i,a_i ) \cdot p_i^m(s_{i+1} \mid s_i, a_i) 
% \end{equation}

% Now, we show by a forward induction on $t$ that
% \begin{equation}\label{p:gradient-deter}
% \frac{\partial \rho(\pi)}{\pi_t(\hat{s},\hat{a})}= \sum_{m \in \mathcal{M}} b_{t,m}^{\pi}(\hat{s})\cdot q_{t,m}^{\pi}(\hat{s},\hat{a})
% \end{equation}

% First, let us expand $\rho$ with the Bellman equation in \cref{eq:bellman_random} for $t+1$ time steps and use $D^m$  in \cref{proof:D} to simplify the equation.
  For any time step $\hat{t} \in \mathcal{T}$, we can express the return as 
\begin{align*}
\rho(\pi) &= \mathbb{E}^{\lambda,\pi,p^{\Tilde{m}},\mu} \left[ \sum_{t=1}^{T}  r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \right] \\
  &= \mathbb{E}^{\lambda,\pi,p^{\Tilde{m}},\mu} \left[ \sum_{t=1}^{\hat{t}-1}  r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \right]  + \mathbb{E}^{\lambda,\pi,p^{\Tilde{m}},\mu} \left[ \sum_{t=\hat{t}}^{T}  r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \right] \\
  &\overset{\text{(a)}}{=}  C + \mathbb{E}^{\lambda,\pi,p^{\Tilde{m}},\mu}\left[ \mathbb{E} \left[ \sum_{t=\hat{t}}^{T}  r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \mid  \tilde{s}_{\hat{t}} , \tilde{m} \right] \right]\\
  &\overset{\text{(b)}}{=}  C + \sum_{m\in \mathcal{M},s_{\hat{t}}\in \mathcal{S},a_{\hat{t}}\in \mathcal{A}}   \mathbb{P}\left[\tilde{m} = m, \tilde{s}_{\hat{t}}= s_{\hat{t}}\right] \pi_{\hat{t}}(s_{\hat{t}}, a_{\hat{t}}) \cdot  \mathbb{E}^{\lambda,\pi,p^{\Tilde{m}},\mu}\left[ \sum_{t=\hat{t}}^{T}  r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \mid  \tilde{s}_{\hat{t}} = s_{\hat{t}}, \tilde{a}_{\hat{t}} = a_{\hat{t}},  \tilde{m} = m  \right]\\
  &\overset{\text{(c)}}{=}  C + \sum_{m\in \mathcal{M},s_{\hat{t}}\in \mathcal{S},a_{\hat{t}}\in \mathcal{A}}   b_{\hat{t},m}^{\pi}(s_{\hat{t}}) \cdot  \pi_{\hat{t}}(s_{\hat{t}}, a_{\hat{t}}) \cdot  q_{\hat{t},m}^{\pi}(s_{\hat{t}}, a_{\hat{t}})\,.
\end{align*}
Here, we use $C = \mathbb{E}^{\lambda,\pi,p^{\Tilde{m}},\mu} \left[ \sum_{t=1}^{\hat{t}-1}  r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \right]$ for brevity. The step (a) follows from the law of total expectation, the step (b) follows from the definition of conditional expectation, and the step (c) holds from the definitions of $b$ and $q$ in~\eqref{eq:v-t-m-s},~\eqref{eq:backup}, and~\eqref{eq:belief-state}.

Using the expression above, we can differentiate the return for each $s\in \mathcal{S}$ and $a\in \mathcal{A}$ as
\[
  \frac{\partial \rho(\pi)}{\partial \pi_{\hat{t}}(s,a)}
  \quad =\quad 
  \sum_{m\in \mathcal{M}} b_{\hat{t},m}^{\pi}(s)  \cdot  q_{\hat{t},m}^{\pi}(s, a)\,,
\]
which uses the fact that $C$, $b_{\hat{t},m}^{\pi}$, and $q_{\hat{t},m}^{\pi}$  are constant with respect to $\pi_{\hat{t}}$. The desired result then holds by substituting $t$ for $\hat{t}$, $\hat{s}$ for $s$, and $\hat{a}$ for $a$.

% \begin{equation}\label{proof:rho}
% \begin{aligned}
%  \rho(\pi) &=\mathbb{E}^{\lambda} \left[   \mathbb{E}^{\pi,p^{\Tilde{m}},\mu} \left[ \sum_{t=1}^{T} r_{t}^{\Tilde{m}}(\Tilde{s}_{t},\Tilde{a}_{t}) \mid \Tilde{m} \right] \right]\\
%         &=\sum_{m \in \mathcal{M}} \lambda_m \cdot \mathbb{E}^{\pi,p^m,\mu}\left[\sum_{t=1}^T r_{t}^m (\Tilde{s}_{t},\Tilde{a}_{t}) \mid \Tilde{m} = m\right]\\
%      &=\sum_{s_1 \in \mathcal{S},m \in \mathcal{M}}  \lambda_m \cdot  \mu(s_1) \cdot \mathbb{E}\left[\sum_{t=1}^T r_{t}^m (\Tilde{s}_{t},\Tilde{a}_{t}) \mid \Tilde{s}_1 =s_1, \Tilde{m} = m\right]\\
%     &=\sum_{s_1 \in \mathcal{S},m \in \mathcal{M}} \lambda_m \cdot \mu(s_1) \cdot v_{1,m}^{\pi}(s_1)\\
%       &=\sum_{s_1 \in \mathcal{S},m \in \mathcal{M}}  \lambda_m \cdot \mu(s_1) \cdot \sum_{a_1 \in \mathrm{A}}\pi_1(s_1,a_1)\cdot q_{1,m}^{\pi}(s_1,a_1)\\
%      &=\sum_{s_1 \in \mathcal{S},m \in \mathcal{M}} \lambda_m \cdot \mu(s_1) \cdot  \sum_{a_1 \in \mathrm{A}}\pi_1(s_1,a_1)\left(r_1^m(s_1,a_1) +\sum_{s_2 \in \mathrm{S}}p^m(s_2 \mid s_1,a_1)\cdot v_{2,m}^\pi(s_2)\right)\\
%      &=  \sum_{s_1 \in \mathcal{S},m \in \mathcal{M}} \lambda_m \cdot  \mu(s_1) \cdot \sum_{a_1 \in \mathrm{A}}\Bigl(\pi_1(s_1,a_1 ) \cdot r_1^m(s_1,a_1) \Bigr)+ \sum_{k=1}^{t} \Bigl( \sum_{s_1 \in \mathcal{S},m \in \mathcal{M}} \lambda_m \cdot \mu(s_1) \cdot D^m(s_1,k-1) \cdot\\
%     &  \sum_{a_k,s_{k+1} \in \mathcal{A} \times \mathrm{S}} \pi_k(s_k,a_k) \cdot p_k^m(s_{k+1} \mid s_k,a_k) \cdot \sum_{a_{k+1} \in \mathcal{A}} \pi_{k+1}(s_{k+1},a_{k+1} )\cdot r_{k+1}^m(s_{k+1}, a_{k+1})\Bigr) + \\
%      & \qquad  +  \sum_{s_1 \in \mathcal{S},m \in \mathcal{M}}\Bigl( \lambda_m \cdot \mu(s_1) \cdot D^m(s_1,t-1) \cdot\sum_{a_t,s_{t+1} \in \mathcal{A} \times \mathcal{S}} \pi_t(s_t,a_t) \cdot p_t^m(s_{t+1} \mid s_t,a_t)   \cdot \\
%      & \qquad \cdot  \sum_{a_{t+1} \in \mathcal{A}, s_{t+2} \in \mathcal{S}} \pi_{t+1}(s_{t+1},a_{t+1}) \cdot p_{t+1}^m(s_{t+2} \mid s_{t+1},a_{t+1}) \cdot v_{t+2,m}^{\pi}(s_{t+2}) \Bigr) \\
% \end{aligned}
% \end{equation}
% In the base case with $t=1$, let us take the derive of $\rho(\pi)$ with respect to $\pi_1(\hat{s},\hat{a} )$, \cref{p:gradient-deter} holds.
% \begin{equation}
% \begin{aligned}
%  \frac{\partial\rho(\pi)}{\partial\pi_1(\hat{s},\hat{a})} &=\frac{\partial\displaystyle{ \left( \sum_{s_1 \in \mathcal{S},m \in \mathcal{M}} \lambda_m \cdot \mu(s_1) \cdot  \sum_{a_1 \in \mathrm{A}}\pi_1(s_1, a_1)(r_1^m(s_1,a_1) +\sum_{s_2 \in \mathrm{S}}p^m(s_2\mid s_1,a_1)\cdot v_{2,m}^\pi(s_2)) \right)}}{\partial \pi_1(\hat{s},\hat{a})}\\
%  &= \sum_{m \in \mathcal{M}} \lambda_m \cdot \mu(\hat{s}) \cdot q_{1,m}^\pi(\hat{s},\hat{a}) \\
%  &= \sum_{m \in \mathcal{M}}b_{1,m}^{\pi}(\hat{s}) \cdot q_{1,m}^\pi(\hat{s},\hat{a}) 
% \end{aligned}
% \end{equation}

% Since we assume $D^m(\hat{s},0) = 1$, at the initial time step $1$, 
% \[
% b_{1,m}^{\pi}(\hat{s})= \lambda_{m}\cdot \mu(\hat{s}) \cdot D(\hat{s},0)\\ 
% \]
% In the deductive case, assume that \cref{p:gradient-deter} holds for $t< T$, and show it also holds for $t+1$.
% let us take the derivative of $\rho(\pi)$ in \cref{proof:rho} with respect to $\pi_{t+1}(\hat{s},\hat{a})$.\\

% \begin{equation}
%     \begin{aligned}
%         \frac{\partial \rho(\pi)}{\partial \pi_{t+1}(\hat{s},\hat{a})} 
%       &=\sum_{s_1 \in \mathcal{S}, m \in \mathcal{M}} \lambda_m \cdot \mu(s_1) \cdot D^m(s_1,t-1) \cdot \sum_{a_t \in \mathcal{A}}\pi_{t}(s_t,a_t) p_t^m(\hat{s}\mid s_{t}, a_t)
%       \cdot  q_{t+1,m}^{\pi}(\hat{s},\hat{a})\\    
%         &= \sum_{ m \in \mathcal{M}, s_t \in \mathcal{S}} b_{t,m}^{\pi}(s_t) \cdot \sum_{a_t \in \mathcal{A}}\pi_t(s_t,a_t) p_t^m(\hat{s}\mid s_t, a_t)
%      \cdot  q_{t+1,m}^{\pi}(\hat{s},\hat{a}) 
%     \end{aligned}
% \end{equation}
% Then 
% \[
% b_{t+1,m}^{\pi}(\hat{s})= \displaystyle \sum_{s_{t}, a_t \in \mathcal{A} \times \mathcal{S}}  b_{t,m}^{\pi}(s_{t}) \cdot \pi_{t}( s_{t}, a_t) \cdot
%     p_{t}^{m}(\hat{s} \mid s_{t}, a_t)
% \]

% Then we have the policy gradient at time step $t+1$
% \[
% \frac{\partial \rho(\pi)}{\pi_{t+1}(\hat{s},\hat{a})}= \sum_{m \in \mathcal{M}} b_{t+1,m}^{\pi}(\hat{s})\cdot q_{t+1,m}^{\pi}(\hat{s},\hat{a})
% \]

\end{proof}

\section{Proof of Theorem~\ref{thm:improvement}}

\begin{proof}[Proof of \Cref{thm:improvement}]
  Assume some iteration $n$. The proof then follows directly from the construction of the policy $\pi^n$ from $\pi^{n-1}$. By the construction in \cref{eq:swsu-optimal-action}, we have that:
  \[
    \rho(\pi_1^{n-1}, \dots, \pi_{t-1}^{n-1}, \pi_t^n, \pi_{t+1}^{n}, \dots, \pi_T^{n})
    \; \ge\;  
    \rho(\pi_1^{n-1}, \dots, \pi_{t-1}^{n-1}, \pi_t^{n-1}, \pi_{t+1}^{n}, \dots, \pi_T^{n})\,.
  \]
Note that the optimal form of the policy in \cref{eq:swsu-optimal-action} follows immediately from the standard first-order optimality criteria over a simplex~(e.g.,~Ex.~3.1.2 in~\cite{Bertsekas2016nonlinear}) and the fact that the function optimized in \cref{eq:swsu-optimal-action} is linear (\cref{cor:ret-linear}). In particular, we have that
\[
  \pi_t^n \in \argmax_{\hat{\pi}_t \in \Real^{\mathcal{S} \times\mathcal{A}}}\rho(\pi_1^{n-1}, \dots, \hat{\pi}_t, \dots, \pi_T^{n})
\]
if and only if for each $s\in \mathcal{S}$ and $a\in \mathcal{A}$
\[
  \frac{\partial \rho(\pi_1^{n-1}, \dots, \pi^n_t, \dots, \pi_T^{n})}{\partial\pi_t(s,\pi_t^n(s))}
 \;  \ge  \; 
  \frac{\partial \rho(\pi_1^{n-1}, \dots, \pi^n_t, \dots, \pi_T^{n})}{\partial\pi_t(s,a)}\,.
\]
Intuitively, this means that the optimal policy $\pi^n_t$ must choose actions that have the \emph{maximum} gradient for each state. The optimization in \cref{eq:swsu-optimal-action} then follows by algebraic manipulation from \cref{thm:policy gradient}.

\end{proof}


\section{Proof of Theorem~\ref{thm:no-sublinear-regret}}

\begin{proof}[Proof of \cref{thm:no-sublinear-regret}]\label{proof:no-sublinear-regret}
Consider the MMDP illustrated in \cref{fig:simple-mmdp-1}. 

\begin{figure}
\centering
\begin{minipage}{.4\textwidth}
\begin{tikzpicture}[->,-Latex,>=Latex,font=\large,node distance=35mm,el/.style = {inner sep=2pt, align=left, sloped},
every label/.append style = {font=\tiny}]
\node[state] (2) {$2$};
\node[state, below of=2] (3) {$3$};
\node[state, below right of=2] (4) {$4$};
\node[state, below left of=2] (1) {$1$};
\draw  (2) edge[left] node[el,above]{$a=1,2$} (1)
  (1) edge[bend right, below] node[el,below,pos=0.7]{$a=1$} (2)
  (1) edge[bend left,below] node[el,above,pos=0.7]{$a=2$} (3)
  (3) edge[bend left,below] node[el,below]{$a=1,2$} (1);
\draw (4) edge[loop,above] node[el,above]{$a=1,2$} (4);
\end{tikzpicture}
\end{minipage}
\begin{minipage}{.4\textwidth}
\begin{tikzpicture}[->,-Latex,>=stealth,font=\large,node distance=35mm,el/.style = {inner sep=2pt, align=left, sloped},
every label/.append style = {font=\tiny}]
\node[state] (2) {$2$};
\node[state, below left of=2] (1) {$1$};
\node[state, below of=2] (4) {$4$};
\node[state, below right of=2] (3) {$3$};
\draw  (2) edge[left]node[el,above]{$a=1,2$} (1)
  (1) edge[bend right, above] node[el,below,pos=0.7]{$a=1$} (2)
  (1) edge[bend left, below] node[el,above,pos=0.7]{$a=2$} (4)
  (3) edge[loop,above] node[el,above]{$a=1,2$} (3)
  (4) edge[bend left, below] node[el,below]{$a=1,2$} (1);
\end{tikzpicture}
\end{minipage}
\caption{Left: model $m_{1}$, right: model $m_{2}$}
\label{fig:simple-mmdp-1}
\end{figure}

First, we describe the time steps, states, rewards, and actions for this MMDP. This MMDP has three time steps, four states $\mathcal{S} = \{1,2,3,4 \}$, two actions $\mathcal{A} = \{1,2\}$, and two models $\mathcal{M} = \{1,2\}$. The model weight for $m_{1}$ is $\lambda$, and the model weight for $m_{2}$ is $1-\lambda$. State $1$ is the only initial state. In model $m_{1}$, the only non-zero reward, $2$, is received upon reaching state $2$. In state $1$, action $1$ leads to a transition to state $2$ with probability 1, and action $2$ leads to a transition to state $3$ with probability 1. In state $2$, either action ($1$ or $2$) leads to a transition to state $1$ with probability 1. In state $3$, either action leads to a transition to state $1$ with probability 1. In state $4$, either action leads to a transition to state $4$ with probability 1.

In model $m_{2}$, the agent receives reward $3$ upon reaching state $4$ and reward $2$ upon reaching state $2$. In state $1$, action $1$ leads to a transition to state $2$ with probability 1, and action $2$ leads to a transition to state $4$ with probability 1. In state $2$, either action ($1$ or $2$) leads to a transition to state $1$ with probability 1. In state $4$, either action leads to a transition to state $1$ with probability 1. In state $3$, either action leads to a transition to state $3$ with probability 1.

Now, let us analyze the regret of this MMDP. The optimal policy of the above example is a history-dependent policy: the agent takes action $2$ at time step $1$. At time step $2$, the agent takes action $1$ or $2$, which leads to a transition back to state $1$. From time step $3$ onward, if the agent is in model $m_{1}$, it takes action $1$; if the agent is in model $m_{2}$, it takes action $2$.

Next, let us analyze the regret of a Markov policy for the MMDP. Let $S_{t}$ denote the state and $A_{t}$ the action at time step $t$. In state $1$, the agent has two options: select action $1$ or select action $2$. If action $1$ is selected, this will give a regret value of $0$ in model $1$ and a regret value of $1$ in model $2$. If action $2$ is selected, this will give a regret value of $2$ in model $1$ and a regret value of $0$ in model $2$. Therefore, at time step $1$, the total regret is $2\lambda$ (for action $2$) or $1-\lambda$ (for action $1$). At time step $2$, the agent takes action $1$ or $2$ in state $S_{2}$ (which is $2$, $3$, or $4$), which leads to a transition back to state $1$, and gets zero rewards and zero regrets. The procedure then repeats. At time step $3$, the agent can take action $1$ or action $2$ in state $1$ again. For $T=3$, the trajectory of a Markov policy can be $(S_{1} =1, A_{1} = 1, S_{2},A_{2}, S_{3} =1,A_{3} =1)$, $(S_{1} =1, A_{1} = 1, S_{2},A_{2}, S_{3} =1,A_{3} =2)$, $(S_{1} =1, A_{1} = 2, S_{2},A_{2}, S_{3} =1,A_{3} =1)$, or $(S_{1} =1, A_{1} = 2, S_{2},A_{2}, S_{3} =1,A_{3} =2)$. The accumulated regret can be $2\lambda + (1-\lambda)$, $2\lambda + 2\lambda$, or $(1-\lambda) + (1-\lambda)$. That is, the regret is increased by $1-\lambda$ or $2\lambda$ for every two time steps.
\[
R_t(\pi) \geq \frac{\min \{2\lambda, 1-\lambda\}}{2} \cdot t 
\]
Let $c = \frac{\min \{2\lambda, 1-\lambda\}}{2}$ and $t' \geq 2$. Then we always have
\[
R_t(\pi) \geq c \cdot t \quad \text{for all} \quad  t \geq t'\,.
\]

No matter which Markovian policy the agent follows, the accumulated regret will be linear with respect to $t$. Therefore, for this MMDP, there exists no Markovian policy that achieves sub-linear regret.
\end{proof}

\section{Adapted MixTS Algorithm}
The adapted MixTS algorithm is formalized in \cref{alg:MixTS}. $P_0$ is the prior of MDPs and follows the uniform distribution. At the beginning of episode $t$, sample an MDP $M_t$ from the posterior $P_t$ and compute a policy $\pi_t$ that maximizes the value of $M_t$. Then at each time step $h$, take the action $A_h$ based on the policy $\pi_t$ and obtain reward $Y_h$. For each MDP $m \in \mathcal{M}$, update its posterior based on the received rewards.
\begin{algorithm}
   \caption{Adapted MixTS}
   \label{alg:MixTS}
   \textbf{Input}: The prior of MDPs $P_0$ 
\begin{algorithmic}[1]
   \STATE Initialize $P_1 \gets P_0$
   \FOR{episodes $t = 1, \dots, \mathcal{N}$}
   \STATE Sample $M_{t}\sim P_{t}$
   \STATE Compute $\pi_{t} = \pi^{M_{t}}$
   \FOR{time steps $h = 1, \dots, H$}
   \STATE Select $A_{h} \gets \pi_{t}(S_{h})$
   \STATE Observe reward $Y_{h}$
   \STATE Update $P_{t+1}(m) \propto P_{t}(m)P(Y_{h} \mid A_{h};m), \forall m \in \mathcal{M}$
   \ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}



\section{Numerical Results: Details}

\subsection{Domain Details}
The CSV files of all domains are available at \href{https://github.com/suxh2019/CADP}{https://github.com/suxh2019/CADP}. ``initial.csv'' specifies the initial distribution over states. ``parameters.csv'' contains the discount factor. ``training.csv'' and ``test.csv'' have the following columns: ``idstatefrom'', ``idaction'', ``idstateto'', ``idoutcome'', ``probability'', and ``reward''. Each row entry specifies a transition from ``idstatefrom'' after taking an action ``idaction'' to state ``idstateto'' with the associated ``probability'' and ``reward'' in model ``idoutcome''. A policy is computed from the ``training.csv'', and the policy is evaluated on the ``test.csv''. The models are identified with integer values $0, \dots, M-1$, and each model is defined on the same state space and the action space. The states are identified with integer values $0, \dots, S-1$, and the actions are identified with integer values $0, \dots, A-1$. Note that the number of actions taken in each state $s$ is less than or equal to $A$. Each MDP model has its unique reward functions and transition probability functions.

\subsection{Additional Simulation Results}
\Cref{app:returns} shows mean returns of algorithms on five domains at different time steps. \Cref{app:standard-deviation} shows the standard deviations of returns of algorithms on five domains at different time steps. The algorithm ``Oracle'' knows the true model, and its standard deviation summarizes the variability of MDP models in an MMDP. The standard deviations of other algorithms include both the variability of MDP models and the variability of a policy in an MDP model. \Cref{app:runtimes} shows runtimes of algorithms on five domains at different time steps. CADP performs best with some runtime penalty.
\begin{table*}
  \centering
  \caption{Mean Returns $\rho(\pi)$ on the Test Set of Policies $\pi$ Computed by Each Algorithm.}\label{app:returns}
\begin{tabular}{lrrrrrrrrrr}
\toprule
  \textbf{Algorithm}  \bfseries  & \multicolumn{2}{c}{\textbf{RS}} \bfseries & \multicolumn{2}{c}{\textbf{POP}}  \bfseries & \multicolumn{2}{c}{\textbf{POPS}}\bfseries & \multicolumn{2}{c}{\textbf{INV}} & \multicolumn{2}{c}{\textbf{HIV}} \\
 \bfseries   &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 5  \bfseries & T =20\\
  \midrule
 \textbf{CADP}   &\textbf{207}  & \textbf{207}  & \textbf{-368} &\textbf{-368} &\textbf{-1082} &\textbf{-1082} &348 &\textbf{350} &\textbf{33348} &\textbf{42566}\\
   WSU               &206   &206  &-551     &-551   &-1934 &-1932 &347 &349   &\textbf{33348} &42564\\
 MVP               &204   &204  &-717     &-717   &-2178 &-2179 &348 &\textbf{350}  &\textbf{33348} &42564 \\ 
\midrule
 Mirror            &183   &183  &-1601    &-1600  &-3810 &-3800 &343 &345   &\textbf{33348} &\textbf{42566}\\
 Gradient          &206   &206  &-551     &-551   &-1934 &-1932 &347 &349   &\textbf{33348} &42564\\
\midrule
 MixTS             &172   &176  &-1961    &-1711  &-3042 &-3016 &\textbf{350} &\textbf{350}  &293 &-1026\\
QMDP               &201   &183  & -       &-      &-     &-     &-   &-     &30705  &39626\\ 
POMCP              &54      & 64    & -       &-      &-     &-     &-   &-     &25794  &30910\\
  \midrule
 Oracle     &213  & 213 &-172     &-172   &-894  &-894  &358 &360   &40159  &53856\\
  \bottomrule
\end{tabular}
\end{table*}

\begin{table*}
  \centering
  \caption{Standard Deviation of Returns of Algorithms on Five Domains.}\label{app:standard-deviation}
\begin{tabular}{lrrrrrrrrrr}
\toprule
  \textbf{Algorithm}  \bfseries  & \multicolumn{2}{c}{\textbf{RS}} \bfseries & \multicolumn{2}{c}{\textbf{POP}}  \bfseries & \multicolumn{2}{c}{\textbf{POPS}}\bfseries & \multicolumn{2}{c}{\textbf{INV}} & \multicolumn{2}{c}{\textbf{HIV}} \\
 \bfseries   &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 5  \bfseries & T =20\\
\midrule
\textbf{CADP}     &98   &98   &\textbf{1095}   & \textbf{1095}     &\textbf{2007} &\textbf{2007}  &\textbf{51}  &\textbf{51} & 9342 &\textbf{11309} \\
 MVP                &90   &90   &2046   &2046  &3619   &3620  &52  &52 &\textbf{7729} &12234 \\
 WSU                &100  &100  &1364   &1364  &3147   &3146  &53  &53 &\textbf{7729}  & 12234\\
\midrule
 Mirror             &\textbf{70}  &\textbf{70}   &2081   &2081  &4534   &4530  &57  &58 &\textbf{7729} &12237 \\
 Gradient           &100  &100  &1364   &1364  &3147   &3146  &53  &53 &\textbf{7729}  & 12234\\
  \midrule
MixTS             &226  &231   &4436  &4187   &5507   &5542  &58  &58 &23689  &27792 \\
QMDP               &193  &204   & -     &-     &-      &-     &-   &-  &42987  &61596 \\
POMCP              &66   &118   &-      &-     &-      &-     &-   &-  &42208  &57772 \\
  \midrule
 Oracle    &95   &95   &1045   &1045  &1889   &1889  &51  &51 &9029  &14796 \\
  \bottomrule
\end{tabular}
\end{table*}

\begin{table*}
  \centering
  \caption{Run-times of Algorithms on Five Domains in Minutes.}\label{app:runtimes}
\begin{tabular}{lrrrrrrrrrr}
\toprule
  \textbf{Algorithm}  \bfseries  & \multicolumn{2}{c}{\textbf{RS}} \bfseries & \multicolumn{2}{c}{\textbf{POP}}  \bfseries & \multicolumn{2}{c}{\textbf{POPS}}\bfseries & \multicolumn{2}{c}{\textbf{INV}} & \multicolumn{2}{c}{\textbf{HIV}} \\
 \bfseries   &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 100  \bfseries & T =150  &  T = 50  \bfseries & T =100\\
\midrule
 MVP               &\textbf{0.05}  &\textbf{0.05}  &\textbf{27.68}   &\textbf{27.51}   &\textbf{0.36}   &\textbf{0.36}   &\textbf{0.22}  &\textbf{0.22}   &\textbf{0.0003}  &\textbf{0.0003}\\
 WSU               &0.12  &0.14  &40.02   &45.39   &1.53   &2.37   &0.67  &0.89   &0.0033  &0.0048\\
\textbf{CADP}              &0.52  &1.13  &124.39  &173.04  &12.12  &16.21  &1.53  &2.22   &0.0109  &0.0164\\
\midrule
 Mirror            &1.86  &3.11  &113.08  &158.06  &8.08   &11.90  &35.90 &53.6   &0.0221  &0.0330\\
 Gradient          &0.51  &0.74  &56.82   &69.32   &2.97   &4.31   &1.12  &1.44   &0.0083  &0.0123\\
  \midrule
MixTS             &0.09  &0.12  &32.08   &35.36   &0.80   &1.03   &0.47  &0.59   &0.0033  &0.0047\\
QMDP               &712   & 712  & -      &-       &-      &-      &-     & -     &0.7071  &0.7071\\
POMCP             &68    &68    & -      &-       &-      &-      &-     &-      &0.2066  &0.2066\\
  \bottomrule
\end{tabular}
\end{table*}




% \begin{table*}[ht]
%   \centering
%   \caption{Mean Returns $\rho(\pi)$ on the Test Set of Policies $\pi$ Computed by Each Algorithm.}\label{app:returns}
% \begin{tabular}{lrrrrrrrrrr}
% \toprule
%   \textbf{Algorithm}  \bfseries  & \multicolumn{2}{c}{\textbf{RS}} \bfseries & \multicolumn{2}{c}{\textbf{POP}}  \bfseries & \multicolumn{2}{c}{\textbf{POPS}}\bfseries & \multicolumn{2}{c}{\textbf{INV}} & \multicolumn{2}{c}{\textbf{HIV}} \\
%  \bfseries   &  T = 50  \bfseries & T =150  &  T = 50  \bfseries & T =150  &  T = 50  \bfseries & T =150  &  T = 50  \bfseries & T =150  &  T = 5  \bfseries & T =20\\
%   \midrule
%  \textbf{CADP}   &\textbf{204}  & \textbf{207}  & \textbf{-361} &\textbf{-368} &\textbf{-1067} &\textbf{-1082} &323 &\textbf{350} &\textbf{33348} &\textbf{42566}\\
%    WSU               &203   &206  &-542     &-551   &-1915 &-1932 &323 &349   &\textbf{33348} &42564\\
%  MVP               &201   &204  &-704     &-717   &-2147 &-2179 &323 &\textbf{350}  &\textbf{33348} &42564 \\ 
% \midrule
%  Mirror            &181   &183  &-1650    &-1600  &-3676 &-3800 &314 &345   &\textbf{33348} &\textbf{42566}\\
%  Gradient          &203   &206  &-542     &-551   &-1915 &-1932 &323 &349   &\textbf{33348} &42564\\
% \midrule
%  MixTS             &167   &176  &-1761    &-1711  &-2857 &-3016 &\textbf{327} &\textbf{350}  &293 &-1026\\
% QMDP               &190   &183  & -       &-      &-     &-     &-   &-     &30705  &39626\\ 
% POMCP              &58      & 64    & -       &-      &-     &-     &-   &-     &25794  &30910\\
%   \midrule
%  Oracle     &210  & 213 &-168     &-172   &-882 &-894  &332 &360   &40159  &53856\\
%   \bottomrule
% \end{tabular}
% \end{table*}


%\clearpage

%\bibliography{uai2023-template}
%\bibliography{su_214}

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
