%latex import preamble from another file
%latex xr package preamble other file

%\documentclass{article}

% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
% The xr package provides \externaldocument, which makes the labels of another
% document available to \ref/\eqref in this one.
\usepackage{xr}
\makeatletter
% \addFileDependency{<file name including extension>}
%   Prints "(<file>)" via \typeout, appends the file to LaTeX's file list with
%   \@addtofilelist, and prints a notice when the file does not exist.
%   Presumably the parenthesised name is there so that build tools which scan
%   the log pick the file up as a dependency -- confirm against your build setup.
%   Every line ends in % so that calling the macro inserts no stray spaces.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother
% \myexternaldocument{<base name without extension>}
%   Imports the labels of <base name> through \externaldocument and registers
%   both <base name>.tex and <base name>.aux with \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

\myexternaldocument{nelson_697}
%\myexternaldocument{File2}

\usepackage[utf8]{inputenc}

\usepackage{amsmath,amssymb,amsthm}
\usepackage{xcolor}

\usepackage[ruled]{algorithm}
%\usepackage[ruled,vlined]{algorithm2e}
\usepackage{algpseudocode}

\usepackage{hyperref}

\usepackage{chngcntr}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Useful Math
% Theorem-like environments (amsthm).
% Numbering scheme, as fixed by the \newtheorem arguments below:
%   - theorem owns the main counter; every environment declared with the
%     optional argument [theorem] shares that counter.
%   - assumption and remark each have their own independent counter.
%   - corollary is numbered within theorem (its counter resets at each theorem).
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{hypothesis}[theorem]{Hypothesis}
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{definition}[theorem]{Definition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{example}[theorem]{Example}
\newtheorem{notation}[theorem]{Notation}
% Note: the printed heading of the trick environment is "Trick Result".
\newtheorem{trick}[theorem]{Trick Result}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{remark}{Remark}

% customthm: a "Theorem" whose printed number is supplied by the caller.
%   Usage: \begin{customthm}{<label>} ... \end{customthm}
%   The helper environment innercustomthm has its own counter; its printed
%   form (\theinnercustomthm) is overridden with <label> before the
%   environment starts.
%   Presumably intended for restating a result under the number it carries
%   elsewhere -- confirm against the call sites.
\newtheorem{innercustomthm}{Theorem}
\newenvironment{customthm}[1]
  {\renewcommand\theinnercustomthm{#1}\innercustomthm}
  {\endinnercustomthm}
  
% customassumption: an "Assumption" whose printed number is supplied by the caller.
%   Usage: \begin{customassumption}{<label>} ... \end{customassumption}
%   Same construction as customthm: the helper environment
%   innercustomassumption has its own counter, and its printed form is
%   overridden with <label> before the environment starts.
\newtheorem{innercustomassumption}{Assumption}
\newenvironment{customassumption}[1]
  {\renewcommand\theinnercustomassumption{#1}\innercustomassumption}
  {\endinnercustomassumption}

% Appendix numbering. \AtAppendix (apptools) defers its argument until
% \appendix is issued. From that point on, the theorem and assumption
% counters are numbered within sections (prefixed by the section number and
% reset at every \section).
\usepackage{apptools}
\AtAppendix{\counterwithin{theorem}{section}}
\AtAppendix{\counterwithin{assumption}{section}}
% The two lines below stay disabled: lemma shares the theorem counter and
% corollary is already numbered within theorem, so neither has a separate
% counter that needs to be tied to the section.
%\AtAppendix{\counterwithin{lemma}{section}}
%\AtAppendix{\counterwithin{corollary}{section}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Elliot's definitions:

% Equation shorthands: \be ... \ee stands for \begin{equation} ... \end{equation}.
\newcommand{\be}{\begin{equation}}
\newcommand{\ee}{\end{equation}}
%\newcommand{\ba}{\begin{align}}
%\newcommand{\ea}{\end{align}}
% Abbreviations for arrows and for \nonumber.
\newcommand{\rarr}{\rightarrow}
\newcommand{\larr}{\leftarrow}
\newcommand{\nn}{\nonumber}
% CAUTION: \def performs no redefinition check, so the definitions below
% silently replace existing commands. After this point \[ ... \] and
% \( ... \) do NOT open/close display or inline math in this document; they
% are auto-sized brackets and parentheses (\left[ ... \right] and
% \left( ... \right)). They must be used in matched pairs, inside math mode.
% Use equation/align (or \be ... \ee) for displays and $ ... $ for inline math.
\def\[{\left[}
\def\]{\right]}
\def\({\left(}
\def\){\right)}
% Angle brackets.
% NOTE(review): \> normally has its own meaning (a math space in the LaTeX
% kernel, a tab command inside tabbing) -- confirm neither is needed here.
\def\<{\langle}
\def\>{\rangle}

% \bbone: the digit 1 typeset in the bbold font (encoding U, family bbold),
% wrapped in \text so that it scales with the surrounding math style.
\DeclareRobustCommand{\bbone}{\text{\usefont{U}{bbold}{m}{n}1}}
% Named operators. \argmax is declared as a math operator rather than as
% \text{argmax}: \text inherits the surrounding text font (so it would turn
% italic inside theorem bodies) and provides no operator spacing.
% The non-starred form keeps sub/superscripts beside the operator, as before.
% To place them underneath in display style, switch to the starred variant:
%\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator\erf{erf}
\DeclareMathOperator{\argmax}{argmax}

% Blackboard-bold symbols.
\newcommand{\EE}{\mathbb{E}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\N}{\mathcal{N}} % calligraphic N for normal distributions; takes no arguments, so write the parameters at the use site, e.g. \N(0,1)
% Arrow-accented vectors.
\newcommand{\xv}{\vec{x}}
\newcommand{\av}{\vec{a}}
\newcommand{\rv}{\vec{r}}
\newcommand{\zv}{\vec{z}}

% NOTE: \R is the calligraphic regret symbol, not the real numbers (use \RR for those).
\newcommand{\R}{\mathcal{R}} % regret

% quantities for latent bandit problem setting
\newcommand{\A}{\mathcal{A}} % action space
\newcommand{\Z}{\mathcal{Z}} % latent space
\newcommand{\muhat}{\hat{\mu}} % estimated mean reward parameter
% True (starred) mean reward parameters, indexed by action in the superscript:
\newcommand{\mustara}{\mu_\star^{(a)}} % for action a
\newcommand{\mustaraprime}{\mu_\star^{(a')}} % for action a'
\newcommand{\mustarastar}{\mu_\star^{(a^\star)}} % for action a^\star

% quantities for linear bandit problem setting
\newcommand{\ctx}{c} % linear bandit context vector
\newcommand{\ctxspace}{\mathcal{C}}
\newcommand{\ctxdim}{d} % if we change this to Z, check usage...
\newcommand{\ctxstar}{\ctx^\star}
\newcommand{\ctxhat}{\hat{\ctx}}
%\newcommand{\ctxmatrix}{C}
%\newcommand{\ctxv}{\vec{\ctx}}
\newcommand{\Pct}{P_{\ctx}^{(t)}} % probability density function for \ctx at time t
% NOTE: \pc prints \rho; the macro \rhoaa defined further down prints the same symbol.
\newcommand{\pc}{\rho}
\newcommand{\pct}{\pc^{(t)}} % \pc with a time superscript

% quantities in estimator error bound derivation
% (Upright subscript labels use \mathrm; the old two-letter font switch \rm
%  is obsolete and not provided by every document class.)
\newcommand{\rhoeq}{\rho^{(\phi)}_{\mathrm{eq}}} % equilibrium (stationary) distribution of the latent Markov process
\newcommand{\biasmatrix}{A}
\newcommand{\error}{g} % error vector g^{(a)} in \hat{\mu}^{(a)} - \mu_\star^{(a)} = (B^{(a)})^{-1} g^{(a)}
\newcommand{\dta}{\delta_t^{(a)}}
\newcommand{\dtpa}{\delta_{t'}^{(a)}}
\newcommand{\rstdev}{\sigma}
%\newcommand{\covreward}{\Sigma}
\newcommand{\cov}{\Omega}
\newcommand{\covhat}{\hat{\cov}}
\newcommand{\covinv}{B} %redefine as needed
%\newcommand{\covinveqa}{B^{(a)}_{\rm eq}}
%\newcommand{\fmu}{f_\mu}
% \covinvmu currently prints exactly what \covinv prints; the variant with a
% \mu subscript is kept in the trailing comment.
\newcommand{\covinvmu}{\covinv} %\covinv_{\mu}
\newcommand{\Beq}{\bar{B}}
\newcommand{\udelta}{U_\delta} %%% TEMPORARY
\newcommand{\udeltaoverZ}{U_{\delta/Z}} %%% TEMPORARY
\newcommand{\Dphitau}{D_\phi(\tau_1)} %%% TEMPORARY

% quantities in regret bound
% (Upright subscript labels use \mathrm; the old two-letter font switch \rm
%  is obsolete and not provided by every document class.)
\newcommand{\gapadv}{\Delta_{\mathrm{likely}}}
\newcommand{\gapworst}{\Delta_{\mathrm{worst}}}
% quantities in regret bound derivation
\newcommand{\dR}{\delta\mathcal{R}}
\newcommand{\Rconst}{\mathcal{R}_0}
\newcommand{\dctx}{\delta\ctx}
\newcommand{\dmu}{\nu} % denotes a small perturbation to \mu
\newcommand{\threshold}{\dmu} % free parameter in the regret derivation
% \deltamu, \deltacov and \deltactx print \delta_1, \delta_2 and \delta_3; they
% appear as the subscripts of \Umu, \Ucov and \Uctx respectively.
\newcommand{\deltamu}{\delta_1}
\newcommand{\deltacov}{\delta_2}
\newcommand{\deltactx}{\delta_3}
\newcommand{\deltaR}{\delta_\R}
% U symbols: subscript = one of the deltas above, superscript = the estimator concerned.
\newcommand{\Umu}{U_{\deltamu}^{(\muhat)}}
\newcommand{\Ucov}{U_{\deltacov}^{(\covhat)}}
\newcommand{\Uctx}{U_{\deltactx}^{(\ctxhat)}}
\newcommand{\Uerror}{U}
\newcommand{\Uexponent}{\alpha}
\newcommand{\Umunorm}{u_{\mu}}
\newcommand{\Uctxnorm}{u_{\ctx}}
\newcommand{\UR}{U_{\mathcal{R}}}
\newcommand{\rhoat}{\rho^{(t)}_a}
% NOTE: \rhoaa prints a bare \rho, the same symbol as \pc.
\newcommand{\rhoaa}{\rho}
%\newcommand{\Ugap}{\Delta_{\rm max}}

% quantities for least-squares transition matrix estimation
%\newcommand{\CE}{H} %% in case we want to quickly replace H with some other notation

% other quantities
\newcommand{\rand}{\eta} % a variable to denote random Gaussian noise ~N(0,1)
\newcommand{\randchi}{x} % chi-squared random variable

% UNCATEGORIZED quantities
% (Upright subscript labels use \mathrm; the old two-letter font switch \rm
%  is obsolete and not provided by every document class.)
\newcommand{\ppeq}{W}
\newcommand{\ca}{\kappa}
\newcommand{\cb}{\tilde{\kappa}}
%%% \newcommand{\cc}{\tilde{\kappa}}
\newcommand{\cdkl}{\zeta_{\phi}}
\newcommand{\cdklstar}{\zeta_{\phi^\star}}
\newcommand{\stdeveq}{\sigma_{\mathrm{eq}}}
\newcommand{\stdevz}{\sigma_z}
\newcommand{\stdevza}{\sigma^{(a)}_z}
\newcommand{\stdevzmax}{\sigma_{\mathrm{max}}}
\newcommand{\qtilde}{\tilde{q}}
\newcommand{\rhomin}{\rho_{\mathrm{min}}}
\newcommand{\coeff}{b}
\newcommand{\phat}{\hat{p}} % model posterior over the latent state
\newcommand{\pmin}{p_{\mathrm{min}}}
\newcommand{\one}{\mathbf{1}} % used as an indicator, e.g. \one(x_t=k)
\newcommand{\bzero}{\mathbf{0}}
\newcommand{\pluseq}{\mathrel{+}=} % the "+=" update sign
% \eq is deliberately left in its original form: \mathrm only works in math
% mode, and whether \eq is also used in running text cannot be checked from
% this file alone. TODO: switch to \mathrm once its call sites are confirmed.
\newcommand{\eq}{{\rm eq}}
\newcommand{\tp}{\top} % transpose superscript, as in (p_t)^\tp
\newcommand{\eps}{\epsilon}
% Bold (matrix/vector) variants of plain symbols.
\newcommand{\x}{\mathbf{x}}
\newcommand{\pb}{\mathbf{p}}
\newcommand{\rb}{\mathbf{r}}
\newcommand{\mub}{\boldsymbol\mu}
\newcommand{\Wb}{\mathbf{W}}
\newcommand{\Pb}{\mathbf{P}}
\newcommand{\Nv}{\vec{N}}
\newcommand{\Nmin}{N_{\mathrm{min}}}

% TEMPORARY / INTERMEDIATE:
% Placeholder names for intermediate quantities; each one expands to a plain
% symbol, so the notation can be changed here in one place.
\newcommand{\epsbar}{\bar{\epsilon}}
\newcommand{\epstemp}{\tilde{\epsilon}}
\newcommand{\yy}{y} % free parameter optimized in derivation
\newcommand{\ua}{u_1}
\newcommand{\ub}{u_2}
\newcommand{\uu}{u}
\newcommand{\vv}{v}
% NOTE: \xparallel prints a plain x, the same symbol as \randchi.
\newcommand{\xparallel}{x}

% colors:
% One-argument highlighting macros. Only \blue actually colors its argument;
% \cyan, \red, \gray and \new are all set to black, so text wrapped in them
% prints like ordinary text. Presumably this is revision markup that was
% neutralized for the final version -- change the color name here to make
% the marked passages visible again.
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
\newcommand{\cyan}[1]{\textcolor{black}{#1}}
\newcommand{\red}[1]{\textcolor{black}{#1}}
\newcommand{\gray}[1]{\textcolor{black}{#1}}
\newcommand{\new}[1]{\textcolor{black}{#1}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Linearizing Contextual Bandits with Latent State Dynamics (Supplementary material)}

%\author[1]{\href{mailto:<enelson@ibm.com>?Subject=Your UAI 2022 paper}{Elliot~Nelson}{}}
\author[1]{Elliot~Nelson}
\author[1]{Debarun~Bhattacharjya}
\author[1]{Tian~Gao}
\author[1]{Miao~Liu}
\author[1]{Djallel~Bouneffouf}
\author[2]{Pascal~Poupart}
% Add affiliations after the authors
\affil[1]{%
    IBM T. J. Watson Research Center\\
    Yorktown Heights, NY, USA
}
\affil[2]{%
    David R. Cheriton School of Computer Science\\
    University of Waterloo\\
    Waterloo, ON, Canada
}

\begin{document}

\onecolumn

\appendix

\maketitle

\numberwithin{equation}{section}
\counterwithin{figure}{section}

\section{Online Expectation Maximization for Hidden Markov models}
\label{app:online_em}

% Notation local to the online EM appendix (defined in the document body, so
% these macros are only available from this point on).
\newcommand{\rhat}{\hat{r}} % appears below as \rhat_t(z|z')
% Auxiliary online-EM statistics for the transition (\phi) and context (\theta) parameters:
\newcommand{\rhophi}{\hat{\rho}^{(\phi)}}
\newcommand{\rhotheta}{\hat{\boldsymbol\rho}^{(\theta)}} % bold: vector-valued
\newcommand{\suffstat}{\mathbf{s}} % sufficient statistic s(x) of a context observation
\newcommand{\Sboldtheta}{\mathbf{\hat{S}}^{(\theta)}} % bold: the whole vector
\newcommand{\Stheta}{\hat{S}^{(\theta)}} % non-bold: used with a component index
% NOTE: \mux prints \nu, the same symbol as the preamble macro \dmu.
\newcommand{\mux}{\nu} % context-distribution parameter (multinomial probabilities or Gaussian mean)
\newcommand{\muxhat}{\hat{\mux}}
\newcommand{\varx}{\Sigma} % variance of the Gaussian context distribution
\newcommand{\varxhat}{\hat{\varx}}

In Sections~\ref{app:multinomial_em}--\ref{app:gaussian_em} below, we describe the online EM algorithms used (by both L$^2$TS and L$^2$UCB) in our experiments.

These online EM algorithms involve updating the model posterior over the latent state with Bayes' rule,\footnote{The $\propto$ sign indicates equality up to a normalizing constant.}
\be
\phat_t(z) \propto \sum_{z'} \phat_{t-1}(z')
\hat{\phi}^{(t-1)}_{z,z'}p(x_t|z;\hat{\theta}^{(t-1)})
\ee
using the current parameter estimates $(\hat{\phi}^{(t-1)},\hat{\theta}^{(t-1)})$. (These updates are shown in Eqs. \eqref{eq:phat_update_discrete} and \eqref{eq:phat_update_gaussian} below, in the case of multinomial and Gaussian context distributions, respectively.)

In both cases, online EM uses a discount factor $\gamma_t\in(0,1)$ which is used to control the magnitude of parameter estimate updates over time. The rate at which $\gamma_t$ approaches zero as $t\rarr\infty$ controls the discounting of previously observed context data. %with $\gamma_t\propto1/t$ corresponding to the limiting case in which [there is no discounting of old data]
(In our experiments we use $\gamma_t=t^{-0.6}$.)

While we focus on Gaussian distributions in the case of continuous context data, the online EM algorithm of \cite{cappe2011online} applies more generally to context distributions $p(x|z)$ in the exponential family.

\subsection{Multinomial context distributions}
\label{app:multinomial_em}

For multinomial context distributions with $x\in\{1,...,X\}$, we define 
$\hat{\theta} = \{\hat{\mux}_{j,i}\}$
where $\hat{\mux}_{j,i} := p(x=i|z=j)$ satisfies $\sum_{i=1}^X\hat{\mux}_{j,i}=1$.
We use the algorithm of \cite{mongillo2008online} -- reproduced in Eqs. \eqref{eq:phat_update_discrete}-\eqref{eq:mux_update_discrete} below -- to implement the online EM update in L$^2$TS (Algorithm~\ref{alg:llts}) and L$^2$UCB (Algorithm~\ref{alg:llucb}).
We define $\text{OnlineEM}(x,\hat{\theta}^{(t-1)},\hat{\phi}^{(t-1)},\phat_{t-1},\hat{\psi}_{t-1})$ as the function which returns $(\hat{\theta}^{(t)},\hat{\phi}^{(t)},\hat{\psi}_{t})$,
where (in the categorical case) $\hat{\theta}^{(t)}=\{\hat{\mux}^{(t)}_{j,i}\}$, $\hat{\phi}^{(t)}$, \cyan{and $\hat{\psi}_t=\{\hat{\rho}^{(t)}_{i,j,h}(k)\}$ are} computed as in Eqs. \eqref{eq:mux_update_discrete}, \eqref{eq:phi_update_discrete}, and \eqref{eq:rho_update_discrete} respectively.

\iffalse
\begin{algorithm}
    \caption{$\text{OnlineEM}(x,\hat{\theta}^{(t-1)},\hat{\phi}^{(t-1)},\phat_{t-1},\hat{\rho}_{t-1})$ (Multinomial HMM)}\label{alg:hmm_discrete}
    Update\\
    Return
\end{algorithm}
\fi

\begin{align}
    \phat_t(z) &\propto %CHECKED
    \sum_{z'} \phat_{t-1}(z')
    \hat{\phi}^{(t-1)}_{z,z'} \hat{\mux}^{(t-1)}_{z,x_t} \label{eq:phat_update_discrete} \\
    \hat{\rho}^{(t)}_{i,j,h}(k) &= %CHECKED
    \sum_l\Gamma_{l,h}(x_t)\((1-\gamma_t)\hat{\rho}^{(t-1)}_{i,j,l}(k) +  \gamma_t\one(x_t=k)\one(i=l)\one(j=h)\phat_{t-1}(l) \) \label{eq:rho_update_discrete} \\
    & \text{where \ } \Gamma_{i,j}(x_t) = %(CHECKED)
    \frac{\hat{\phi}^{(t-1)}_{i,j}\hat{\mux}^{(t-1)}_{j,x_t}}{\sum_{i',j'}\hat{\phi}^{(t-1)}_{i',j'}\hat{\mux}^{(t-1)}_{j',x_t}\phat_{t-1}(i')} \nn \\
    \hat{\phi}^{(t)}_{j,i} &\propto %CHECKED
    \sum_{k=1}^X\sum_{h=1}^Z\hat{\rho}^{(t)}_{i,j,h}(k) \label{eq:phi_update_discrete} \\
    \hat{\mux}^{(t)}_{j,k} &\propto %CHECKED
    \sum_{i,h=1}^Z\hat{\rho}^{(t)}_{i,j,h}(k) \label{eq:mux_update_discrete}
\end{align}

In the updates to $\phat_t$, $\hat{\phi}^{(t)}$, and $\hat{\mux}^{(t)}$ above, the $\propto$ sign indicates equality up to the normalizing factors required to ensure that $\sum_z\phat_t(z)=1$, $\sum_{z'}\hat{\phi}^{(t)}_{z',z}=1$, or $\sum_{i=1}^X\hat{\mux}_{j,i}=1$.

\subsection{Gaussian context distributions}
\label{app:gaussian_em}

For Gaussian context distributions $p(x|z;\hat{\theta})$, the parameters are means and variances, $\hat{\theta} = \{\muxhat_z,\varxhat_z\}_1^Z$, conditional on each latent state $z$.
In this case, we use Algorithm 1 of \cite{cappe2011online} to implement the online EM parameter update in L$^2$TS.
This algorithm is reproduced as follows, largely following the notation in \cite{cappe2011online}, with some modifications to maintain consistency with our notation in the main text.\footnote{In particular, \cite{cappe2011online} uses $\hat{\phi}$ to denote the posterior probability vector which we call $\phat$, and uses $q$ to denote the latent transition probabilities $\hat{\phi}$.}
We assume for simplicity that $x_t\in\RR$ so that $\muxhat^{(t)}_z$ is univariate. (The expressions in \cite{cappe2011online} apply also to the multivariate case.)

We again define $\text{OnlineEM}(x,\hat{\theta}^{(t-1)},\hat{\phi}^{(t-1)},\phat_{t-1},\hat{\psi}_{t-1})$ as the function which returns $(\hat{\theta}^{(t)},\hat{\phi}^{(t)},\hat{\psi}_t)$,
where now, in the Gaussian case,  $\hat{\theta}^{(t)}=\{\muxhat^{(t)}_z,\varxhat^{(t)}_z\}$, $\hat{\phi}^{(t)}$, \cyan{and $\hat{\psi}_t=\{\rhophi_t(i,j,k),\rhotheta_t(i,k)\}$ are} computed as in Eqs. \eqref{eq:mux_update_gaussian}-\eqref{eq:varx_update_gaussian}, \eqref{eq:phi_update_gaussian}, and \eqref{eq:rho_phi}-\eqref{eq:rho_theta}, respectively.
These updates involve the quadratic sufficient statistic, $\suffstat(x) = [1,x,x^2]$, for context observations $x\sim p(\cdot|z;\theta^\star)$.
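In Eqs.~\eqref{eq:rho_theta} and \eqref{eq:S_theta} below, $\rhotheta_t(i,k)$ and $\Sboldtheta_t(i)$ are vectors with the same dimension as the sufficient statistic $\suffstat(x)$, which we indicate with bold symbols. The non-bold $\Stheta_{t,n}(z)$, $n\in\{0,1,2\}$, in Eqs.~\eqref{eq:mux_update_gaussian}-\eqref{eq:varx_update_gaussian} denotes the component of $\Sboldtheta_t(z)$ associated with the entry $x^n$ of $\suffstat(x)$.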

\begin{align}
    \phat_t(z) &\propto %CHECKED
    \sum_{z'=1}^Z \phat_{t-1}(z')
    \hat{\phi}^{(t-1)}_{z,z'}\frac{1}{\sqrt{2\pi\varxhat_z^{(t-1)}}}\exp\[-\(x_t-\muxhat_z^{(t-1)}\)^2\Big/2\varxhat_z^{(t-1)}\] \label{eq:phat_update_gaussian} \\
    \rhat_t(z|z') &= %CHECKED
    \frac{\phat_{t-1}(z)\hat{\phi}^{(t-1)}_{z',z}}{\sum_{z''}\phat_{t-1}(z'')\hat{\phi}^{(t-1)}_{z',z''}} \\
    \rhophi_t(i,j,k) &= %CHECKED
    \gamma_t \one(j=k)\rhat_t(i|j) 
    + (1-\gamma_t) \sum_{k'=1}^Z\rhophi_{t-1}(i,j,k')\rhat_t(k'|k) \label{eq:rho_phi} \\
    \rhotheta_t(i,k) &= %CHECKED
    \gamma_t\one(i=k)\suffstat(x_t)
    + (1-\gamma_t)
    \sum_{k'=1}^Z\rhotheta_{t-1}(i,k')\rhat_t(k'|k) 
    \label{eq:rho_theta} \\
    \hat{\phi}^{(t)}_{j,i} &= %CHECKED
    \frac{\sum_{z=1}^Z\rhophi_t(i,j,z)\phat_t(z)}{\sum_{z',z=1}^Z\rhophi_t(i,z',z)\phat_t(z)} \label{eq:phi_update_gaussian} \\
    \Sboldtheta_t(i) &= \sum_{k=1}^Z\rhotheta_t(i,k)\phat_t(k) \label{eq:S_theta} \\
    \muxhat^{(t)}_z &= %CHECKED
    \Stheta_{t,1}(z) / \Stheta_{t,0}(z) \label{eq:mux_update_gaussian} \\
    \varxhat^{(t)}_z &= %CHECKED
    \Stheta_{t,2}(z) / \Stheta_{t,0}(z) - (\muxhat^{(t)}_z)^2 \label{eq:varx_update_gaussian}
\end{align}

\section{Experiments}
\label{app:experiments}

In both L$^2$TS (Algorithm~\ref{alg:llts}) and L$^2$UCB (Algorithm~\ref{alg:llucb}), and for all experiments, we use the following settings:

\textit{Online EM hyperparameters.}
We use $\gamma_t = t^{-0.6}$, following \cite{cappe2011online}.

\textit{Linear bandit hyperparameters.}
\new{In L$^2$TS, we set $\tilde{\sigma}_r=1$.
In L$^2$UCB, we set $\alpha_{\rm UCB}=3$ for all experiments, which we found to improve convergence of regret compared to $\alpha_{\rm UCB}=1$. For both, we set $\lambda_\mu=1$.}

\subsection{Multinomial Context Distributions with Binary Rewards}
\label{app:experiment_discrete}

\paragraph{Problem 1.}

Expressing the multinomial context distribution probabilities
%as $p(x=i|z=j;\theta^\star) := \mux^\star_{j,i}\}$,
in matrix form, we set
\be
p(x|z) = \begin{bmatrix}
0.05 & 0.05 & 0.45 & 0.45 \\
0.45 & 0.45 & 0.05 & 0.05
\end{bmatrix},
\ee
where $x\in\{1,...,X\}$ with $X=4$, and $z\in\{1,2\}$.

Denoting the Bernoulli probabilities for (binary) reward values
%as $p(r=1|z,a;\muhat)=\hat{\mu}^{(a)}_z$, 
in matrix form, with actions $a\in\{1,2\}$ and latent states $z\in\{1,2\}$ indexing rows and columns,
%\cyan{we set
%$K=Z=2$, with $\hat{\mu}^{(a)}=$
we set
\be
p(r=1|z,a) = \begin{bmatrix}
0.4 & 0.4 \\
0.6 & 0.4
\end{bmatrix}.
\ee
\new{The initial latent state was generated from a probability vector %$\rho_0(z^\star)$
sampled from a (uniform) Dirichlet prior with concentration parameters $\alpha_z=1$ for $z=1,2$.}

\paragraph{Problem 2.}
In this problem, we set $(Z,X,K)=(4,12,8)$ with $p(x=i|z=j)=1/3$ when $(i-j)\bmod X\in\{-1,0,1\}$, and zero otherwise.
Reward probabilities $p(r=1|z,a)$ are sampled uniformly in $(0,1)$ for all $(z,a)$.
Latent states transition to the same state with probability $0.75$, and to any other state with equal probabilities.
\cyan{For this task only,} we omitted the optional reward update in L$^2$TS and L$^2$UCB, which we found to marginally increase regret. In this task, contexts $x_t$ contain significantly more information about the current latent state $z_t$ than do rewards $r_t$.
\new{The initial latent state was sampled from a uniform probability distribution.}

\textit{Online EM initialization.}
The sufficient statistics introduced in Appendix~\ref{app:multinomial_em} were initialized at \cyan{$\hat{\rho}^{(0)}_{i,j,h}(k)=0.01$} for all $(i,j,h)$ and all $k\in\{1,...,X\}$.
\new{The initial latent state probability vector $\phat_0(z)$ was sampled randomly from the uniform distribution over probability vectors.} %was set to a uniform distribution in Problem 1 

%\cyan{Theta hat random uniform, phi hat 0.3 0.7, probs z ....}

\subsection{Mining Application Details}
\label{app:experiment_mining}

We assume a Gaussian reward model, 
$p(r|z,a)=\mathcal{N}(\hat{\mu}^{(a)}_z,\tilde{\sigma}_r^2)$. 
The variance $\tilde{\sigma}_r^2$ is a fixed hyperparameter, which we equate with the hyperparameter $\tilde{\sigma}_r$ in Algorithm~\ref{alg:llts} used for Thompson sampling. (This hyperparameter is the variance of the implicit Gaussian reward likelihood used in L$^2$TS to update the multivariate Gaussian posterior over $\mu^{(a)}$.)

\textit{Online EM initialization.}
The sufficient statistics introduced in Appendix~\ref{app:gaussian_em} were initialized at $\rhophi_0(i,j,k)=1$, $\rhotheta_0(i,k)=[1,1,1]$ for all $(i,j,k)$.
%We initialized $\muxhat^{(0)}_z=z-1$ for $z\in\{$ % i.e. np.arange()
%\cyan{Theta mu init arange, stdev init ones; phi init 0.55 prob change, probs z ....}
\new{The initial latent state probability vector $\phat_0(z)$ was set to a uniform distribution.}

%\cyan{[These are the notes from the AISTATS submission:]}

\paragraph{Numerical Details of Application.}

We model this application using three latent rock classes $z=1,2,3$, two mining actions $a=1,2$, and Gaussian contextual observations for hand-held x-ray fluorescent meter (XMET) measurements. % into four categories  $x=1,2,3,4$.


To model $p(x|z)$, we use the approach and numbers in~\cite{eidsvik_mukerji_bhattacharjya_2015} for how the continuous-valued XMET observations depend on the latent rock class. The ore grade $o$ and the observed continuous XMET observation $x$ follow Gaussian distributions as follows:
\be
o = \beta_0 + \beta_1 z + N(0, \sigma^2); \, x = o + N(0, \tau^2),
\ee
where $\beta_0 = -0.18$ and $\beta_1 = 1.32$ are regression coefficients. The latter coefficient signifies that a higher rock class results in higher ore grade. The value $\sigma = 0.62$ captures the uncertainty in the ore grade and $\tau = 0.45$ captures the quality of the observed XMET. These numbers are taken directly from~\cite{eidsvik_mukerji_bhattacharjya_2015}.

%\cyan{We sample $x_c$ and discretize it into four bins: $(-\infty, 0), [0, 2), [2, 4), [4, \infty]$. This procedure results in the following categorical distribution, where rows from top to bottom are for $z=1,2,3$:
%\be
%p(x|z) = \begin{bmatrix}
%0.05 & 0.85 & 0.09 & 0.01 \\
%0.01 & 0.19 & 0.79 & 0.01 \\
%0.01 & 0.01 & 0.59 & 0.39
%\end{bmatrix}
%\ee
%}

For the reward distribution $p(r|a,z)$, we assume the profit depends on a revenue factor ($r_f$) per ore grade from mined ore as well as both fixed ($c_f$) and uncertain (variable) costs ($c_v$):
\be
\text{Profit}(a,z) = o(z) \cdot r_f(a) - c_f(a) - c_v(a),
\ee
where $c_v(a) \sim N(0, \sigma_c^2)$.
We choose numbers such that action $a=1$ has a higher revenue factor and more fixed cost but less variable cost compared to action $a=2$. Note that the profit is Gaussian as it is linear in Gaussian random variables. 

%An action is deemed successful when revenue is greater than cost, i.e. when it is profitable. \cyan{We sample rewards to end up with the following reward distribution for the two actions:
%\be
%p^\intercal(r|a=1,z) = \begin{bmatrix}
%0.2 & 0.5 & 0.8
%\end{bmatrix}; \,
%p^\intercal(r|a=2,z) = \begin{bmatrix}
%0.3 & 0.4 & 0.5
%\end{bmatrix}.
%\ee
%}

We choose a transition matrix over latent states that favors the diagonal, because
spatial modeling in general heavily uses covariance related concepts (such as variograms) where regions that are geographically closer are more correlated. In our first experiment, we choose the following matrix, where rows from top to bottom are for $z=1,2,3$:
\be\label{eq:phi_mining_1}
p(z_t|z_{t-1}) = \begin{bmatrix}
0.7 & 0.25 & 0.05 \\
0.25 & 0.5 & 0.25 \\
0.05 & 0.25 & 0.7
\end{bmatrix}
\ee
In our second experiment, we choose a matrix for which latent state changes are very rare, occurring only every $O(50)$ steps:
\be\label{eq:phi_mining_2}
p(z_t|z_{t-1}) = \begin{bmatrix}
0.98 & 0.01 & 0.01 \\
0.01 & 0.98 & 0.01 \\
0.01 & 0.01 & 0.98
\end{bmatrix}
\ee
\new{In both cases, the initial latent state was generated from a probability vector sampled from a Dirichlet prior with concentration parameters $\alpha_z=1$ for $z=1,2,3$ (i.e. a uniform prior over probability vectors).}

\subsection{Parameter estimation error}

\new{
The gap between L$^2$TS and the corresponding oracle variant in Figure~\ref{fig:regret_gaussian} (which conditions on the true parameters $\theta^\star$, $\phi^\star$) is a consequence of parameter estimation error.
In Figure~\ref{fig:error} below we show parameter estimation error of online EM, when used by L$^2$TS for the mining application described above.
%with the true transition matrix given by Eq. \eqref{eq:phi_mining_1}, 
We show mean squared errors averaged over 10 episodes with different randomly generated ground truth transition matrices (with each column sampled from a uniform distribution over probability vectors), as well as different randomly generated mean values $\mathbb{E}[x|z]\sim\mathcal{N}(0,1)$ for the Gaussian conditional distributions $p(x|z)$. (Otherwise, we use the same environment parameters as described above.)
}

\begin{figure}[t!]
    \centering
    \includegraphics[width=9.0cm]{error_ts.png}
    %\includegraphics[width=7.0cm]{error_ts_probchange.png}
    \caption{Mean squared error (MSE) of model estimates for the latent transition matrix ($||\hat{\phi}^{(t)}-\phi^\star||_2^2$) and context distributions ($||\hat{\theta}^{(t)}-\theta^\star||_2^2$) used by L$^2$TS on the mining application, averaged over 10 different true transition matrices and context distributions.}
%    \textbf{Right:} The same quantities for L$^2$UCB (which uses the same online EM parameter learning method).}
    \label{fig:error}
\end{figure}

\subsection{Baseline Details}

We allowed the discounted Thompson Sampling (dTS) algorithm to access the true transition matrix to set its discount factor to $\gamma=Z^{-1}\sum_z\phi^\star_{z,z}$.

For umTS \citep{Hong2020}, we used $N=100$ particles with a minimum effective sample size $\mathrm{ESS}_{\min}=20$ for particle resampling.
% [this version of the code had a bug:] (We found that umTS was able to learn the true transition matrix with $\simeq10\%$ accuracy, and mean rewards somewhat less accurately.)

For Exp4.P \citep{exp4p}, we pretrained $10$ expert modules with linear regression (in the case of categorical contexts and rewards) or MLP classification\footnote{We use the default architecture settings specified at \\ \href{https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html}{https://scikit-learn.org/stable/modules/generated/sklearn.neural\_network.MLPClassifier.html}.} (in the case of Gaussian context and rewards) to classify observations $x$ into corresponding optimal actions, based on $1000$ samples of contexts $x\sim p(x)=\sum_z p(x|z;\theta^\star)p_i(z)$ and action-wise rewards $r_a\sim p(r|a) = \sum_z p(r|z,a)p_i(z)$. For each expert $i$, we used a different categorical distribution $p_i(z)$ obtained by sampling a uniform distribution over the $Z$-simplex. (We found comparable performance when increasing to $50$ experts.)
The Exp4.P algorithm then learns to give greater weight to experts who were trained on distributions $p_i(z)$ which assign higher probability to recently occurring latent states in the non-stationary environment.

\section{Derivation of Theorem \ref{theorem:estimator}}
\label{app:estimator_bound}

We would like to bound the error in the action-wise, vector-valued mean reward estimators $\hat{\mu}^{(a)}$, defined in Eq. \eqref{eq:mu_hat}, and (as discussed in Section~\ref{sec:reduction}), used by Algorithm~\ref{alg:llts} with the linear bandit context vector $\ctx_t$ set equal to the vector of posterior probabilities over the latent state, $\phat_t$. As stated in Theorem~\ref{theorem:estimator}, we set $(\theta,\Phi)=(\theta^\star,\Phi^\star)$ throughout this section, and thus replace the model posterior $\phat_t$ with the ``true'' posterior $p^\star_t$ as defined in Eq. \eqref{eq:p_star_def}.
We will occasionally denote the $T$-dependence of some quantities explicitly as an argument, when it is helpful to remember, but will in general leave it suppressed in the interest of simplicity.

It will be useful to express the difference between the estimated (Eq. \eqref{eq:mu_hat}) and true mean reward parameters as
\be\label{eq:mu_error_appendix}
\hat{\mu}^{(a)} - \mustara = (\covinv^{(a)})^{-1} \error^{(a)},
\ee
where
\be\label{eq:error_numerator_def}
\error^{(a)} := %CHECKED
f_\mu^{(a)} - \covinv^{(a)} \mustara = %CHECKED
\sum_{t=1}^T \one(a_t=a) p^\star_t \( r_t - (p^\star_t)^\tp\mustara \).
%% older notes moved to 'notes_appendix_a.tex'
\ee
For reference, it is also useful to write down the element-wise definitions of the vector $\error^{(a)}$ and matrix $\covinv^{(a)}$:
\be
\error^{(a)}_z = %CHECKED
\sum_{t=1}^T \one(a_t=a) p^\star(z_t=z|x_{1:t})
\Big(r_t - \sum_{z'} p^\star(z_t=z'|x_{1:t})(\mustara)_{z'}\Big).
\ee
\be
\covinv^{(a)}_{zz'} = %CHECKED
\sum_{t=1}^T \one(a_t=a) p^\star(z_t=z|x_{1:t})p^\star(z_t=z'|x_{1:t}).
\ee
We will drop the $^\star$ superscript on $p_t$ in the following sections, to avoid notational clutter, but emphasize that throughout this section, all quantities are conditioned on the true parameters $(\theta^\star,\Phi^\star)$. We will also occasionally use the shorthand notation
\be\label{eq:prob_conditional_range}
p_{t:t'}(z):=p(z_{t'}=z|x_{t:t'})
\ee
to simplify expressions.
For simplicity, we will remove the $^\star$ when denoting the transition matrix; we restore it in Theorem~\ref{theorem:estimator}.

The derivation of Theorem~\ref{theorem:estimator} proceeds as follows.
In Appendix~\ref{app:mixing_bounds} we derive several intermediate results using a contraction property \citep{boyenkoller1998} of the Kullback-Leibler divergence between two posterior beliefs over the state of a hidden Markov process, which implies that the KL distance between two beliefs about the current latent state $z_t$ contracts exponentially in time as the beliefs are updated over time with additional context observations $x_t$.
We use this result to upper bound the dependence of posterior beliefs of the form $p_t(z):=p(z_t=z|x_{1:t})$ on data $x_{t-\tau}$ observed in the distant past (large $\tau$), such that probabilities $p_t(z)$ and $p_{t'}(z)$ may be treated as approximately i.i.d.\ random variables when $|t'-t|$ is large. Since the estimators $\hat{\mu}^{(a)}$ are constructed via linear regression with the probabilities $p_t(z)$ serving as the regressors (independent variables), the approximate i.i.d.\ nature of time-separated posteriors leads to a reduction (and asymptotic convergence to zero) in estimator variance.
We demonstrate this explicitly as follows:
\begin{itemize}
    \item In Appendix \ref{app:numerator} and Appendix \ref{app:cov}, we use the results of Appendix~\ref{app:mixing_bounds} to obtain element-wise upper bounds on the variance %%across histories $x_{1:T}$ 
    of, respectively, 
    the error vector $\error^{(a)}$ and the empirical inverse covariance matrix $\covinv^{(a)}$.
    \item In Appendix~\ref{app:eigen} we convert the element-wise bound on $\covinv^{(a)}$ into a bound on the largest eigenvalue of $(\covinv^{(a)})^{-1}$.
    \item In Appendix~\ref{app:finalbound} we combine the results of the previous two sections to obtain the final high-probability bound on the estimator error $\hat{\mu}^{(a)}-\mustara$.
\end{itemize}

\subsection{Mixing rate bounds on conditional posterior probabilities} % (Data-dependence bounds)
\label{app:mixing_bounds}

In this section we will derive upper bounds on the expected total variation distance, $\EE[ \sum_z |p_t(z) - q_t(z)| ]$, and on the KL divergence, $D_{KL}[p_t(z)||q_t(z)]$, between two distinct posteriors $(p_t,q_t)$ obtained by updating corresponding priors $(p_1,q_1)$ with the same sequence of context observations $x_{1:t}$, and using the same likelihood function and transition matrix. The contraction of these distribution distances indicates that the posterior probabilities at a given time depend predominantly on recent observations, with dependence on distant past observations, $x_{t-\tau}$, being exponentially suppressed (with respect to $\tau$).
% shows that old data is forgotten by the current posteriors
%% use to show that E[pp]~E[p]E[p] when computing variances ... $\EE[p_t(z)p_{t'}(z')]\approx\EE[p_t(z)]\EE[p_{t'}(z')]$
%%In this section we derive several useful inequalities for the dependence of posterior probabilities of the form $p(z_t=z|x_{t_1:T})$, on context observations $x_{t_1:T}$, where $t_1\leq t\leq T$. %%Intuitively, we would like to show that $p(z_t=z|x_{t_1:T})$ depends primarily on data $x_{t'}$ for $t'$ close to $t$, and only very slightly on distant past or future data.

As stated in Theorem~\ref{theorem:estimator}, we assume that the latent Markov process is ergodic, and thus has a unique equilibrium distribution (or stationary distribution) $\rhoeq(z)$ defined by $\Phi\rhoeq=\rhoeq$.

Our analysis will make use of the \textit{minimal mixing rate} \citep{boyenkoller1998} of a transition matrix,
\be\label{eq:gamma_phi_def}
\gamma_\phi := \min_{z_1,z_2} \sum_z \min(\phi_{z,z_1},\phi_{z,z_2}).
\ee
Given two initial distributions $p_1(z)=\one(z=z_1)$ and $p_2(z)=\one(z=z_2)$, with all of their probability mass concentrated respectively on states $z_1$ and $z_2$, the quantity $\sum_z\min(\phi_{z,z_1},\phi_{z,z_2})$ is the minimal probability mass which is moved to shared successor states $z$ by applying the transition matrix to $p_1$ and $p_2$.
Thus, $\gamma_\phi$ quantifies the minimal probability mass that is moved from different states to a shared state, for any initial distributions $p_1$ and $p_2$.
The minimal mixing rate was used by \cite{boyenkoller1998} to prove a contraction theorem for the KL divergence between two different distributions:
\begin{theorem}[Theorem 3 in \cite{boyenkoller1998}]
For any two prior distributions $p_0$ and $q_0$ over states $z\in\{1,...,Z\}$, the distributions $p=\Phi p_0$, $q=\Phi q_0$ induced by a transition matrix $\Phi$ satisfy
\be\label{eq:boyenkoller_thm3}
D_{KL}[p||q]\leq(1-\gamma_\phi)D_{KL}[p_0||q_0],
\ee
with the minimal mixing rate $\gamma_\phi$ defined in Eq. \eqref{eq:gamma_phi_def}.
\end{theorem}

We will also make use of the fact \citep{boyenkoller1998} that conditioning on additional data reduces the KL divergence between different distributions, in expectation:
% (could comment on why we need this)

\begin{lemma}\label{lemma:fact1}
Given two distinct priors $p(z)$ and $q(z)$, and corresponding posteriors obtained by conditioning on a real-valued observation $x$ generated from a known likelihood distribution $\ell(x|z)$,
\be\label{eq:bayes_update_two_priors}
p(z|x) = p(z)\ell(x|z)/p(x), \ \ q(z|x) = q(z)\ell(x|z)/q(x),
\ee
where $p(x):=\sum_{z}p(z)\ell(x|z)$ and $q(x):=\sum_{z}q(z)\ell(x|z)$, 
the KL divergence between the posteriors $p(z|x)$ and $q(z|x)$ satisfies
\be\label{eq:conditional_kl_bound}
\EE_{x\sim p(x)}[D_{KL}[p(z|x)||q(z|x)]] \leq D_{KL}[p(z)||q(z)].
\ee
\end{lemma}
\begin{proof}
Using Eq. \eqref{eq:bayes_update_two_priors}, we have
\begin{align}
    \EE_{x\sim p(x)}[D_{KL}[p(z|x)||q(z|x)]]
    & = \EE_{x\sim p(x)} \[ \sum_z \frac{p(z)\ell(x|z)}{p(x)}\log\( \frac{p(z)\ell(x|z)}{p(x)} \frac{q(x)}{q(z)\ell(x|z)}\) \] \nn \\
    & = \EE_{x\sim p(x)}  \[ \sum_z \frac{p(z)\ell(x|z)}{p(x)}\(\log \frac{p(z)}{q(z)} - \log\frac{p(x)}{q(x)}\) \] \nn \\
    & = \sum_z p(z)\log\frac{p(z)}{q(z)}
    \EE_{x\sim p(x)}[\ell(x|z)/p(x)]
    - \EE_{x\sim p(x)}\[\frac{\sum_z p(z)\ell(x|z)}{p(x)}\log\frac{p(x)}{q(x)}\] \nn \\
    & = D_{KL}[p(z)||q(z)] - D_{KL}[p(x)||q(x)]. \label{eq:conditional_kl_bound_px}
\end{align}
In the last line, we have used the fact that $\frac{\sum_z p(z)\ell(x|z)}{p(x)}=1$ by definition, and $\EE_{x\sim p(x)}[\ell(x|z)/p(x)]=\EE_{x\sim \ell(\cdot|z)}[1]=1$.
Since $D_{KL}[p(x)||q(x)]\geq0$, we recover Eq. \eqref{eq:conditional_kl_bound}.
\end{proof}

Eq. \eqref{eq:boyenkoller_thm3} and Eq. \eqref{eq:conditional_kl_bound} can be combined to show that the KL divergence between two prior beliefs over the hidden state contracts in expectation during a single transition and subsequent observation:
\begin{lemma}
Given two prior probability distributions $q_0(z)$ and $\qtilde_0(z)$ over the hidden state $z$, the posterior distributions over the successor state $z'$, conditional on observing $x\sim p(\cdot|z';\theta)$, that is
\be
q(z') \propto \sum_z \Phi_{z',z}q_0(z) p(x|z';\theta), \ \ \ 
\qtilde(z') \propto \sum_z \Phi_{z',z}\qtilde_0(z) p(x|z';\theta), \nn
\ee
where the sequence $x_{1:t}$ is generated via a sequence of latent states using the transition matrix $\Phi$, 
satisfy
\be\label{eq:kl_bound_hmm_timestep}
\EE_{x\sim p(\cdot|z';\theta), z'\sim\Phi\qtilde_0}
[D_{KL}[\qtilde||q]] \leq (1-\gamma_\phi)D_{KL}[\qtilde_0||q_0],
\ee
where the expectation is taken over $x\sim p(x)=\sum_{z,z'}\Phi_{z',z}\qtilde_0(z)p(x|z';\theta)$.
\end{lemma}
\begin{proof}
Applying Eq. \eqref{eq:conditional_kl_bound} with prior probability vectors $\Phi\qtilde_0$ and $\Phi q_0$ over the successor state $z'$, we have
\be
\EE_{x\sim p(x)}
[D_{KL}[\qtilde||q]] \leq %CHECKED
D_{KL}[\Phi\qtilde_0||\Phi q_0], \nn
\ee
where $p(x)=\sum_{z'}(\Phi\qtilde_0)_{z'}p(x|z';\theta)$. Applying Eq. \eqref{eq:boyenkoller_thm3}, we recover Eq. \eqref{eq:kl_bound_hmm_timestep}. %CHECKED
\end{proof}
Note that Eq. \eqref{eq:kl_bound_hmm_timestep} -- and consequently also Eqs. \eqref{eq:kl_bound_expectation} and \eqref{eq:tvd_bound_expected} below -- is asymmetric with respect to $q$ and $\qtilde$, since the expectation is over data $x$ generated with the first argument, $\qtilde_0$.

Eq. \eqref{eq:kl_bound_hmm_timestep} can be applied recursively to show that the KL divergence contracts exponentially as the two distributions are propagated forward in time:
\begin{lemma}
\label{lemma:KL_bound_contraction}
Given two prior probability distributions $q_0(z)$ and $\qtilde_0(z)$ over the initial latent state $z_0$, 
%% (and assuming the true transition matrix and likelihoods $p(x|z)$ are known)
the resulting posterior distributions over the state $z_t$ at time $t$, that is
\begin{align}
q_t(z') &:= \sum_z q_0(z) p(z_t=z'|x_{1:t},z_0=z), \label{eq:q_t_def} \\ %% \theta=\theta^\star,\phi=\phi^\star
\qtilde_t(z') &:= \sum_z \qtilde_0(z) p(z_t=z'|x_{1:t},z_0=z), \label{eq:qtilde_t_def}
\end{align}
satisfy
\be\label{eq:kl_bound_expectation}
\EE_{x_{1:t}|z_0\sim \qtilde_0}[D_{KL}[\qtilde_t||q_t]] \leq e^{-\gamma_\phi t} D_{KL}[\qtilde_0||q_0],
\ee
where the expectation is over histories $x_{1:t}$ which are generated from initial latent states $z_0\sim\qtilde_0(\cdot)$.
\end{lemma}
\begin{proof}
Applying Eq. \eqref{eq:kl_bound_hmm_timestep} to the transition at time $t$, with priors $(\qtilde_0,q_0)\rarr(\qtilde_{t-1},q_{t-1})$ in Eq. \eqref{eq:kl_bound_hmm_timestep} determined by a fixed sequence $x_{1:t-1}$ of preceding data, we have
\be
\EE_{x_t|x_{1:t-1},z_0\sim \qtilde_0}[D_{KL}[\qtilde_t||q_t]]
\leq (1-\gamma_\phi) D_{KL}[\qtilde_{t-1}||q_{t-1}],
\ee
where the subscript indicates that the expectation is taken only over $x_t\sim p(x)=\sum_z(\Phi\tilde{q}_{t-1})_z p(x|z;\theta)$. 
Taking the remaining expectations recursively over $x_{t-1},...,x_1$, backwards in time, we have
\be
\EE_{x_{1:t}|z_0\sim \qtilde_0}[D_{KL}[\qtilde_t||q_t]]
\leq (1-\gamma_\phi)^t D_{KL}[\qtilde_0||q_0].
\ee
Since $(1-\gamma_\phi)^t=(e^{\log(1-\gamma_\phi)})^t=e^{t\log(1-\gamma_\phi)}<e^{-\gamma_\phi t}$ for $\gamma_\phi\in(0,1)$ and $t>0$, 
we recover Eq. \eqref{eq:kl_bound_expectation}.
\end{proof}

Note that Eq. \eqref{eq:kl_bound_expectation} is a conservative bound, for two reasons:
(1) If there exist pairs of states $(z_1,z_2)$ in Eq. \eqref{eq:gamma_phi_def} -- e.g. spatially distant states -- which cannot transition to any common state $z$, we have $\gamma_\phi=0$. However, mixing may still occur efficiently over several timesteps -- e.g. allowing for several transitions between spatially connected states -- leading to a similar exponential contraction with respect to a more general mixing rate.
(2) Eq. \eqref{eq:conditional_kl_bound} is a weaker bound than Eq. \eqref{eq:conditional_kl_bound_px}, which may be substantially tighter when the marginal context distributions $p(x)$ and $q(x)$ are separated by a large KL distance. This can occur when the conditional context distributions $p(x|z;\theta)$ -- denoted $\ell(x|z)$ in Lemma~\ref{lemma:fact1} -- are very different, making observations $x$ highly informative about $z$.
%We leave as a direction for future work the development of a stronger bound which leverages additional structure in either the transition matrix or context distributions.

Eq. \eqref{eq:kl_bound_expectation} can be converted into a bound on the expected total variation distance, or $1$-norm, between two posteriors:
\begin{corollary}\label{corollary:tvd_bound_expected}
The $1$-norm difference between two distributions $(\qtilde_t,q_t)$ over the state $z_t$, as defined in Eqs. \eqref{eq:q_t_def}--\eqref{eq:qtilde_t_def}, satisfies the upper bound
\be\label{eq:tvd_bound_expected}
\EE_{x_{1:t}|z_0\sim \qtilde_0}\[\sum_z|\qtilde_t(z) - q_t(z)|\] % |p(z_t=z|x_{1:t}) - q(z_t=z|x_{1:t})|
\leq  e^{-\frac{1}{2}\gamma_\phi t} \sqrt{2 D_{KL}[\qtilde_0||q_0]}.
\ee
\end{corollary}
\begin{proof}
Pinsker's inequality states that for any two probability distributions $\qtilde$ and $q$, %on a measurable space, 
the $1$-norm and KL divergence satisfy $||\qtilde-q||_1 \leq \sqrt{2 D_{KL}[\qtilde||q]}$.\footnote{The symmetry of the left hand side under exchange of $\qtilde$ and $q$ implies the same relation holds with respect to the reverse KL divergence $D_{KL}[q||\qtilde]$.} 
Setting $||\qtilde-q||_1=\sum_z|\qtilde_t(z) - q_t(z)|$, and taking the expectation, we have
\be %CHECKED
\EE_{x_{1:t}|z_0\sim \qtilde_0}\[\sum_z|\qtilde_t(z) - q_t(z)|\]
\leq \EE_{x_{1:t}|z_0\sim \qtilde_0} \[\sqrt{2 D_{KL}[\qtilde_t||q_t]}\]. \nn
\ee
Applying Jensen's inequality to bring the expectation under the square root, we have
\be %CHECKED
\EE_{x_{1:t}|z_0\sim \qtilde_0}\[\sum_z|\qtilde_t(z) - q_t(z)|\]
\leq \sqrt{2\cdot\EE_{x_{1:t}|z_0\sim \qtilde_0}[D_{KL}[\qtilde_t||q_t]]}. \nn
\ee
Applying Eq. \eqref{eq:kl_bound_expectation}, we arrive at Eq. \eqref{eq:tvd_bound_expected}. %CHECKED
\end{proof}

\iffalse
To end this section, we show that the optimal actions corresponding to two posterior beliefs are exponentially unlikely to be different, if the total variation distance between the posteriors has contracted over time as in Corollary~\ref{corollary:tvd_bound_expected}. This result will be useful in Section~\ref{app:cov} for bounding the convergence of the covariance matrix $\covinv^{(a)}$ to its asymptotic form as $t\rarr\infty.$

\begin{lemma}\label{lemma:opt_action_diff}
Given two distributions $(\qtilde_t,q_t)$ over the state $z_t$, as defined in Eqs. \eqref{eq:q_t_def}-\eqref{eq:qtilde_t_def}, 
and defining the corresponding optimal actions,
\be %CHECKED
\tilde{a}^\star_t := \argmax_a\tilde{q}^\star_t\mustara, \ \ \ 
a^\star_t = \argmax_a q^\star_t\mustara.
\ee
the probability that $\tilde{a}^\star_t$ and $a^\star_t$ are not equivalent satisfies the upper bound,
\be\label{eq:opt_action_diff_bound}
\mathbb{P}[\tilde{a}^\star_t\neq a^\star_t] \leq
2\sqrt{ p_\mu}
(2D_{KL}[\tilde{q}_0||q_0])^{1/4}e^{-\frac{1}{4}\gamma_\phi t}
\ee
where $p_\mu$, defined in Eq. \eqref{eq:p_mu_def} below, is the maximum probability density as $t\rarr\infty$ that the reward gap between the two actions, $\tilde{q}_t^\top\mu_\star^{(\tilde{a}^\star_t)} - \tilde{q}_t^\top\mu_\star^{(a^\star_t)}$, will be infinitesimally small.
\end{lemma}
\begin{proof}
We first define
\be
\Delta^{(a)}:=q_t^\top\mustara - \tilde{q}_t^\top\mustara
\ee
as the increase in expected reward for action $a$ resulting from replacing the belief $\tilde{q}_t$ with $q_t$. %CHECKED
The optimal actions $\tilde{a}^\star_t$ and $a^\star_t$ can only differ %(CHECKED)
if exchanging $\tilde{q}_t$ for $q_t$ results in an increase in expected reward for $a^\star_t$ relative to $\tilde{a}^\star_t$ which exceeds the reward gap $\tilde{q}_t^\top(\mu_\star^{(\tilde{a}^\star_t)} - \mu_\star^{(a^\star_t)})$ between the two actions given the belief $\tilde{q}_t$, that is, if
\be %(CHECKED)
\Delta^{(a^\star_t)} - \Delta^{(\tilde{a}^\star_t)}
> \tilde{q}_t^\top(\mu_\star^{(\tilde{a}^\star_t)} - \mu_\star^{(a^\star_t)}). \nn
\ee
Thus, the probability that $\tilde{a}^\star_t\neq a^\star_t$ is upper bounded,
\be
\mathbb{P}[\tilde{a}^\star_t\neq a^\star_t]
\leq %(CHECKED)
\mathbb{P}
\[ \Delta^{(a^\star_t)} - \Delta^{(\tilde{a}^\star_t)}
> \tilde{q}_t^\top(\mu_\star^{(\tilde{a}^\star_t)} - \mu_\star^{(a^\star_t)})
\] \nn
\ee
Noting that the reward difference for any $(a,\tilde{a})$ is
\be %CHECKED
\Delta^{(a)} - \Delta^{(\tilde{a})} 
%% = (q_t - \tilde{q}_t)^\top\mustara - (q_t - \tilde{q}_t)^\top\mu_\star^{(\tilde{a})} 
= (q_t - \tilde{q}_t)^\top(\mustara - \mu_\star^{(\tilde{a})}), \nn
\ee
and defining $\nu:=\mu_\star^{(\tilde{a}^\star_t)} - \mu_\star^{(a^\star_t)}$, we have
\be
\mathbb{P}[\tilde{a}^\star_t\neq a^\star_t]
\leq %CHECKED
\mathbb{P}
\[ -(q_t - \tilde{q}_t)^\top\nu
> \tilde{q}_t^\top\nu\]
\leq %CHECKED
\mathbb{P}
\[ |(q_t - \tilde{q}_t)^\top\nu|
> \tilde{q}_t^\top\nu \]
%\leq %CHECKED
%\mathbb{P}
%\[ ||\nu||_1 \cdot ||q_t - \tilde{q}_t||_1
%\geq \tilde{q}_t^\top\nu \].
\label{eq:prob_opt_a_differ_bound1}
\ee
In the second inequality, we have used the fact that $\tilde{q}_t^\top\nu>0$ due to the definition of $\tilde{a}^\star_t$. %CHECKED

Since the reward gap $\tilde{q}_t^\top\nu$ depends on the history $x_{1:t}$ via $\tilde{q}_t$ (conditional on the prior $\tilde{q}_0$) and the history-dependent actions in $\nu$, it is a (real-valued) random variable. For a given pair of actions $(a^\star,a)$, we define its probability density near zero -- ultimately a function of the problem parameters $(\theta,\Phi)$ -- as
\be
p_{a^\star,a}(t|\tilde{q}_0) := \lim_{\eps\rarr0}\frac{1}{\eps}\cdot\mathbb{P}
\[\frac{\tilde{q}_t^\tp(\mustarastar-\mustara)}{||\mustarastar-\mustara||_1}\leq\eps\].
\ee
Setting $(a^\star,a)\rarr(\tilde{a}^\star_t,a^\star_t)$, in the $\eps\rarr0$ limit we have
\be\label{eq:eps_reward_gap}
\mathbb{P}[\tilde{q}^\tp_t\nu\leq\eps||\nu||_1]
= %CHECKED
\eps\cdot p_{\tilde{a}^\star_t,a^\star_t}(t|\tilde{q}_0) \leq p_\mu
\ee
As $t\rarr\infty$, the posterior $\tilde{q}_t$ loses dependence on the original prior $\tilde{q}_0$ at an exponential rate (Corollary~\ref{corollary:tvd_bound_expected}), so $p_{a^\star,a}(t|\tilde{q}_0)$ will approach a $\tilde{q}_0$-independent constant as $t\rarr\infty$.
We define
\be\label{eq:p_mu_def}
p_\mu := \max_{a,a'} \lim_{t\rarr\infty} p_{a^\star,a}(t|\tilde{q}_0)
\ee
as the maximal possible probability density (over context and action histories) for which the reward gap between two actions vanishes, in the asymptotic $t\rarr\infty$ limit. 
We now treat $\eps>0$ as a free parameter to be optimized, and will see below that when $t$ is large $\eps$ can be chosen arbitrarily close to $0$, justifying the use of Eq. \eqref{eq:eps_reward_gap}. Multiplying Eq. \eqref{eq:prob_opt_a_differ_bound1} by a factor of 
\be %CHECKED
1=\mathbb{P}[\tilde{q}^\tp_t\nu<\eps||\nu||_1] + \mathbb{P}[\tilde{q}^\tp_t\nu\geq\eps||\nu||_1], \nn
\ee
we have
\begin{align}
    \mathbb{P}[\tilde{a}^\star_t\neq a^\star_t]
    &\leq %CHECKED (copied)
    \mathbb{P} \[ |(q_t - \tilde{q}_t)^\top\nu| > \tilde{q}_t^\top\nu \] \nn \\
    &\leq %(CHECKED)
    \mathbb{P}[\tilde{q}^\tp_t\nu\leq\eps||\nu||_1]
    \cdot\mathbb{P}[|(q_t - \tilde{q}_t)^\top\nu|>\tilde{q}^\tp_t\nu]
    + \mathbb{P}[\tilde{q}^\tp_t\nu>\eps||\nu||_1]
    \cdot
    \mathbb{P}[|(q_t - \tilde{q}_t)^\top\nu|>\eps||\nu||_1]. \nn \\
    &\leq %CHECKED UP TO FINITENESS OF EPS
    \eps\cdot p_{\tilde{a}^\star_t,a^\star_t}(t|\tilde{q}_0) + \mathbb{P}[|(q_t - \tilde{q}_t)^\top\nu|>\eps||\nu||_1] \nn \\
    &\leq %CHECKED
    \eps\cdot p_{\tilde{a}^\star_t,a^\star_t}(t|\tilde{q}_0) + \mathbb{P}[||(q_t - \tilde{q}_t)||_1>\eps] , \nn
\end{align}
where we have used the condition $\tilde{q}^\tp_t\nu>\eps||\nu||_1$ to replace $\tilde{q}^\tp_t\nu\rarr\eps||\nu||_1$ at the very end of the second line (weakening but preserving the bound), have used $\mathbb{P}[\tilde{q}^\tp_t\nu\geq\eps||\nu||_1]\leq1$ and $\mathbb{P}[|(q_t - \tilde{q}_t)^\top\nu|>\tilde{q}^\tp_t\nu]\leq1$ in the third line, 
and have used $|(q_t - \tilde{q}_t)^\tp\nu|\leq||q_t - \tilde{q}_t||_1||\nu||_1$ %CHECKED
in the last line.

Applying Markov's inequality, which states that $\mathbb{P}[X\geq x]\leq\EE[X]/x$ for any non-negative random variable $X$ and any $x>0$, with 
$X\rarr \sum_z|\tilde{q}^\star_t(z) - q^\star_t(z)| $ 
and $x\rarr \eps$, we see that
\be %CHECKED
\mathbb{P}[\tilde{a}^\star_t\neq a^\star_t] \leq
\eps\cdot p_{\tilde{a}^\star_t,a^\star_t}(t|\tilde{q}_0)
+
\frac{1}{\eps}\EE\[\sum_z|\tilde{q}^\star_t(z) - q^\star_t(z)|\] \nn
\ee
Recalling Eq. \eqref{eq:p_mu_def} and using Corollary~\ref{corollary:tvd_bound_expected} to set $\EE\[\sum_z|\tilde{q}^\star_t(z) - q^\star_t(z)|\]\leq e^{-\frac{1}{2}\gamma_\phi t}\sqrt{2 D_{KL}[\tilde{q}_0||q_0]}$, %CHECKED
we see that
\be %CHECKED
\mathbb{P}[\tilde{a}^\star_t\neq a^\star_t] \leq
\eps\cdot p_\mu + \frac{1}{\eps}e^{-\frac{1}{2}\gamma_\phi t}\sqrt{2 D_{KL}[\tilde{q}_0||q_0]}. \nn
\ee
Optimizing $\eps$ by setting to zero the derivative with respect to $\eps$, we find the optimal value
\be
\eps^\star := %CHECKED
\frac{1}{\sqrt{p_\mu}}
(2D_{KL}[\tilde{q}_0||q_0])^{1/4}e^{-\frac{1}{4}\gamma_\phi t}, \nn
\ee
for which
\be %CHECKED
\mathbb{P}[\tilde{a}^\star_t\neq a^\star_t] \leq
2\sqrt{p_\mu}
(2D_{KL}[\tilde{q}_0||q_0])^{1/4}e^{-\frac{1}{4}\gamma_\phi t}, \nn
\ee
recovering %CHECKED
Eq. \eqref{eq:opt_action_diff_bound} above.
\end{proof}
\fi

\subsection{Partial bound on the estimator error}
\label{app:numerator}

In this section, we compute the variance of the vector $\error^{(a)}$ defined in Eq. \eqref{eq:error_numerator_def}, across different reward and observation histories. We show that the variance of $\error^{(a)}/T$ converges to zero as $T\rarr\infty$, and then show that $|\error^{(a)}|/T$ converges to zero asymptotically.
In the following sections, we will use this result to bound the estimator error $\hat{\mu}^{(a)}-\mustara = (\covinv^{(a)})^{-1}\error^{(a)}$.

\begin{lemma}\label{lemma:partial_bound}
When %the latent Markov process is in equilibrium, with $z_0\sim\rhoeq(\cdot)$, and when 
the ground truth parameters $(\theta,\Phi)$ are known, each element of 
$\error^{(a)}$, Eq. \eqref{eq:error_numerator_def}, %% \ctxmatrix\rvec - \covinv\mu^\star
satisfies the upper bound
\be\label{eq:partial_bound}
(\error^{(a)}_z/T)^2 \leq \frac{1}{\delta\cdot T}
\Big( \stdeveq^2 + ||\mustara||_1^2\frac{4}{\gamma_\phi}\big(1+\log\cdkl\big) \Big)
\ee
with probability at least $1-\delta$, for any $\delta\in(0,1)$,
where 
\be\label{eq:stdeveq_def_appendix}
\stdeveq^2 := \max_a \sum_z\rhoeq(z){\rm Var}[r|z,a]
\ee
is the maximal variance in rewards when the latent state has reached equilibrium, 
and $\gamma_{\phi}^{-1}\log\cdkl:=\tau^\star$
%% (i.e. the time $\tau$ measured in units of the mixing time $1/\gamma_{\phi}$) at which
is the integer number of timesteps satisfying
\begin{align}
    \tau^\star:=\operatorname*{arg\,min}_{\tau\in\mathbb{N}}|\log D_{\phi}(\tau)-\gamma_{\phi}\tau|,  %% \Dphitau
\end{align}
where
\be\label{eq:D_phi_tau_def}
D_{\phi}(\tau) %% \Dphitau
:= \max_z \max_{t\geq1} \EE_{x_{1:t+\tau}} \[
    D_{KL}[p(z_{t+\tau}|x_{1:t+\tau})||p(z_{t+\tau}|z_t=z;x_{1:t+\tau})] \]
\ee
is a measure of how much information the latent state $z_t$ at any time $t$ can possibly contain about a future latent state $z_{t+\tau}$.
\end{lemma}
\begin{proof}
We will use the shorthand notation $\dta:=\one(a_t=a)$ for the indicator function which picks out times $t$ for a given action $a$. 
First, we observe that the expectation of $\error^{(a)}$ (conditional on any action sequence $a_{1:T}$) %%over possible histories $x_{1:T}$, 
is zero:
\begin{align}
    \EE[\error^{(a)}_z|a_{1:T}] &= 
    \EE_{x_{1:T}}\big[\EE[\error^{(a)}_z|x_{1:T},a_{1:T}]\big] \nn \\
%    \EE_{x_{1:t}}[\EE_{x_{t+1:T}}[\biasmatrix_{z,z'}]]
    &= %(CHECKED)
    \EE_{x_{1:T}}\Big[\sum_{t=1}^T \dta p(z_t=z|x_{1:t})\EE[r_t|x_{1:T},a_t=a] \nn \\
    & \ \ \ \ \ \ \ \ - \sum_{t=1}^T \dta p(z_t=z|x_{1:t})\sum_{z'}p(z_t=z'|x_{1:t})(\mustara)_{z'} \Big] \nn \\
    &= %(CHECKED)
    \sum_{t=1}^T \dta
    \EE_{x_{1:T}}\[p(z_t=z|x_{1:t})\sum_{z'}(p(z_t=z'|x_{1:T}) - p(z_t=z'|x_{1:t}))(\mustara)_{z'} \] \nn \\
    &= %(CHECKED)
    \sum_{t=1}^T \dta
    \sum_{z'} (\mustara)_{z'}\cdot
    \EE_{x_{1:t}}\[
    p(z_t=z|x_{1:t})(\EE_{x_{t+1:T}}[p(z_t=z'|x_{1:T})] - p(z_t=z'|x_{1:t}))\] \nn \\
    &= %(CHECKED)
    \sum_{t=1}^T \dta
    \sum_{z'}(\mustara)_{z'}\cdot
    \EE_{x_{1:t}}[
    p(z_t=z|x_{1:t})(p(z_t=z'|x_{1:t}) - p(z_t=z'|x_{1:t}))] = 0.
    \label{eq:expected_error_zero}
\end{align}
Here, we have used the fact that $\EE[r_t|x_{1:T},a_t=a]=\sum_{z'} p(z_t=z'|x_{1:T})(\mustara)_{z'}$ to take the expectation over reward data, followed by the partial expectation over context data $x_{t+1:T}$.

Since $\EE[\error^{(a)}_z]=0$, we compute the variance to obtain an upper bound on $|\error^{(a)}_z|$.
To compute the variance of the vector element $\error^{(a)}_z$, we first take the expectation over rewards, conditional on a specific context history $x_{1:T}$.
Defining the reward noise
\be %CHECKED
\eta^{(a)}_t
:= r_t - \sum_{z'} p(z_t=z'|x_{1:t})(\mustara)_{z'}
= r_t - p_t^\top \mustara,
\ee
so that for brevity we can write $\error^{(a)} = \sum_t \dta p_t\eta_t^{(a)}$, or equivalently
\be %CHECKED
\error^{(a)}_z = \sum_{t=1}^T \dta p(z_t=z|x_{1:t}) \eta^{(a)}_t, \nn
\ee
we have (for any $(z_1,z_2)$)
\begin{align}
    \EE[\error^{(a)}_{z_1}\error^{(a)}_{z_2}|x_{1:T},a_{1:T}]
    & = %CHECKED
    \sum_{t,t'} \dta \dtpa
    p(z_t=z_1|x_{1:t})p(z_{t'}=z_2|x_{1:t'})
    \cdot\EE[\eta^{(a)}_t\eta^{(a)}_{t'}|x_{1:T},a_t=a_{t'}=a]. \label{eq:numerator_cov_xcond}
\end{align}
Since $\EE[r_t|x_{1:T},a_t=a]=\sum_z p(z_t=z|x_{1:T})(\mustara)_z$ and
$\EE[r_t r_{t'}|x_{1:T},a_t=a_{t'}=a] = \sum_{z,z'}p(z_t=z,z_{t'}=z'|x_{1:T})(\mustara)_z(\mustara)_{z'}$, %CHECKED
the correlation between reward noise at times $t$ and $t'\neq t$ is
\begin{align} \label{eq:eta_cov}
    &({\rm for} \  t\neq t') \nn \\ &\EE[\eta_t\eta_{t'}|x_{1:T},a_t=a_{t'}=a] = %(CHECKED)
    \nn \\
    & \ \ \ \sum_{z,z'}(\mustara)_z(\mustara)_{z'}\big[p(z_t=z,z_{t'}=z'|x_{1:T}) - p(z_t=z|x_{1:t})p(z_{t'}=z'|x_{1:T}) \nn \\
    & \ \ \ \ \ \ - p(z_t=z|x_{1:T})p(z_{t'}=z'|x_{1:t'})
    + p(z_t=z|x_{1:t})p(z_{t'}=z'|x_{1:t'})
    \big].
\end{align}
When $t=t'$ we have
\begin{align}
    \EE[\eta_t^2|x_{1:T},a_t=a]
    &= %CHECKED
    \sum_z p(z_t=z|x_{1:T})((\stdevza)^2 + [(\mustara)_z]^2)
    \nn \\
    & \ \ \ - 2\Big(\sum_z p(z_t=z|x_{1:t})(\mustara)_z\Big)\Big(\sum_{z'} p(z_t=z'|x_{1:T})(\mustara)_{z'}\Big) \nn \\
    & \ \ \ + \Big(\sum_z p(z_t=z|x_{1:t})(\mustara)_z\Big)^2,
    \label{eq:eta_tt}
%% \leq \stdevzmax^2,
\end{align}
where
\be
(\stdevza)^2 := \EE_{r\sim p(\cdot|z,a)}[r^2] - \EE_{r\sim p(\cdot|z,a)}[r]^2 = \EE_{r\sim p(\cdot|z,a)}[r^2] - [(\mustara)_z]^2.
\ee
%% and $\stdevzmax^2 := \max_z\stdevz^2$. 

We now take the expectation over $x_{1:T}$.
Because Eq. \eqref{eq:numerator_cov_xcond} only depends on $x_{t'+1:T}$ via the conditional expectation of reward noise $\EE[\eta_t\eta_{t'}|x_{1:T}]$, we can take the partial expectation over $x_{t'+1:T}$ as follows:
\begin{align}
    & \EE[\error^{(a)}_{z_1}\error^{(a)}_{z_2}|a_{1:T}] = \EE_{x_{1:T}}\[\EE[\error^{(a)}_{z_1}\error^{(a)}_{z_2}|x_{1:T},a_{1:T}] \] \nn \\
    & \ \ \ = %CHECKED
    2\sum_{t,t'>t} \dta \dtpa
    \EE_{x_{1:t'}} \[ p(z_t=z_1|x_{1:t})p(z_{t'}=z_2|x_{1:t'})
    \cdot\EE_{x_{t'+1:T}}[\EE[\eta_t\eta_{t'}|x_{1:T},a_t=a_{t'}=a]] \] \nn \\
    & \ \ \ \ \ \ + \sum_t \dta
    \EE_{x_{1:t}}\[p(z_t=z_1|x_{1:t})p(z_t=z_2|x_{1:t})
    \cdot\EE_{x_{t+1:T}}[\EE[\eta_t^2|x_{1:T},a_t=a]] \]
    \label{eq:numerator_cov_2}
\end{align}
where we have decomposed the double sum over time as $\sum_{t,t'}=\sum_{t=t'}+2\sum_{t,t'>t}$. 
Using Eq. \eqref{eq:eta_cov} for the $t<t'$ terms, we have
\begin{align}
    ({\rm for} \  t < t') \ \
    & \EE_{x_{t'+1:T}}[\EE[\eta_t\eta_{t'}|x_{1:T},a_t=a_{t'}=a]]
    \nn \\
    & = %CHECKED
    \sum_{z,z'}(\mustara)_z (\mustara)_{z'}
    \big[ \EE_{x_{t'+1:T}}[p(z_t=z,z_{t'}=z'|x_{1:T})]
    \nn \\
    & \ \ \ \ \ \ \ \ \ \ \ \ -
    p(z_t=z|x_{1:t})\EE_{x_{t'+1:T}}[p(z_{t'}=z'|x_{1:T})]
    \nn \\
    & \ \ \ \ \ \ \ \ \ \ \ \ - \EE_{x_{t'+1:T}}[p(z_t=z|x_{1:T})]p(z_{t'}=z'|x_{1:t'})
    \nn \\
    & \ \ \ \ \ \ \ \ \ \ \ \
    + p(z_t=z|x_{1:t})p(z_{t'}=z'|x_{1:t'}) \big]
    \nn \\
    & = %CHECKED
    \sum_{z,z'} (\mustara)_z (\mustara)_{z'}
    \big[ p(z_t=z,z_{t'}=z'|x_{1:t'}) - p(z_t=z|x_{1:t})p(z_{t'}=z'|x_{1:t'}) \nn \\
    & \ \ \ \ \ \ \ \ \ \ \ \ - p(z_t=z|x_{1:t'})p(z_{t'}=z'|x_{1:t'})
    + p(z_t=z|x_{1:t})p(z_{t'}=z'|x_{1:t'})
    \big] \nn \\
    & = %CHECKED
    \sum_{z,z'} (\mustara)_z (\mustara)_{z'}
    p(z_t=z|x_{1:t'})\big( p(z_{t'}=z'|z_t=z,x_{1:t'}) - p(z_{t'}=z'|x_{1:t'}) \big)  \label{eq:eta_cov_final}
\end{align}
In the second equality we have taken the partial expectations over $x_{t'+1:T}$, and in the last line we have cancelled two equivalent terms and factored the joint distribution over $(z,z')$ into a marginal and conditional. 
Similarly, using Eq. \eqref{eq:eta_tt} for the $t'=t$ terms, we have
\begin{align}
    \EE[\eta_t^2|x_{1:t},a_t=a] &=
    \EE_{x_{t+1:T}}[\EE[\eta_t^2|x_{1:T},a_t=a]] \nn \\
    &= %CHECKED
    \sum_z p(z_t=z|x_{1:t})((\stdevza)^2 + [(\mustara)_z]^2) - \Big(\sum_z p(z_t=z|x_{1:t})(\mustara)_z\Big)^2
    \nn \\
    &\leq %CHECKED
    \sum_z p(z_t=z|x_{1:t})((\stdevza)^2 + [(\mustara)_z]^2). \label{eq:eta_var_final}
\end{align}
%\be
%\EE[\eta_t^2] = 
%\EE_{x_{1:t}}[\EE[\eta_t^2|x_{1:t}]] \leq %%CHECKED
%\sum_z\rhoeq(z)(\stdevz^2+(\mu^\star_z)^2). \nn
%\ee
Substituting Eqs. \eqref{eq:eta_cov_final} and \eqref{eq:eta_var_final} into Eq. \eqref{eq:numerator_cov_2}, taking the absolute value to obtain an upper bound, 
using $p(z_{t}=z_1|x_{1:t})p(z_{t'}=z_2|x_{1:t'})\leq1$ and $ p(z_t=z|x_{1:t'})\leq1$ to simplify the expression, 
using the fact that $\EE_{x_{1:t}}[p(z_t=z|x_{1:t})]=\rhoeq(z)$ %%assumption that $z_0\sim\rhoeq(\cdot)$ to set $\EE_{x_{1:t}}[p(z_t=z|x_{1:t})]=\rhoeq(z)$
in the $t=t'$ contribution,
and setting $z_1=z_2$ for simplicity, 
we have
\begin{align}
    {\rm Var}[\error^{(a)}_{z_1}] &\leq
    %CHECKED
    T \sum_z\rhoeq(z)((\stdevza)^2+[(\mustara)_z]^2) \nn \\
    & \ \ + \sum_{z,z'}|(\mustara)_z (\mustara)_{z'}|\times
    2\sum_{t,t'>t}
    \EE_{x_{1:t'}} \[
    %% p(z_t=z|x_{1:t'}) \cdot
    |p(z_{t'}=z'|z_t=z,x_{1:t'}) - p(z_{t'}=z'|x_{1:t'})| \] \label{eq:g_var}
\end{align}
We have also used $\dta\dtpa\leq1$ and have removed the action-conditioning on ${\rm Var}[\error^{(a)}]$, since after setting $\dta\dtpa\leq1$ the right-hand side no longer depends on the action sequence, and thus the inequality holds for any action sequence. 
Introducing a free parameter $\tau_1$ satisfying $1\leq\tau_1\leq t'-t$, we split the expectation over $x_{1:t'}$ into an outer expectation over $x_{1:t+\tau_1}$ and an inner partial expectation over $x_{t+\tau_1+1:t'}$, and apply Corollary~\ref{corollary:tvd_bound_expected} to bound the inner expectation of the difference in conditional probabilities:
\begin{align}
%%    & \EE_{x_{1:t'}} \[ p(z_t=z|x_{1:t})
%%    \cdot|p(z_{t'}=z'|z_t=z,x_{1:t'}) - p(z_{t'}=z'|x_{1:t'})| \]
%%    \EE_{x_{1:t}} \[ p(z_t=z|x_{1:t})\cdot ...... \]
    & \EE_{x_{1:t'}} [|p(z_{t'}=z'|z_t=z,x_{1:t'}) - p(z_{t'}=z'|x_{1:t'})|] \nn \\
    & = \EE_{x_{1:t+\tau_1}}
    \EE_{x_{t+\tau_1+1:t'}} [|p(z_{t'}=z'|z_t=z,x_{1:t'}) - p(z_{t'}=z'|x_{1:t'})|] \nn \\
    & \leq %(CHECKED~ish)
    e^{-\frac{1}{2}\gamma_\phi(t'-(t+\tau_1))}
    \EE_{x_{1:t+\tau_1}}\[ %% p(z_t=z|x_{1:t})
    \sqrt{2 D_{KL}[p(z_{t+\tau_1}|x_{1:t+\tau_1})||p(z_{t+\tau_1}|z_t=z;x_{1:t+\tau_1})]} \]
    \nn \\
    & \leq %CHECKED
    e^{-\frac{1}{2}\gamma_\phi(t'-(t+\tau_1))}
    \sqrt{2\cdot \EE_{x_{1:t+\tau_1}} \[
    D_{KL}[p(z_{t+\tau_1}|x_{1:t+\tau_1})||p(z_{t+\tau_1}|z_t=z;x_{1:t+\tau_1})] \] } \nn \\
    & \leq %CHECKED
    e^{-\frac{1}{2}\gamma_\phi(t'-(t+\tau_1))}
    \sqrt{2\Dphitau}. \nn
\end{align}
In the second inequality, we have applied Jensen's inequality to bring the expectation inside the square root.
In the last line, we have recalled the definition of $\Dphitau$ in Eq. \eqref{eq:D_phi_tau_def}.  For $\tau_1\gg 1/\gamma_\phi$, the latent state will have evolved through multiple mixing times, so we expect $\Dphitau$ to become small, decreasing to zero as $\tau_1\rarr\infty$.

% For small $t'-t$, the exponential factor does not introduce any significantly suppression.
We now introduce a second free parameter $\tau_0\in\mathbb{N}$ (which we will optimize below), and use it to decompose the sum over $t'-t$ into a contribution from widely separated times, $t'-t>\tau_0$, where the exponential suppression is strong, and a contribution from nearby times, $t'-t\leq\tau_0$, over which the posterior probabilities may be more strongly correlated and there is not significant exponential suppression:
\begin{align} %CHECKED
    {\rm Var}[\error^{(a)}_{z_1}] &\leq 
    T\sum_z\rhoeq(z)((\stdevza)^2+[(\mustara)_z]^2)
    \label{eq:eps_cov} \\
    & \ \ \ + 2||\mustara||_1^2
    \sum_{t,t'>t} \Big[ \one(t'-t\leq\tau_0)
    + \one(t'-t>\tau_0)
    e^{-\frac{1}{2}\gamma_\phi(t'-(t+\tau_1))} \sqrt{2\Dphitau} \Big]. \nn
\end{align}
Here, we have used the fact that 
\be
\sum_{z,z'}|(\mustara)_z (\mustara)_{z'}| =
\sum_z|(\mustara)_z|\times\sum_{z'}|(\mustara)_{z'}| = ||\mustara||_1^2, \nn
\ee
and (in the $t'-t\leq\tau_0$ term) the fact that the difference of probabilities in Eq. \eqref{eq:g_var} is between 0 and 1.  
%% Eq. \eqref{eq:eta_cov_final} to upper bound $\EE[\eta_t\eta_{t'}|x_{1:t'},a_t=a_{t'}=a]\leq||\mustara||_1^2$ in the $1\leq t'-t\leq\tau_0$ contribution.
The $t'-t>\tau_0$ contribution can be upper bounded as follows:
\be
\sum_{t,t'>t}\one(t'-t>\tau_0)
e^{-\frac{1}{2}\gamma_\phi(t'-(t+\tau_1))}
\leq %CHECKED
T\sum_{\tau=\tau_0+1}^T e^{-\frac{1}{2}\gamma_\phi(\tau-\tau_1)}
\leq %(CHECKED)
T\int_{\tau_0}^\infty d\tau e^{-\frac{1}{2}\gamma_\phi(\tau-\tau_1)}
= %CHECKED
\frac{2T}{\gamma_\phi}e^{-\frac{1}{2}\gamma_\phi(\tau_0-\tau_1)}. \nn
\ee
Here, we have used monotonicity with respect to $\tau$ to bound the discrete sum with a continuous integral.
Using this in Eq. \eqref{eq:eps_cov}, we have
\be
{\rm Var}[\error^{(a)}_{z_1}]  \leq %CHECKED
T\sum_z\rhoeq(z)((\stdevza)^2+[(\mustara)_z]^2)
+ 2||\mustara||_1^2 \Big(
T\tau_0 + \frac{2T}{\gamma_\phi}e^{-\frac{1}{2}\gamma_\phi(\tau_0-\tau_1)}\sqrt{2\Dphitau}
\Big).
\ee
Setting to zero the derivative with respect to $\tau_0$, and solving for $\tau_0$, we find the optimal value %(rounded up to the nearest integer)
\be %CHECKED
\tau^\star_0 :=
%\lceil
\tau_1 +  \frac{1}{\gamma_\phi}\log(2\Dphitau), %\rceil,
\nn
\ee
for which the upper bound becomes
\be %CHECKED
{\rm Var}[\error^{(a)}_{z_1}] \leq
T\sum_z\rhoeq(z)((\stdevza)^2+[(\mustara)_z]^2)
+ 2T||\mustara||_1^2
\Big(\tau_1+\frac{1}{\gamma_\phi}\big(2+\log\Dphitau\big)\Big). \nn
\ee
%%% Defining $\cc:=\tau_1\gamma_\phi$ to express the free parameter $\tau_1$ in terms of the mixing rate $\gamma_\phi$, we have
%%% \be %CHECKED
%%% {\rm Var}[\error_{z_1}] \leq
%%% T\sum_z\rhoeq(z)(\stdevz^2+(\mu^\star_z)^2)
%%% + \frac{2T}{\gamma_\phi}||\mu^\star||_1^2
%%% \big(2+\cc+\log\Dphiconst\big). \nn
%%% \ee
%% \cyan{We now make the mild assumption that there exists a finite value of $\cc$ at which $\log\Dphiconst<\cc$, and set $\cc$ to the minimal such value, which we call $\log\cdkl$. (The value of $\cc$ should be an $O(1)$ numerical constant.)}
We now approximately optimize $\tau_1$ by setting it equal to the value $\tau_1^\star$ at which $\gamma_{\phi}\tau^\star_1=\log D_\phi(\tau_1^\star):=\log\cdkl$.
% -- as stated in Lemma~\ref{lemma:partial_bound} above --
% as the solution to $\log D_\phi(\cdkl/\gamma_{\phi})=\cdkl$.
Furthermore, since $\sum_z\rhoeq(z)((\mustara)_z)^2\leq\sum_z((\mustara)_z)^2=||\mustara||_2^2\leq||\mustara||_1^2$ and $1/\gamma_\phi\geq1$, the expression for ${\rm Var}[\error^{(a)}_z]$ simplifies to:
\be
{\rm Var}[\error^{(a)}_z] \leq %CHECKED
T \Big( \stdeveq^2 + ||\mustara||_1^2\frac{4}{\gamma_\phi}\big(1+\log\cdkl\big) \Big),
\ee
where
\be\label{eq:stdeveq_def}
\stdeveq^2:=\max_a\sum_z\rhoeq(z)(\stdevza)^2.
\ee
Finally, we apply Chebyshev's inequality, which states that
\be %(CHECKED)
|\error^{(a)}_z-\EE[\error^{(a)}_z]|<\sqrt{\frac{{\rm Var}[\error^{(a)}_z]}{\delta}}
\ee
with probability at least $1-\delta$ for any $\delta\in(0,1)$. 
Recalling from Eq. \eqref{eq:expected_error_zero} that $\EE[\error^{(a)}_z]=0$, 
we recover %CHECKED
Eq. \eqref{eq:partial_bound} above.
\end{proof}

\subsection{Bound on the inverse covariance matrix}
\label{app:cov}

In this section we derive a theoretical bound on the action-wise inverse covariance matrix $\covinv^{(a)}$ in the $T\rarr\infty$ limit.

We will
(i) use a mild assumption on the frequency with which optimal actions are selected in order to lower bound the expected elements $\EE[\covinv_{z,z'}^{(a)}]$ of the action-wise inverse covariance matrices, 
%(i) show that the expectation $\EE[\covinv^{(a)}]$ converges to a fixed asymptotic matrix as $T\rarr\infty$, 
(ii) show that the variance around this expectation decreases as $1/(\gamma_\phi T)$, 
and (iii) combine these results to obtain a high-probability lower bound on the empirical inverse covariance matrix $\covinv^{(a)}$.
%on the deviation of the empirical inverse covariance matrix $\covinv^{(a)}$ from its asymptotic expected form

Recalling that the context history $x_{1:t}$ determines (conditional on the true task parameters\footnote{We restore the $^\star$ notation in Eq. \eqref{eq:a_star_t_def} to denote this.}) an optimal action
\be\label{eq:a_star_t_def}
a^\star_t %% = a^\star(x_{1:t}) 
:= \argmax_a \sum_z p^\star(z_t=z|x_{1:t})(\mustara)_z,
\ee
we state the lower bound of point (i) above:
\begin{lemma}\label{lemma:expected_Bzz}
Assuming that at any $t$ the optimal action given $x_{1:t}$, Eq. \eqref{eq:a_star_t_def}, 
%% $a^\star_t=\argmax_a \sum_z p(z_t=z|x_{1:t})(\mustara)_z$
is selected by a policy $\pi$ with probability at least $\pi_{\rm min}>0$, 
the expectation over histories $x_{1:T}$ of the empirical inverse covariance matrix, 
$\covinv^{(a)}$, %% :=\ctxmatrix\ctxmatrix^\tp
satisfies the lower bound
\be\label{eq:expected_B_bound}
\frac{1}{T}\EE[\covinv^{(a)}(T)] \succcurlyeq
\pi_{\rm min} \Beq^{(a)}(T),
\ee
where $A\succcurlyeq B$ indicates that $A-B$ is positive semidefinite,
and
\be\label{eq:B_eq_element_def}
\Beq^{(a)}_{zz'}(T) := \frac{1}{T}\sum_{t=1}^T \EE_{x_{1:t}}\[\one(a=a^\star_t)p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t})\].
\ee
\end{lemma}
\begin{proof}
We first express the expectation value of the matrix element $\covinv^{(a)}_{zz'}$ as a sum over expected values at each time,
\begin{align}
\EE[\covinv^{(a)}_{zz'}(T)]
& = %CHECKED
\sum_{t=1}^T 
\EE_{x_{1:t}} \[
\EE_{r_{1:t-1},a_{1:t-1}|x_{1:t}} [
\one(a_t=a)] %% |x_{1:t},r_{1:t-1},a_{1:t-1}
p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t}) \] \nn \\
&= \sum_{t=1}^T 
\EE_{x_{1:t}} \[
P_\pi(a_t=a|x_{1:t})
p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t}) \].
\label{eq:B_a_expectation}
\end{align}
In the first line, we have decomposed the expectation into an inner context-conditioned expectation over actions and rewards, and an outer expectation over contexts. 
The former only involves the binary indicator $\one(a_t=a)$, and is the probability
\be
P_\pi(a_t=a|x_{1:t}) := %CHECKED
\EE_{r_{1:t-1},a_{1:t-1}|x_{1:t}}
[\one(a_t=a)] %% |x_{1:t},r_{1:t-1},a_{1:t-1})]
\ee
that a given policy $\pi$ selects action $a_t=a$ conditional on the context history $x_{1:t}$.
As stated in Theorem~\ref{theorem:estimator}, we make the mild assumption that the optimal action $a^\star_t$ is selected with a minimal nonzero probability $\pi_{\rm min}$. (Any policy that learns the task should converge to $\pi_{\rm min}\rarr1$ as $T\rarr\infty$.)
That is,
\be\label{eq:pi_min_def}
P_\pi(a_t=a|x_{1:t}) \geq \pi_{\rm min}\cdot\one(a=a^\star_t),
\ee
where we conservatively lower bound the probability at zero for $a\neq a^\star_t$.
%Using Eq. \eqref{eq:pi_min_def} in Eq. \eqref{eq:B_a_expectation}, we have
%\be\label{eq:B_expectation_lowerbound}
%\EE[\covinv^{(a)}_{zz'}(T)] \geq %CHECKED
%\pi_{\rm min} \sum_{t=1}^T \EE_{x_{1:t}} [\one(a=a^\star_t) p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t}) ].
%\ee
%%A particular history-dependent realization of $\covinv^{(a)}_{zz'}$ may fall below the lower limit given in Eq. \eqref{eq:B_expectation_lowerbound}. 
Since the rank one matrix $p_t p_t^\top$ with elements
\be
(p_t p_t^\top)_{z,z'} = p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t}) \nn
\ee
is positive semidefinite\footnote{This matrix has $Z-1$ zero eigenvalues, and a nonzero eigenvalue $\sum_z p(z_t=z|x_{1:t})^2$.} for any $x_{1:t}$, Eq. \eqref{eq:pi_min_def} implies that, for any $p_t$,
$$P_\pi(a_t=a|x_{1:t}) p_t p_t^\top \succcurlyeq\pi_{\rm min}\cdot\one(a=a^\star_t)p_t p_t^\top$$ %CHECKED
and hence
$$\EE_{x_{1:t}}[P_\pi(a_t=a|x_{1:t})p_t p_t^\top]
\succcurlyeq
\pi_{\rm min} \EE_{x_{1:t}}[\one(a=a^\star_t)p_t p_t^\top].$$
Applying this bound to each matrix term of $\EE[\covinv^{(a)}_{zz'}(T)]$ in Eq. \eqref{eq:B_a_expectation} 
%% and in the lower bound, Eq. \eqref{eq:B_expectation_lowerbound}, 
we see that
\be\label{eq:B_expectation_lowerbound_matrix}
\EE[\covinv^{(a)}(T)]
\succcurlyeq %(CHECKED)
\pi_{\rm min} \cdot T\cdot \Beq^{(a)}(T),
\ee
with $\Beq^{(a)}(T)$ defined in Eq. \eqref{eq:B_eq_element_def}.
Hence we recover the matrix lower bound Eq. \eqref{eq:expected_B_bound} above.
\end{proof}

We now show that the variance of the empirical matrix $\covinv$ around its asymptotic expected form can be upper bounded:

\begin{lemma}\label{lemma:var_Bzz}
When %the latent Markov process is in equilibrium, with $z_0\sim\rhoeq(\cdot)$, and when 
the ground truth parameters $(\theta,\Phi)$ are known, the variance across histories $x_{1:T}$ of the empirical inverse covariance matrix element $\covinv_{zz'}(T)$ satisfies the upper bound
\be\label{eq:B_var_bound}
{\rm Var}\[\frac{1}{T} \covinv^{(a)}_{zz'}(T)\] \leq \frac{2}{\gamma_\phi T}(\ca + \log\log(1/\rhomin)),
\ee
where $\ca\approx 6.78$, and $\rhomin:=\min_z\rhoeq(z)$ is the equilibrium probability of the least probable latent state.
\end{lemma}
\begin{proof}
The variance over context histories $x_{1:T}$ of the matrix element $\covinv_{zz'}(T)$, conditioned on actions $a_{1:T}$ (and using the shorthand notation $\dta=\one(a_t=a)$), is
\begin{align}
    & {\rm Var}[\covinv^{(a)}_{zz'}|a_{1:T}] = \EE_{x_{1:T}}[(\covinv^{(a)}_{zz'})^2|a_{1:T}] - \EE_{x_{1:T}}[\covinv^{(a)}_{zz'}|a_{1:T}]^2 \nn \\
    & \ \ \ = %CHECKED
    \sum_{t,t'} \dta \dtpa \Big(\EE_{x_{1:t'}}[
% [LONG VERSION]    
%    p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t})
%    p(z_{t'=z|x_{1:t'})p(z_{t'}=z'|x_{1:t'})] \nn \\
%    & \ \ \ - \EE_{x_{1:T}}[p(z_t=z|x_{1:t})p(z_t=z'|x_{1:t})]
%    \times \EE_{x_{1:T}}[p(z_{t'}=z|x_{1:t'})p(z_{t'}=z'|x_{1:t'})]
    p_{1:t}(z)p_{1:t}(z')p_{1:t'}(z)p_{1:t'}(z')]
    - \EE_{x_{1:t'}}[p_{1:t}(z)p_{1:t}(z')]
    \EE_{x_{1:t'}}[p_{1:t'}(z)p_{1:t'}(z')]
    \Big).
    \label{eq:var_B_element}
%% \EE[p_t(z)p_t(z')p_{t'}(z)...]...
%% use shorthand p_t(z) etc.?
\end{align}
Here, we have trivially taken the expectation over $x_{t'+1:T}$.
Using again the shorthand notation $p_{t:t'}(z):=p(z_{t'}=z|x_{t:t'})$
(with $p_{t:t'}\in\RR^Z$ denoting the vector of probabilities),
%and $\delta p_t(z;\tau)$ 
%defined in Eq. \eqref{eq:prob_conditional_range},
%and \eqref{eq:delta_p_def}, 
and defining
\be\label{eq:delta_p_def}
\delta p_t(z;\tau) := p_{1:t}(z) - p_{t-\tau+1:t}(z),
\ee
we can write, for $t'>t$,
\begin{align}
    p_{1:t'}(z)p_{1:t'}(z') = (p_{t+1:t'}(z) + \delta p_{t'}(z;t'-t))(p_{t+1:t'}(z') + \delta p_{t'}(z';t'-t)). \label{eq:pp_decomp}
\end{align}
\iffalse
    \EE_{x_{1:T}}[p_{1:t'}(z)p_{1:t'}(z')]
    &= %(CHECKED)
    \EE_{x_{t+1:t'}}[p_{t+1:t'}(z)p_{t+1:t'}(z')]
    \nn \\ & \ \ 
    + \EE_{x_{1:t'}}[\delta p_{t'}(z;t'-t)p_{t+1:t'}(z')] 
    \nn \\ & \ \ 
    + \EE_{x_{1:t'}}[p_{t+1:t'}(z)\delta p_{t'}(z';t'-t)]
    \nn \\ & \ \ 
    + \EE_{x_{1:t'}}[\delta p_{t'}(z;t'-t)\delta p_{t'}(z';t'-t)], \nn
\fi
Using Corollary~\ref{corollary:tvd_bound_expected} to bound the expectation over $x_{t+1:t'}$, and using the fact that %CHECKED
$$D_{KL}[p_{1:t}||\rhoeq]=\sum_z p_{1:t}(z)\log\(\frac{p_{1:t}(z)}{\rhoeq(z)}\)\leq\sum_z p_{1:t}(z)\log(1/\rhomin)=\log(1/\rhomin),$$
we have
\be
\EE_{x_{1:t'}}\Big[\sum_z|\delta p_{t'}(z;\tau)|\Big] 
\leq e^{-\frac{1}{2}\gamma_\phi\tau}
\EE_{x_{1:t}}[\sqrt{2 D_{KL}[p_{1:t}||\rhoeq]}]
\leq 
e^{-\frac{1}{2}\gamma_\phi\tau}\sqrt{2\log(1/\rhomin)} := u(\tau).
\label{eq:delta_p_bound}
\ee
Thus, for $t'>t$,
\begin{align}
    | \EE_{x_{1:t'}}[p_{1:t'}(z)p_{1:t'}(z')] 
    - \EE_{x_{t+1:t'}}[p_{t+1:t'}(z)p_{t+1:t'}(z')] |
    &\leq %CHECKED
    \EE_{x_{1:t'}}[|\delta p_{t'}(z;t'-t)|]
    + \EE_{x_{1:t'}}[|\delta p_{t'}(z';t'-t)|] \nn \\
    & \ \ \ + \EE_{x_{1:t'}}[|\delta p_{t'}(z;t'-t)|\cdot|\delta p_{t'}(z';t'-t)|] \nn \\
    &\leq %CHECKED
    3u(t'-t), \label{eq:EppEpp_bound}
\end{align}
where we have used $p_{t+1:t'}\leq1$ and $|\delta p_{t'}|\leq1$ to conservatively bound the expectation.
Applying the decomposition in Eq. \eqref{eq:pp_decomp} again for the first term in Eq. \eqref{eq:var_B_element}, we have
\begin{align}
    \EE_{x_{1:t'}}[p_{1:t}(z)p_{1:t}(z')p_{1:t'}(z)p_{1:t'}(z')] \leq %(CHECKED)
    \EE_{x_{1:t}}[p_{1:t}(z)p_{1:t}(z')]\cdot\EE_{x_{t+1:t'}}[p_{t+1:t'}(z)p_{t+1:t'}(z')] + 3u(t'-t).
    \label{eq:Epppp_bound}
\end{align}
Here we have used the fact that $p_{1:t}(z)p_{1:t}(z')\leq1$ to simplify the last term. 
Combining Eq. \eqref{eq:EppEpp_bound} and \eqref{eq:Epppp_bound}, we have (for $t'>t$)
\be
| \EE_{x_{1:t'}}[p_{1:t}(z)p_{1:t}(z')p_{1:t'}(z)p_{1:t'}(z')] 
- \EE_{x_{1:t}}[p_{1:t}(z)p_{1:t}(z')] \EE_{x_{1:t'}}[p_{1:t'}(z)p_{1:t'}(z')] | \leq 6u(t'-t).
\ee
As in Lemma~\ref{lemma:partial_bound}, we now introduce a free parameter $\tau_0$, and break the sum in Eq. \eqref{eq:var_B_element} into contributions from small $|t'-t|$ (where the difference in Eq. \eqref{eq:var_B_element} may be large but cannot exceed one) and large $|t'-t|$ (where the upper bound on the difference in Eq. \eqref{eq:var_B_element} is strong). 
The variance ${\rm Var}[\covinv_{z,z'}]$, Eq. \eqref{eq:var_B_element}, can then be upper bounded:
\be
    {\rm Var}[\covinv^{(a)}_{zz'}|a_{1:T}]
    \leq %(CHECKED)
    \sum_{t,t'}\dta\dtpa
    \[\one(|t'-t|\leq\tau_0) + \one(|t'-t|>\tau_0)6u(|t'-t|)\]. \nn
\ee
Using $\dta\dtpa\leq1$ to apply the inequality for any action sequence $a_{1:T}$, and thus removing the action conditioning, we have
\be
    {\rm Var}[\covinv^{(a)}_{zz'}]
    \leq %(CHECKED)
    \sum_{t,t'}\one(|t'-t|\leq\tau_0) + 2\sum_{t,t'}\one(t'-t>\tau_0)6u(t'-t). \label{eq:var_B_element_bound}
\ee
Here, we have also used the symmetry of Eq. \eqref{eq:var_B_element} under exchange of $t$ and $t'$ to sum only over $t'>t$.
The bound on ${\rm Var}[\covinv_{z,z'}]$ becomes
\begin{align}
    {\rm Var}[\covinv^{(a)}_{zz'}] &\leq %(CHECKED)
    T(2\tau_0+1) + 12T\sum_{\tau=\tau_0+1}^T e^{-\frac{1}{2}\gamma_\phi\tau}
    \sqrt{2\log(1/\rhomin)} \nn \\
    & \leq T(2\tau_0+1) + 12T\sqrt{2\log(1/\rhomin)} \int_{\tau_0}^\infty d\tau e^{-\frac{1}{2}\gamma_\phi\tau} \nn \\
    & = %CHECKED
    T(2\tau_0+1) + 12T\sqrt{2\log(1/\rhomin)} \frac{2}{\gamma_\phi}e^{-\frac{1}{2}\gamma_\phi\tau_0} \nn
\end{align}
where we have again used the monotonicity with respect to $\tau$ to bound the discrete sum with a continuous integral.
We are now in a position to optimize the free parameter $\tau_0$ to make the bound as tight as possible.
Setting to zero the derivative with respect to $\tau_0$, and solving for $\tau_0$, we find the optimal value
\be %CHECKED
\tau^\star_0 := \frac{1}{\gamma_\phi}\log\(72\log(1/\rhomin)\),
\ee
for which the upper bound becomes
\begin{align}
    {\rm Var}[\covinv^{(a)}_{zz'}]
    &\leq %CHECKED
    T + 2\frac{T}{\gamma_\phi}\[2+\log\(72\log(1/\rhomin)\)\]
    \nn \\
    & \leq %CHECKED
    2\frac{T}{\gamma_\phi}(\ca+\log\log(1/\rhomin)), \nn
\end{align}
where we have used the fact that $\gamma_\phi\leq1$, and $\ca=\frac{5}{2}+\log72=\frac{5}{2}+3\log2+2\log3\approx6.78$.
\end{proof}
Note that the unusual log-log dependence in Eq. \eqref{eq:B_var_bound} originates in the exponential contraction in Eq.~\eqref{eq:kl_bound_expectation}, which suppresses an initial KL-distance that is already logarithmic in probabilities.

Finally, we
%% [OLD] combine Lemmas~\ref{lemma:expected_Bzz} and \ref{lemma:var_Bzz}, which bound $\EE[\covinv^{(a)}_{zz'}]$ and ${\rm Var}[\covinv^{(a)}_{zz'}]$, respectively, and
apply Chebyshev's inequality to bound the deviation of $\covinv^{(a)}_{zz'}$ from its asymptotic expected value:

\begin{lemma}\label{lemma:cov_chebyshev}
When %the latent Markov process is in equilibrium, with $z_0\sim\rhoeq(\cdot)$, and when 
the ground truth parameters $(\theta,\Phi)$ are known, any matrix element of the empirical inverse covariance matrix $\covinv^{(a)}(T)$, for any particular history $(x_{1:T},a_{1:T})$ of contexts and actions, satisfies the inequality
\be\label{eq:cov_chebyshev} %CHECKED
\frac{1}{T}|\covinv_{zz'}^{(a)}(T) - \EE[\covinv^{(a)}_{zz'}(T)]|
\leq 
\sqrt{\frac{1}{\delta}\frac{2}{\gamma_\phi T}(\ca + \log\log(1/\rhomin))}
\ee
where $\ca\approx 6.78$, with probability at least $1-\delta$, for any $\delta\in(0,1)$.
\end{lemma}
\begin{proof}
Chebyshev's inequality states that for any random variable $X$ with variance ${\rm Var}[X]$, $|X-\EE[X]|\leq\sqrt{{\rm Var}[X]/\delta}$ with probability at least $1-\delta$.
% [alternate] We will want to put a lower bound on $X$, so we rewrite the inequality (in slightly weakened form) as $X\geq \EE[X] - \sqrt{{\rm Var}[X]/\delta}$.
Setting $X=\frac{1}{T}\covinv_{zz'}^{(a)}$ and using Eq. \eqref{eq:B_var_bound} to upper bound the variance, we recover Eq. \eqref{eq:cov_chebyshev} above. %CHECKED
\end{proof}

\subsection{Bound on covariance matrix eigenvalues}
\label{app:eigen}

In Appendix~\ref{app:cov} we derived a high-probability upper bound on the deviation of the elements of the empirical inverse covariance matrix $\covinv^{(a)}$ from their asymptotic expected values.
We would like to convert this into a bound on the covariance matrix $(\covinv^{(a)})^{-1}$, in order to bound the estimator error $(\covinv^{(a)})^{-1}\error^{(a)}$, Eq. \eqref{eq:mu_error_appendix}.
In this section, we show that an element-wise bound such as Eq. \eqref{eq:cov_chebyshev} can be converted to an eigenvalue bound which can be applied to the inverse matrix.
%% using the fact that eigenvalues are converted to their reciprocals under matrix inversion.
%% Here we convert the matrix element-wise bound from Appendix~\ref{app:cov} to a lower bound on the minimal eigenvalue of the empirical inverse covariance matrix.

\begin{lemma}\label{lemma:eigen}
For symmetric matrices $\bar{M}$, $M=\bar{M}+\Delta M$, with $|\Delta M_{z,z'}|\leq U_\delta$ for any given $(z,z')$ with probability at least $1-\delta$, the minimal eigenvalue $\lambda_1$ of $M$ satisfies the lower bound
\be\label{eq:eigenvalue_bound}
\lambda_1\geq\bar{\lambda}_1 - Z U_\delta
\ee
with probability at least $1-Z\delta$, where $\bar{\lambda}_1$ is the minimal eigenvalue of $\bar{M}$.
\end{lemma}
\begin{proof}
Let $\lambda_1$ and $\bar{\lambda}_1$ be, respectively, the minimal eigenvalues of $M$ and $\bar{M}$.
Since $M$ and $\bar{M}$ are symmetric, $\Delta M$ is also symmetric.
The Weyl inequality for symmetric, real-valued square matrices states that if $\bar{\lambda}_1$ and $\lambda_1^{(\Delta)}$ are the minimal eigenvalues of matrices $\bar{M}$ and $\Delta M$, then the minimal eigenvalue $\lambda_1$ of the matrix sum $\bar{M}+\Delta M$ satisfies the lower bound
\be\label{eq:weyl_ineq}
\lambda_1\geq\bar{\lambda}_1+\lambda_1^{(\Delta)}.
\ee
The Gershgorin circle theorem can be used to bound the eigenvalue $\lambda_1^{(\Delta)}$ in terms of the matrix elements $\Delta M_{z,z'}$. For a real square matrix $A$, the Gershgorin circle theorem states that the $i$'th eigenvalue satisfies the inequality
\be
|\lambda_i - A_{ii}|\leq \sum_{j\neq i}|A_{ij}|, \nn
\ee
which implies that
\be\label{eq:gershgorin} %(CHECKED)
|\lambda_i| \leq \sum_j |A_{ij}|.
\ee
Applying Eq. \eqref{eq:gershgorin} to any eigenvalue $\lambda^{(\Delta)}_z$ of $\Delta M$, %% for which $|\Delta M_{zz}|\leq U_\delta$ and $\sum_{z'\neq z}|\Delta M_{zz'}|\leq (Z-1)U\delta$, we have
we have
\be\label{eq:gershgorin_highprob}
|\lambda^{(\Delta)}_z| \leq %CHECKED
\sum_{z'}|\Delta M_{zz'}| \leq %CHECKED
Z U_\delta .
\ee
Since Eq. \eqref{eq:gershgorin_highprob} only holds if $|\Delta M_{zz'}|\leq U_\delta$ for all $z'$, the probability of the bound is at least $(1-\delta)^Z > 1-Z\delta$.
%% If $|\Delta M_{zz'}|\leq U_\delta$ for a given $(z,z')$ with probability at least $1-\delta$, then conservatively, $|\Delta M_{zz'}|\leq U_\delta$ holds for all $(z,z')$ with probability at least $1-Z^2\delta$. 
Combining Eq. \eqref{eq:gershgorin_highprob} with Eq. \eqref{eq:weyl_ineq}, we recover Eq. \eqref{eq:eigenvalue_bound}. %CHECKED
\end{proof}

We now use the element-wise bound on $\covinv^{(a)}_{zz'}$ from Lemma~\ref{lemma:cov_chebyshev} to apply Lemma~\ref{lemma:eigen} to the minimal eigenvalue of the inverse covariance matrix $\covinv^{(a)}$, which immediately translates into an upper bound on the maximal eigenvalue of $(\covinv^{(a)})^{-1}$.
\begin{lemma}\label{lemma:eigen_cov}
Under the same conditions as Lemma~\ref{lemma:cov_chebyshev}, the minimal eigenvalue $\lambda^{(a)}_1(T)$ of the empirical inverse covariance matrix $\frac{1}{T}\covinv^{(a)}(T)$ satisfies the lower bound
\be\label{eq:eigenvalue_bound_cov}
\lambda^{(a)}_1(T) \geq \lambda_{\rm min}^{(a)}(T)/\cb,
\ee
where $\lambda^{(a)}_{\rm min}(T)$ is the minimal eigenvalue of $\Beq^{(a)}(T)$ defined in Eq. \eqref{eq:B_eq_element_def},
with probability at least $1-\delta_\lambda$, where
\be\label{eq:delta_lambda_def}
\delta_\lambda := %CHECKED
\frac{Z^3}{(\lambda^{(a)}_{\rm min}(T))^2}\(\pi_{\rm min}-\cb^{-1}\)^{-2}\frac{2}{T\gamma_\phi}(\ca+\log\log(1/\rhomin)),
\ee
for any $\cb\in(\cb_{\rm max},\infty)$, with
\be\label{eq:c_max}
\frac{1}{\cb_{\rm max}} = \pi_{\rm min}-\frac{Z}{\lambda^{(a)}_{\rm min}(T)}\sqrt{\frac{2}{T\gamma_\phi}(\ca+\log\log(1/\rhomin))}.
\ee
%% with all quantities as defined in Lemma~\ref{lemma:cov_chebyshev}.
\end{lemma}
\begin{proof}
Recalling Eq. \eqref{eq:cov_chebyshev}, we apply Lemma~\ref{lemma:eigen} with
\be %CHECKED
\bar{M}\rarr \frac{1}{T}\EE[\covinv^{(a)}(T)], \ \ 
M\rarr\frac{1}{T} \covinv^{(a)}(T), \ \ 
U_\delta\rarr
\sqrt{\frac{1}{\delta}\frac{2}{T\gamma_\phi}(\ca+\log\log(1/\rhomin))}, \nn
\ee
and have
\be
\lambda_1^{(a)}(T) \geq \bar{\lambda}_1^{(a)}(T) - Z U_\delta,
\ee
%%%\bar{\lambda}_1\rarr \pi_{\rm min}\lambda^{(a)}_{\rm min}, %CHECKED
%%%\ \ \ \lambda_1\rarr\tilde{\lambda}^{(a)}_1. \nn %CHECKED
with probability at least $1-Z\delta$, 
where $\lambda^{(a)}_1(T)$ and $\bar{\lambda}^{(a)}_1(T)$ are the minimal eigenvalues of $\frac{1}{T}\covinv^{(a)}(T)$ and $\frac{1}{T}\EE[\covinv^{(a)}(T)]$, respectively.
Using the fact (Lemma~\ref{lemma:expected_Bzz}) that $\frac{1}{T}\EE[\covinv^{(a)}(T)]\succcurlyeq\pi_{\rm min}\Beq^{(a)}(T)$, 
or equivalently %CHECKED
$\frac{1}{T}\EE[\covinv^{(a)}(T)]=\pi_{\rm min}\Beq^{(a)}(T)+\text{PSD}$ where PSD is a positive semidefinite symmetric matrix with non-negative minimal eigenvalue, 
and applying the Weyl inequality again (as in Lemma~\ref{lemma:eigen}), 
we have %(CHECKED)
$\bar{\lambda}^{(a)}_1(T)\geq\pi_{\rm min}\lambda^{(a)}_{\rm min}(T)$, and thus,
\be\label{eq:eig_bound_cov}
\lambda_1^{(a)}(T) \geq \pi_{\rm min}\lambda_{\rm min}^{(a)}(T) - Z U_\delta.
\ee
Defining
\be\label{eq:const_B_def}
\cb^{-1}:=\pi_{\rm min} - \frac{Z}{\lambda^{(a)}_{\rm min}(T)}
\sqrt{\frac{1}{\delta} \frac{2}{T\gamma_\phi}(\ca+\log\log(1/\rhomin)) },
\ee
Eq. \eqref{eq:eig_bound_cov} takes the form of Eq. \eqref{eq:eigenvalue_bound_cov}, %CHECKED
with $\cb$ inheriting its range, as stated in Lemma~\ref{lemma:eigen_cov} above, from the range of $\delta\in(0,1)$. 
Inverting Eq. \eqref{eq:const_B_def} to express the probability $\delta_\lambda:=Z\delta$ in terms of other parameters, we recover Eq. \eqref{eq:delta_lambda_def}. %CHECKED
\end{proof}

\subsection{Final Bound on Estimator Error}
\label{app:finalbound}

In the preceding sections, we derived high-probability bounds for the empirical covariance matrix $(\covinv^{(a)})^{-1}$ and the error vector $\error^{(a)}$.
In this section, we combine these results to derive Theorem~\ref{theorem:estimator}, a high-probability upper bound on the estimator error $\hat{\mu}^{(a)} - \mustara = (\covinv^{(a)})^{-1}\error^{(a)}$:

\begin{proof} [Proof of Theorem~\ref{theorem:estimator}]
From Lemma~\ref{lemma:partial_bound}, we have $(\error^{(a)}_z/T)^2\leq \udelta^2$ -- using $\udelta^2$ as a shorthand for the right hand side of Eq. \eqref{eq:partial_bound} -- with probability at least $1-\delta$ for any $z$, and thus with probability at least $(1-\delta)^Z>1-Z\delta$ for all $z$.
Thus, renaming $\delta\rarr\delta/Z$, each element of the estimator error is upper bounded with probability at least $1-\delta$:
\be\label{eq:estimator_bound_Bsum}
|\hat{\mu}^{(a)}_z - (\mustara)_z| \leq %CHECKED
\sum_{z'}|((\covinv^{(a)})^{-1})_{zz'}|\cdot |\error^{(a)}_{z'}|
\leq %CHECKED
T\cdot\udeltaoverZ\sum_{z'}\Big|((\covinv^{(a)})^{-1})_{zz'}\Big|. 
\ee
The sum over elements $|((\covinv^{(a)})^{-1})_{zz'}|$ can be upper bounded in terms of the Frobenius norm $||(\covinv^{(a)})^{-1}||_F$,
\be
\sum_{z'}|((\covinv^{(a)})^{-1})_{zz'}| \leq %CHECKED
Z\times\max_{z,z'}|((\covinv^{(a)})^{-1})_{zz'}| \leq %CHECKED
Z\sqrt{\sum_{z,z'}|((\covinv^{(a)})^{-1})_{zz'}|^2} = %CHECKED
Z||(\covinv^{(a)})^{-1}||_F. \nn
\ee
The singular value decomposition of $(\covinv^{(a)})^{-1}$, which is symmetric and positive semidefinite, can be written $(\covinv^{(a)})^{-1} = \frac{1}{T}U_a\Lambda_a^{-1} U_a^\tp$ where $U_a$ is an orthogonal matrix and $\Lambda_a$ is the diagonal matrix whose nonzero entries are the eigenvalues of $\frac{1}{T}\covinv^{(a)}$. (Recall that the elements of the matrix $\covinv^{(a)}$ increase linearly with $T$, with $\frac{1}{T}\covinv^{(a)}$ approaching a constant matrix at large $T$.)
The Frobenius norm  %%$||M||_F:=\sqrt{\sum_{z,z'}|M_{zz'}|^2}$
of a matrix is unchanged under a (left or right) orthogonal transformation, so
\be
T\cdot||(\covinv^{(a)})^{-1}||_F = ||\Lambda_a^{-1}||_F = %CHECKED
\sqrt{\sum_z(\lambda_z^{(a)})^{-2}} \leq %CHECKED
\frac{\sqrt{Z}}{\lambda^{(a)}_1}, \nn
\ee
where $\lambda^{(a)}_1$ is the minimal eigenvalue (at time $T$) of $\frac{1}{T}\covinv^{(a)}$. 
Thus, $T\cdot\sum_{z'}|((\covinv^{(a)})^{-1})_{zz'}|\leq Z^{3/2}/\lambda_1 ^{(a)}$. %CHECKED
Substituting this into Eq. \eqref{eq:estimator_bound_Bsum} above, and recalling %% Corollary~\ref{corollary:eigen_cov} of 
Lemma~\ref{lemma:eigen_cov}, we have
\be %CHECKED
|\hat{\mu}^{(a)}_z - (\mustara)_z|\leq 
\frac{Z^{3/2}\cb}{\pi_{\rm min}\lambda^{(a)}_{\rm min}(T)}\udeltaoverZ 
\nn
\ee
with probability at least
\be %CHECKED
(1-\delta)(1-\delta_\lambda) > 1 - \delta - \delta_\lambda. \nn
\ee
With the definition of $\delta_\lambda$ in Eq. \eqref{eq:delta_lambda_def}, recalling that $\udelta^2$ refers to the upper limit in Eq. \eqref{eq:partial_bound}, and setting $\cb=2/\pi_{\rm min}$ for simplicity, we recover Theorem~\ref{theorem:estimator} %CHECK
as stated above.
\end{proof}

Note from Eq. \eqref{eq:delta_lambda_def} (with $\cb=2/\pi_{\rm min}$) that in order for the probability of the bound to become positive, the time $T$ (measured in mixing times $1/\gamma_\phi$) must exceed a minimal threshold value,
\be
T\gamma_\phi > \frac{8Z^3}{(\pi_{\rm min}\lambda^{(a)}_{\rm min})^2}(\ca+\log\log(1/\rhomin)).
\ee
Before this timescale, insufficient data can be gathered to reliably reduce the variance of the estimator. Once $T\gamma_\phi$ exceeds this threshold value, which is parametrically large in the number of latent states $Z$, the bound becomes nontrivial.
%the exponential suppression factor $e^{-\frac{1}{2}\gamma_\phi T}$ in Eq. \eqref{eq:expected_B_bound} will be vanishingly small, justifying our neglect of this contribution in Lemma~\ref{lemma:cov_chebyshev}.

\section{Derivation of Theorem \ref{theorem:regret_llTS}}
\label{app:regret}

\cyan{As outlined in the main text,} the derivation of Theorem~\ref{theorem:regret_llTS} involves (i) a generic procedure for converting bounds on empirical estimates into a regret bound for linear Thompson sampling, and (ii) application of Theorem~\ref{theorem:estimator} and related results, which bound empirical reward estimates and empirical covariance matrices in the latent bandit setting, to apply the resulting linear Thompson sampling regret bound to the latent bandit setting. This involves the following steps:
\begin{itemize}
    \item In Appendix~\ref{app:context_dist}, we define an important feature of the distribution over contexts in the linear bandit setting, which quantifies the amount of probability mass concentrated on contexts $\ctx_t$ where the reward gap between the optimal action $\argmax_a\ctx_t^\top\mustara$ and the next-best action is very small.
    \item In Appendix~\ref{app:assumptions} we state several assumptions used in our derivation, including bounds on empirical estimates which we later show to take a specific form in the case of Lemma~\ref{lemma:reduction} and Theorem~\ref{theorem:estimator} (where the linear bandit problem is obtained by reducing from the latent bandit setting, and conditioning on the true parameters $(\theta^\star,\phi^\star)$).
    \item In Appendix~\ref{app:pi_bound}, we derive (under these assumptions) a high-probability bound on the probability that linear Thompson sampling will select a suboptimal action at any time, given an observed context vector $\ctxstar$.
    \item In Appendix~\ref{app:regret_timestep} we upper bound (with high probability) the regret incurred by linear Thompson sampling at a given time, by taking an average over possible context vectors $\ctxstar$ of the mean regret incurred conditional on $\ctxstar$ (which is determined by the probability of suboptimal actions). We show that the bulk of expected regret comes from contexts $\ctxstar$ for which the best two actions have very similar expected reward.
    \item In Appendix~\ref{app:regret_linTS} we sum over timesteps to bound the cumulative regret of linear Thompson sampling. Since the regret bound at each timestep derived in Appendix~\ref{app:regret_timestep} fails with small but nonzero probability, an additional worst-case regret is incurred on timesteps when the bound fails. We optimize the probability of failure (a free parameter at each timestep) in order to tighten the bound on cumulative regret.
    We then specialize from the general case to the specific case in which the empirical estimate error decreases as $1/\sqrt{T}$, which leads to $O(\sqrt{T})$ regret.
    \item Lastly, in Appendix~\ref{app:regret_latent}, we use Theorem~\ref{theorem:estimator}, which bounds the error in empirical reward parameter estimators in the latent bandit setting (Section~\ref{sec:setting_latent}), along with a corresponding bound on empirical covariance matrices (Lemma~\ref{lemma:eigen_cov}), to apply the generic linear TS regret bound of Appendix~\ref{app:regret_linTS} to the setting in which context vectors are posterior probability vectors over a latent state undergoing Markovian state transitions, $\ctxstar_t=p^\star_t$.
\end{itemize}

We remind the reader of the linear Thompson sampling algorithm (used by L$^2$TS as a subroutine), which is the focus of our analysis in this Appendix:

% [ERROR:] \setcounter{algorithm}{0}
\begin{algorithm}[H]
    \caption*{Linear Thompson Sampling \citep{agrawal2013linTS}}\label{alg:linTS}
        \textbf{Input:}\\
        \ \ $\lambda_\mu>0,$ \ $\tilde{\sigma}_r^{(a)}>0$ for $a\in\A$\\ %% Prior variance parameters
        \ \ $\hat{\mu}^{(a)}=\bzero_{\ctxdim}$, $f^{(a)} = \bzero_{\ctxdim}$, $B^{(a)}=\lambda_\mu\bbone_{\ctxdim}$, for $a\in\A$\\
        \textbf{for }$t\leftarrow 1,2,...$\textbf{ do}\\
        \ \ Receive context $\ctxhat_t$\\ %\sim\Pct(\cdot)
        \ \ Sample $\mu^{(a)}\sim\N(\hat{\mu}^{(a)},(\tilde{\sigma}_r^{(a)})^2(B^{(a)})^{-1})$ for $a\in\A$\\
        \ \ Select action $a=\argmax_{a'}\ctxhat_t^\top\mu^{(a')}$ \\
        \ \ Observe reward $r_t$\\
        \ \ Update mean reward estimates:\\ %%and covariance: \\
        \ \ \ \ $B^{(a)} \larr B^{(a)} + \ctxhat_t \ctxhat_t^\top$, \ \ $f^{(a)} \larr f^{(a)} + \ctxhat_t r_t$ \\
        \ \ \ \ $\hat{\mu}^{(a)} = (B^{(a)})^{-1}f^{(a)} $
\end{algorithm}

\paragraph{Preliminaries.}

We will distinguish the true context $\ctxstar$ -- which determines the ground-truth mean reward, $\EE[r_t|a_t=a]=(\ctxstar_t)^\top\mustara$ -- from the context $\ctxhat$ which is accessible to the linear Thompson sampling agent. Throughout our analysis of linear Thompson sampling, we allow for error or corruption of observed contexts, $\ctxhat\neq\ctxstar$, which we assume to satisfy a bound (Assumption \ref{assumption_ctx_error_bound}). While we ultimately set $\ctxhat=\ctxstar$ when applying our analysis to the latent bandit setting, our \cyan{regret bound for linear Thompson sampling (Lemmas~\ref{lemma:regret_timestep} and \ref{lemma:regret_generic}, and Corollary~\ref{corollary:regret_generic_sqrtT})} applies more generally.

With the exception of Appendix~\ref{app:regret_latent}, we assume context feature vectors are in a $\ctxdim$-dimensional Euclidean space, $\ctxstar,\ctxhat\in\RR^\ctxdim$.
%(As such, the bulk of our analysis applies in other applications of the linear bandit setting.)
In Appendix~\ref{app:regret_latent} we specialize to our particular setting of interest, where $\ctxstar=p^\star$ is a probability vector restricted to the $(\ctxdim-1)$-dimensional simplex, and $\ctxdim=Z$ is the latent state dimensionality.

%As indicated in Algorithm~\ref{alg:linTS},
We use $\Pct$ to denote the true distribution over linear bandit context vectors at time $t$, that is, $\ctxstar_t\sim\Pct(\cdot)$, \cyan{keeping in mind that in the latent bandit setting, $\Pct$ will become the distribution over posterior probability vectors with elements $p^\star_t(z) := p(z_t=z|x_{1:t},\theta^\star,\phi^\star)$, with the context history $x_{1:t}$ being a random sequence generated from given ground-truth parameters $(\theta^\star,\phi^\star)$.}

Regarding notation, we define $\covhat^{(a)}:=(B^{(a)})^{-1}$ as the empirical covariance matrices used by linear Thompson sampling, and will assume $\tilde{\sigma}_r=1$ for simplicity. 
We use $\muhat:=\{\muhat^{(a)}\}_{a=1}^K$, $\covhat:=\{\covhat^{(a)}\}_{a=1}^K$ to collectively denote the set of action-wise estimators and action-wise covariance matrices.

\subsection{Linear bandit context distribution}
\label{app:context_dist}

% moved notes from previous version to linear-bandit-context-distribution.tex

In this section, we define an important task-relevant feature of the context distribution $\Pct$, which quantifies the likelihood of encountering contexts for which the optimal action has only marginally higher expected reward than the next-best action. Such ``adversarial'' contexts make it hard to resolve the best action, and are likely to induce suboptimal actions.
As we will see, these regions of context space contribute significantly to expected regret.

Recall that, in the linear bandit setting of Section~\ref{sec:linear}, a given context vector $\ctx_t$ determines an optimal action $a^\star_t := \argmax_a(\ctx_t)^\top\mustara$, conditional on the true reward parameters $\{\mustara\}_{a=1}^K$.
Thus, the space of context vectors may be partitioned into regions of optimality which favor different actions. (Note that in the latent bandit setting, this amounts to partitioning the simplex of probability vectors over the latent state.)

We will see that the asymptotic regret of linear Thompson sampling is controlled by the density of the context distribution near the borders of these regions of optimality, in the following way. 
We first define, for any context vector $\ctx\in\RR^\ctxdim$ and pair of actions $(a^\star,a)$, the components $\ctx_\parallel^{(a^\star,a)}\in\RR$ 
and $\ctx_\perp^{(a^\star,a)}\in\RR^\ctxdim$ 
parallel and perpendicular to the reward gap direction $\mustarastar-\mustara$, that is,
\begin{align}
    \ctx^{(a^\star,a)}_\parallel(\ctx) &:= %CHECKED
    \ctx^\tp(\mustarastar - \mustara)/||\mustarastar - \mustara||_2, \label{eq:parallel_def}
    \\
    \ctx^{(a^\star,a)}_\perp(\ctx) &:= %CHECKED
    \Pi_{a,a^\star}\ctx, \label{eq:perp_def}
\end{align}
where the projection matrix
$$ %~CHECKED
\Pi_{a,a^\star} = \bbone - \frac{(\mustarastar - \mustara)(\mustarastar - \mustara)^\top}{||\mustarastar - \mustara||_2^2}
$$
projects $\ctx$ onto the $(\ctxdim-1)$-dimensional hyperplane orthogonal to the vector difference $\mustarastar - \mustara$. 

Equivalently to Eq. \eqref{eq:parallel_def}, the difference in expected reward between actions $a^\star$ and $a$ depends (only) on the parallel component of $\ctx$,
\be\label{eq:parallel_def_v2}
\ctx^\top(\mustarastar-\mustara)=\ctx_\parallel^{(a^\star,a)}\Delta_{a^\star,a},
\ee
where
\be
\Delta_{a^\star,a} := ||\mustarastar-\mustara||_2
\ee
is the magnitude of the vector difference of reward parameters for actions $a^\star$ and $a$.
Thus, given fixed reward parameters $\mu_\star=\{\mustara\}_{a=1}^K$, the marginal distribution over the parallel component $\ctx_\parallel^{(a^\star,a)}$ of the context
%% \be
%% \int d\ctx^{(a,a')}_\perp P(\ctx), \nn %%% d^{\\ctxdim-1}
%% \ee
determines the probability distribution over the difference in expected rewards between $a^\star$ and $a$.
Its density at small $\ctx_\parallel^{(a^\star,a)}$ quantifies the probability for ``adversarial'' contexts for which the better action (between $a^\star$ and $a$) becomes impossible to resolve. 
We define this limit as
% [may be confusing, as it doesn't look symmetric:] \footnote{Since the action values $\ctx^\top\mustara$ and $\ctx^\top\mustarastar$ become identical become identical in this limit, $\rhoaa_{a^\star,a}^{(t)}$ is symmetric with respect to $a^\star$ and $a$.}
%%% along with the corresponding conditional distribution over $\ctx_\perp^{(a^\star,a)}$,
%%% (both important quantities in our regret bound)
\be\label{eq:probadv_def}
\rhoaa_{a^\star,a}^{(t)} := %CHECKED
\lim_{\eps\rarr0+}\frac{1}{\eps}
\mathbb{P}_{\ctx\sim\Pct}\(a(\ctx)=a^\star,
\ctx^\top(\mustarastar-\mustara)    <\eps||\mustarastar-\mustara||_2\).
%%% \lim_{\xparallel\rarr0+}P_\parallel^{(t)}(\xparallel|a^\star,a)\cdot\mathbb{P}(a(\ctx)=a^\star|\ctx_\parallel^{(a^\star,a)}=\xparallel).
\ee
This quantity is the probability density which the distribution $\Pct$ assigns to contexts for which action $a^\star$ is optimal, but is only infinitesimally preferred to action $a$.
Note that, since the inequality in Eq. \eqref{eq:probadv_def} can be written as $\ctx_\parallel^{(a^\star,a)}<\eps$, 
$\rhoaa_{a^\star,a}^{(t)}$ only depends on the direction of the vector difference $\mustarastar-\mustara$ (which determines $\ctx_\parallel^{(a^\star,a)}$, see Eq. \eqref{eq:parallel_def}), and not its magnitude $\Delta_{a^\star,a}$.
As mentioned above, when we interpret the context vectors $\ctx_t$ as posterior probability vectors in the latent bandit setting, $\Pct$ becomes a distribution over these posteriors, which depend on the history of observations $x_{1:t}$.
In this setting, Eq. \eqref{eq:probadv_def} can be rewritten as shown in Eq. \eqref{eq:probadv_def_latent} in the main text.

\subsection{Assumptions}
\label{app:assumptions}

%Our derivation of Theorem~\ref{theorem:regret_llTS} will, for the most part, .....

In this section we state several assumptions used throughout the derivation of Theorem~\ref{theorem:regret_llTS}.
First, we make assumptions pertaining to the boundedness of context vectors and reward parameter vectors:

\begin{assumption}
\label{assumption_mu_bound}
The Euclidean norm of any context vector $\ctxstar\sim\Pct$ (at any time $t$) is upper bounded,
$$
||\ctxstar||_2 \leq \Uctxnorm.
$$
\end{assumption}
Assumption \ref{assumption_mu_bound} is automatically satisfied with $\Uctxnorm=1$ when the contexts $\ctxstar$ are posterior probability vectors.

\begin{assumption}
\label{assumption_ctx_bound}
For all $a$, the $1$-norm of the true mean reward parameter vector $\mustara$ is upper bounded,
$$
||\mustara||_1<\Umunorm.
$$
\end{assumption}

We also make three generic assumptions on empirical estimates, which will be applied in Appendix~\ref{app:pi_bound} to quantities at a single time $t$.

\begin{assumption}
\label{assumption_mu_error_bound}
For each $(a,z)$ and for $\deltamu\in(0,1)$, the error of the $z$'th vector element of the estimator $\muhat^{(a)}$ is upper bounded,
$$|\muhat^{(a)}_z - (\mustara)_z| \leq \Umu,$$
with probability at least $1-\deltamu$.
\end{assumption}

\begin{assumption}
\label{assumption_eigen_bound}
For each $a$ and for $\deltacov\in(0,1)$, the maximal eigenvalue of the empirical covariance matrix $\covhat^{(a)}$ is upper bounded,
$$\max_z\lambda^{(a)}_z \leq (\Ucov)^2,$$
with probability at least $1-\deltacov$.
\end{assumption}

\begin{assumption}
\label{assumption_ctx_error_bound}
The $1$-norm error of the estimated context vector is upper bounded,
$$||\ctxhat-\ctxstar||_1 := \sum_z|\ctxhat_z - \ctxstar_z|\leq\Uctx,$$
with probability at least $1-\deltactx$.
\end{assumption}

In general, we allow the upper bounds in Assumptions \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound} to be unspecified functions of the bound probabilities $\delta_i$.
In Appendix~\ref{app:regret_latent}, we will use specific functional forms for these upper bounds (including time-dependence) which apply in the latent bandit setting under the conditions of Theorem~\ref{theorem:estimator}.
As noted above, in the latent bandit setting we will only consider the $\Uctx=0$ case, but our analysis of linear Thompson sampling applies more generally.

\subsection{Upper bound on suboptimal action probabilities}
\label{app:pi_bound}

In this section, we show that for linear Thompson sampling, the probability of making a suboptimal action, given a Thompson sampling distribution $\mathcal{N}(\muhat,\covhat)$, can be bounded in terms of upper bounds on the error in $\muhat$ and on the eigenvalue spectrum of $\covhat$, with suboptimal actions becoming impossible when the confidence ellipsoid determined by $\covhat$ shrinks to zero (i.e. $\Ucov\rarr0$ in Assumption \ref{assumption_eigen_bound}) and reward error approaches zero.

The action probability bound, Eq. \eqref{eq:prob_a_cond_bound_final}, is expressed in terms of a free parameter $\yy$ which we will optimize in Appendix~\ref{app:regret_timestep} in order to tighten the resulting regret bound.
Note that in Eq. \eqref{eq:prob_a_cond_bound_final}, in the limit where $\ua$ and $\ub$ (which are proportional to the upper bounds in Assumptions \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound} on estimation error and uncertainty) become very small, $\yy$ may be chosen to be very large, such that the probability of suboptimal action $a$ can be upper bounded at a very small value except for contexts for which the reward gap $\Delta_a(\ctxstar)$ is infinitesimally small.

\begin{lemma}\label{lemma:prob_action}
When Assumptions \ref{assumption_mu_bound} and \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound} are satisfied, the probability
\begin{align}
    \label{eq:pi_def}
    \pi(a|\ctxhat,\muhat,\covhat) := P(a_t=a|\ctxhat_t=\ctxhat,\muhat(t)=\muhat,\covhat(t)=\covhat)
\end{align}
of linear Thompson sampling selecting any action $a$ at any time $t$, conditional on empirical quantities $(\ctxhat,\muhat,\covhat)$ (the estimated or noisily observed context vector, the estimated reward parameters, and the empirical covariance matrix), 
with $\ctxhat-\ctxstar$ bounded by Assumption \ref{assumption_ctx_error_bound}, satisfies the upper bound
\be\label{eq:prob_a_cond_bound_final}
\pi(a|\ctxhat,\muhat,\covhat) \leq
\one\big(\Delta_a(\ctxstar)<\yy\ua+ \ub\big) + \frac{1}{2\yy}e^{-\yy^2}
\ee
for any $\yy>0$, 
with probability at least $1-\deltactx-2(\ctxdim\deltamu+\deltacov)$, where $\ctxstar$ is the true context vector (whose difference from $\ctxhat$ is bounded by Assumption \ref{assumption_ctx_error_bound}), 
\be\label{eq:Delta_a_c_def}
\Delta_a(\ctxstar) := (\ctxstar)^\top(\mu_\star^{(a(\ctxstar))}-\mustara),
\ee
is the context-dependent reward gap between action $a$ and the optimal action for context $\ctxstar$, and
\begin{align}
    \ua &:= 2\big(\Uctxnorm+\Uctx\big)\Ucov, \label{eq:u1_def} \\
    \ub &:= \Uctx||\mustarastar-\mustara||_1 + 2\ctxdim\Umu(\Uctxnorm + \Uctx). \label{eq:u2_def}
\end{align}
\end{lemma}

\begin{proof}
For Thompson sampling, the action probabilities are averages over the multivariate normal distributions\footnote{We denote the multivariate Gaussian probability distribution function with mean $\mu$ and covariance $\cov$ as $P_G(\cdot|\mu,\cov)$.} from which the action-wise reward parameters $\mu^{(a)}$ are sampled:\footnote{The second product over actions ensures that the probability for selecting action $a$ is the integrated probability mass in the space of samples $\{\mu^{(a)}\}_{a=1}^K$ for which action $a$ has the highest sampled expected reward $\ctxhat^\top\mu^{(a)}$.}
\begin{align}
    \pi(a|\ctxhat,\muhat,\covhat) = %CHECKED
    \int
    \prod_{a'} d\mu^{(a')}P_G(\mu^{(a')}|\muhat^{(a')},\covhat^{(a')})
    \cdot\prod_{a'\neq a}\one(\ctxhat^\top\mu^{(a)} - \ctxhat^\top\mu^{(a')}>0).
\end{align}
For any $a^\star\neq a$, we can replace the indicator functions for all $a'\neq a, a^\star$ with 1, resulting in the upper bound:
\begin{align}
    \label{eq:prob_a_vs_astar}
    \pi(a|\ctxhat,\muhat,\covhat)
    & \leq %CHECKED
    \int
    d\mu^{(a)}P_G(\mu^{(a)}|\muhat^{(a)},\covhat^{(a)})
    d\mu^{(a^\star)}P_G(\mu^{(a^\star)}|\muhat^{(a^\star)},\covhat^{(a^\star)}) \nn \\
    & \ \ \ \ \ \ \ \ \ \ \ \cdot\one(\ctxhat^\top\mu^{(a)} - \ctxhat^\top\mu^{(a^\star)}>0).
\end{align}
We define the difference %$\mu^{(a)}-\mu^{(a(\ctxstar))}$
in sampled reward parameters for actions $a$ and  $a^\star$ (which will be set to the optimal action $a(\ctxstar)$), shifted relative to the mean reward parameter estimators $(\muhat^{(a)},\muhat^{(a^\star)})$, as
\begin{align}
    \dmu := (\mu^{(a)} - \muhat^{(a)}) - (\mu^{(a^\star)} - \muhat^{(a^\star)}).
\end{align} 
(For simplicity, we will suppress the implicit $(a,a^\star)$-dependence of $\dmu$.)
Since the indicator function in Eq. \eqref{eq:prob_a_vs_astar} depends only on the difference
$$
\mu^{(a)}-\mu^{(a^\star)} = %CHECKED
\dmu + \muhat^{(a)}-\muhat^{(a^\star)},
$$
we can change variables from $(\mu^{(a)},\mu^{(a^\star)})$ to $(\mu^{(a)} - \mu^{(a^\star)},\mu^{(a)} + \mu^{(a^\star)})$ and integrate out the latter sum variable. 
The difference $\mu^{(a)}-\mu^{(a^\star)}$ of two independent variables $\mu^{(a)}\sim P_G(\cdot|\muhat^{(a)},\covhat^{(a)})$ and $\mu^{(a^\star)}\sim P_G(\cdot|\muhat^{(a^\star)},\covhat^{(a^\star)})$ is %CHECKED
Gaussian distributed, with mean given by the difference of the individual means and covariance given by the sum of the individual covariances, that is,
\be
\mu^{(a)}-\mu^{(a^\star)}\sim %CHECKED
P_G(\cdot|\muhat^{(a)}-\muhat^{(a^\star)},
\covhat^{(a)}+\covhat^{(a^\star)}).
\ee
In terms of the zero-mean variable $\dmu$, then, we can rewrite Eq. \eqref{eq:prob_a_vs_astar} -- for any $a^\star\neq a$ -- as 
\begin{align}
    \pi(a|\ctxhat,\muhat,\covhat) \leq %CHECKED
    \int d\dmu P_G(\dmu|\mathbf{0},\covhat^{(a)}+\covhat^{(a^\star)})
    \cdot\one(\ctxhat^\top\dmu > \ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)})). \label{eq:prob_a_integral_dmu}
\end{align}
We now introduce a free parameter $\eps>0$ and insert a factor of 
$$ %CHECKED
1=\one(\ctxhat^\top\dmu\geq \eps||\ctxhat||_2) + \one(\ctxhat^\top\dmu<\eps||\ctxhat||_2)
$$
inside the integral in Eq. \eqref{eq:prob_a_integral_dmu}.
This divides the space of samples $\dmu$ into samples which are more or less optimistic about action $a$ relative to $a^\star$ (relative to the estimated difference $\muhat^{(a^\star)}-\muhat^{(a)}$).
Thus, for any $a^\star\neq a$,
\begin{align}
    \pi(a|\ctxhat,\muhat,\covhat)
    & \leq %CHECKED
    \int d\dmu P_G(\dmu|\mathbf{0},\covhat^{(a)}+\covhat^{(a^\star)})
    \cdot\one(\ctxhat^\top\dmu > \ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)})) \nn \\
    & \ \ \ \ \ \ \ \ \times\(\one(\ctxhat^\top\dmu\geq \eps||\ctxhat||_2) + \one(\ctxhat^\top\dmu<\eps||\ctxhat||_2)\) \nn \\
    & \leq %CHECKED
    \mathbb{P}\Big(\ctxhat^\top\dmu\geq \eps||\ctxhat||_2 \Big| \dmu\sim\N(\mathbf{0},\covhat^{(a)}+\covhat^{(a^\star)}) \Big) + \one(\eps||\ctxhat||_2>\ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)})) \label{eq:prob_a_cond_event_bound}
\end{align}
In the first term we've used $\one(\ctxhat^\top\dmu > \ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)}))\leq1$, and in the second term 
%When $\ctxhat^\top\dmu<\eps||\ctxhat||_2$ 
%%-- that is, when Thompson sampling is [LESS] optimistic towards $a^\star$ relative to $a$ -- 
we've used the fact that, on the region $\ctxhat^\top\dmu<\eps||\ctxhat||_2$ selected by the second indicator, the first indicator function is upper bounded as
$$ %CHECKED
\one(\ctxhat^\top\dmu > \ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)}))
\leq\one(\eps||\ctxhat||_2 > \ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)})),
$$
and taken this upper bound (which no longer depends on $\dmu$) outside the integral, which is itself upper bounded by 1.

We will now use Assumption \ref{assumption_eigen_bound} to derive an upper bound on the first term in Eq. \eqref{eq:prob_a_cond_event_bound}, and Assumptions \ref{assumption_mu_error_bound} and \ref{assumption_ctx_error_bound} to correspondingly bound the second term.

\textit{Upper bound on the first term in Eq. \eqref{eq:prob_a_cond_event_bound}.}

Eq. \eqref{eq:tailbound} from Appendix~\ref{app:tailbound} gives an upper bound on the probability mass in the tail of a Gaussian distribution, which we use to upper bound the first term in \eqref{eq:prob_a_cond_event_bound}. 
Recalling that the inner product of a Gaussian random vector $\dmu\sim\N(\mathbf{0},\cov)$ with any vector $\ctx$ is a Gaussian variable with mean zero and variance $\ctx^\top\cov\ctx$, %(CHECKED)
Eq. \eqref{eq:tailbound} yields (for any $a^\star\neq a$)
\be\label{eq:prob_outlier_sample_bound}
\mathbb{P}\Big(\ctxhat^\top\dmu\geq \eps||\ctxhat||_2 \Big| \dmu\sim\N(\mathbf{0},\covhat^{(a)}+\covhat^{(a^\star)}) \Big) \leq %CHECKED
\frac{1}{\sqrt{2}\eps}
\sigma(\ctxhat) \exp \big[ -\eps^2/2\sigma^2(\ctxhat) \big],
\ee
where the variance
\be
\sigma^2(\ctxhat) := ||\ctxhat||_2^{-2} \ctxhat^\top(\covhat^{(a)}+\covhat^{(a^\star)})\ctxhat
\ee
depends on the estimated context $\ctxhat$.
To simplify the expectation over contexts required to compute regret, we upper bound $\sigma^2(\ctxhat)$ in terms of the eigenvalues of the empirical covariance matrices $\{\covhat^{(a)}\}$.
Defining $\lambda^{(a,a^\star)}_z$ as the $z$'th eigenvalue of the covariance matrix $\covhat^{(a)}+\covhat^{(a^\star)}$, the variance $\sigma^2(\ctxhat)$ along any direction of the confidence ellipsoid specified by the same covariance matrix satisfies the upper bound
\be %~CHECKED (clarify for readers?)
\sigma^2(\ctxhat) \leq \max_z \lambda^{(a,a^\star)}_z.
\ee
Furthermore, by the Weyl inequality (and since the matrices $\covhat^{(a)}$ are real, symmetric, and positive definite), %CHECKED -- according to https://math.stackexchange.com/questions/2053767/bounds-for-sum-of-the-largest-eigenvalues-of-two-matrices/2053786
$$
\max_z \lambda^{(a,a^\star)}_z \leq \max_z \lambda_z^{(a)} + \max_z \lambda_z^{(a^\star)}.
$$
Assumption \ref{assumption_eigen_bound} states that $\max_z \lambda_z^{(a)}, \ \max_z \lambda_z^{(a^\star)}\leq(\Ucov)^2$ with probability at least $(1-\deltacov)^2>1-2\deltacov$, %CHECKED
and consequently,
\be %CHECKED
\sigma^2(\ctxhat) \leq 2(\Ucov)^2
\ee
with the same probability. %CHECKED
Therefore, since the upper bound in Eq. \eqref{eq:prob_outlier_sample_bound} increases monotonically with $\sigma(\ctxhat)$, we have, for any $\ctxhat$ and $a^\star\neq a$, and with probability at least $1-2\deltacov$,
\begin{align}
    \mathbb{P}\Big(\ctxhat^\top\dmu\geq \eps||\ctxhat||_2 \Big| \dmu\sim\N(\mathbf{0},\covhat^{(a)}+\covhat^{(a^\star)}) \Big)
    \leq %CHECKED
    \frac{\Ucov}{\eps}
    \exp \big[ -\eps^2/(2\Ucov)^2\big]
    = \frac{1}{2\yy} e^{-\yy^2},
    \label{eq:prob_outlier_sample_bound_eigen}
\end{align}
where 
\be\label{eq:yy_def}
\yy := \frac{\eps}{2\Ucov},
%\eps\frac{1}{\sqrt{2\ctxhat^\top(\covhat^{(a)}+\covhat^{(a^\star)})\ctxhat}},
\ee
is a rescaled version of the free parameter $\eps$.

\textit{Upper bound on the second term in Eq. \eqref{eq:prob_a_cond_event_bound}.}

Applying Assumption \ref{assumption_mu_error_bound} to each element of the vector estimator $\muhat^{(a)}$, we have $||\muhat^{(a)} - \mustara||_1\leq \ctxdim\cdot\Umu$ with probability at least $(1-\deltamu)^\ctxdim>1-\ctxdim\deltamu$. %CHECKED
Applying this bound for both action $a$ and $a^\star$, we have
\begin{align}
    \ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)}) &= %CHECKED
    \ctxhat^\top(\mustarastar-\mustara) + \ctxhat^\top(\muhat^{(a^\star)} - \mustarastar) + \ctxhat^\top(\mustara - \muhat^{(a)}) \nn \\
    & \geq %CHECKED
    \ctxhat^\top(\mustarastar-\mustara) - ||\ctxhat||_1 ||\muhat^{(a^\star)} - \mustarastar||_1 - ||\ctxhat||_1 ||\muhat^{(a)} - \mustara||_1 \nn \\
    & \geq %CHECKED
    \ctxhat^\top(\mustarastar-\mustara) - 2\ctxdim||\ctxhat||_1 \Umu
\end{align}
with probability at least $(1-\ctxdim\deltamu)^2\geq 1-2\ctxdim\deltamu$ %CHECKED
(for any $a,a^\star\neq a$). It follows that
\begin{align}\label{eq:bound_ctxhat}
    \one(\eps||\ctxhat||_2>\ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)}))
    \leq %CHECKED
    \one(\eps||\ctxhat||_2>\ctxhat^\top(\mustarastar-\mustara) - 2\ctxdim||\ctxhat||_1 \Umu )
\end{align}
with the same probability.
We will now apply Assumption \ref{assumption_ctx_error_bound} to bound the deviation of the estimated context $\ctxhat$ from the true context $\ctxstar$.
Recalling the shorthand notation $\Delta_a(\ctxstar) := (\ctxstar)^\top(\mu_\star^{(a(\ctxstar))}-\mustara)$ of Eq. \eqref{eq:Delta_a_c_def}, 
and recalling that $a^\star=a(\ctxstar)$ is enforced in Eq. \eqref{eq:regret_timestep_1},
we have
$$
\ctxhat^\top(\mustarastar-\mustara) = %CHECKED
\Delta_a(\ctxstar) + \(\ctxhat - \ctxstar\)^\top(\mustarastar-\mustara).
$$
Applying Assumption \ref{assumption_ctx_error_bound} to bound the error $\ctxhat-\ctxstar$, we have
$$
|(\ctxhat-\ctxstar)^\top(\mustarastar-\mustara)|
\leq %CHECKED
||\ctxhat-\ctxstar||_1 ||\mustarastar-\mustara||_1
\leq %CHECKED
\Uctx ||\mustarastar-\mustara||_1
$$
with probability at least $1-\deltactx$. %CHECKED
Consequently, $\ctxhat^\top(\mustarastar-\mustara)\geq\Delta_a(\ctxstar)-\Uctx ||\mustarastar-\mustara||_1$ %CHECKED
with the same probability, %CHECKED
and thus
\begin{align}
    & \one\big(\eps||\ctxhat||_2>
    \ctxhat^\top(\mustarastar-\mustara) - 2\ctxdim||\ctxhat||_1 \Umu\big) \nn \\
    & \leq %CHECKED
    \one\big(\eps||\ctxhat||_2>
    \Delta_a(\ctxstar) - \Uctx||\mustarastar-\mustara||_1 - 2\ctxdim||\ctxhat||_1\Umu\big) \nn \\
    & \leq %CHECKED
    \one\big(\Delta_a(\ctxstar) < \eps(||\ctxstar||_1+\Uctx) + \Uctx||\mustarastar-\mustara||_1 + 2\ctxdim\Umu(||\ctxstar||_1 + \Uctx) \big), \label{eq:gap_error_bound}
\end{align}
with probability at least $1-\deltactx$.
In the last line, we have again used Assumption \ref{assumption_ctx_error_bound} to exchange $\ctxhat$ in favor of $\ctxstar$, by using
$$ %CHECKED
||\ctxhat||_2 \leq ||\ctxhat||_1 = ||\ctxstar + (\ctxhat-\ctxstar)||_1 \leq ||\ctxstar||_1 + ||\ctxhat-\ctxstar||_1 \leq ||\ctxstar||_1 + \Uctx
$$
in the first term, and similarly
$||\ctxhat||_1 \leq ||\ctxstar||_1 + \Uctx$ %CHECKED
in the last term.
% no extra reduction in probability since we already assumed the bound holds with prob 1-d3
Lastly, we use Assumption \ref{assumption_mu_bound} to upper bound $||\ctxstar||_1$ in Eq. \eqref{eq:gap_error_bound}. Combining Eq. \eqref{eq:gap_error_bound} with Eq. \eqref{eq:bound_ctxhat}, we then have
\be
\one(\eps||\ctxhat||_2>\ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)}))
\leq %CHECKED
\one\big(\Delta_a(\ctxstar) < \eps(\Uctxnorm+\Uctx) + \Uctx||\mustarastar-\mustara||_1 + 2\ctxdim\Umu(\Uctxnorm + \Uctx) \big), \label{eq:second_term_bound}
\ee
with probability at least %CHECKED
$(1-\deltactx)(1-2\ctxdim\deltamu)\geq 1-\deltactx-2\ctxdim\deltamu$.
To simplify the expression, we introduce the variables $\ua$ and $\ub$ defined above in Eqs. \eqref{eq:u1_def}-\eqref{eq:u2_def}, which summarize the influence of the error bounds from Assumptions \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound}, 
and write Eq. \eqref{eq:second_term_bound} as
\be
\one(\eps||\ctxhat||_2>\ctxhat^\top(\muhat^{(a^\star)}-\muhat^{(a)}))
\leq %CHECKED
\one(\Delta_a(\ctxstar) < \yy\ua + \ub), \label{eq:second_term_bound_short}
\ee
again with probability at least $1-\deltactx-2\ctxdim\deltamu$, 
where $\yy$ was defined in Eq. \eqref{eq:yy_def}.
Finally, combining Eqs. \eqref{eq:prob_outlier_sample_bound_eigen} and \eqref{eq:second_term_bound_short} to upper bound (respectively) the first and second terms in Eq. \eqref{eq:prob_a_cond_event_bound}, we arrive at the final high-probability bound on action probabilities, Eq. \eqref{eq:prob_a_cond_bound_final}, with the probability of the bound obtained by combining the probabilities of Eqs. \eqref{eq:prob_outlier_sample_bound_eigen} and \eqref{eq:second_term_bound_short}.
\end{proof}

\subsection{Instantaneous regret bound}
\label{app:regret_timestep}

The suboptimal action probability bound, Lemma~\ref{lemma:prob_action}, conditions on a particular context vector $\ctxhat$, which is approximately equal to the true context $\ctxstar$ (with the difference bounded by Assumption~\ref{assumption_ctx_error_bound}).
We now take an expectation over the context distribution $\Pct$ from which $\ctxstar$ is generated at time $t$, in order to extend Lemma~\ref{lemma:prob_action} into a corresponding high-probability bound on the expected regret incurred at time $t$.

\begin{lemma}\label{lemma:regret_timestep}
When Assumptions \ref{assumption_mu_bound} and \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound} are satisfied, and furthermore when 
\be\label{eq:uu_bound}
(\Ucov)^2 < \frac{1}{8e}\frac{\Delta_{a^\star,a}^2}{\Uctxnorm\rhoaa_{a^\star,a}^{(t)}}
\ee
the expected regret incurred by linear Thompson sampling at a single timestep %averaged over actions but conditioned on model estimates $(\ctxhat,\muhat,\covhat)$, 
is upper bounded,
\begin{align}\label{eq:regret_bound_timestep}
\dR^{(t)}%(\ctxhat,\muhat,\covhat) 
\leq %CHECKED
\sum_{a^\star,a} \frac{\rhoaa_{a^\star,a}^{(t)}}{\Delta_{a^\star,a}} \Big( \big(||\mustarastar-\mustara||_1\Uctx + 2\ctxdim\Uctxnorm\Umu\big)^2 + 8\Uctxnorm^2\big(\Ucov\big)^2\log\zeta\Big) + O(\Uerror^3)
\end{align}
with probability at least $1 - 2(\ctxdim\deltamu+\deltacov) - \deltactx$, 
where
\be\label{eq:zeta_def}
\zeta :=
\frac{\Delta_{a^\star,a}^2}{2\rhoaa_{a^\star,a}^{(t)}}\frac{1}{\Uctxnorm(2\Ucov)^2},
\ee
and where $O(\Uerror^3)$ denotes contributions which scale cubically or higher with the upper bounds $\Umu, \ \Ucov, \ \Uctx$ on estimation errors.
\end{lemma}

\begin{proof}
The instantaneous or per-timestep expected regret incurred by selecting action $a_t$ -- averaged over possible ground-truth context vectors $\ctxstar$ (and $\ctxhat\approx\ctxstar$ up to error bounded by Assumption~\ref{assumption_ctx_error_bound}) and actions $a$, but conditioned on the empirical estimates $(\muhat,\covhat)$ -- is
\begin{align}
    \dR^{(t)}(\muhat,\covhat)
    &= %CHECKED
    \sum_a \EE_{\ctxstar\sim\Pct}[(\ctxstar)^\top(\mu_\star^{(a(\ctxstar))} - \mustara)\pi(a|\ctxhat,\muhat,\covhat)] \\ 
    &= %CHECKED
    \sum_{a^\star,a}
    \dR^{(t)}_{a^\star,a}(\muhat,\covhat),
    \label{eq:regret_timestep_pairwise}
\end{align}
where
\be\label{eq:regret_timestep_1}
\dR^{(t)}_{a^\star,a}(\muhat,\covhat)
:= \EE_{\ctxstar\sim\Pct}
\big[\one(a(\ctxstar)=a^\star)
\Delta_a(\ctxstar)
%\cdot(\ctxstar)^\top(\mu_\star^{(a^\star)} - \mustara)
\pi(a|\ctxhat,\muhat,\covhat)\big]
\ee
is the pair-wise expected regret incurred due to taking action $a$ when $a^\star$ is optimal, 
and we have used the definition of the reward gap $\Delta_a(\ctx)$ in Eq. \eqref{eq:Delta_a_c_def}.
Using Lemma~\ref{lemma:prob_action} to upper bound the action probability $\pi(a|\ctxhat,\muhat,\covhat)$ in Eq. \eqref{eq:regret_timestep_1}, the action pair-wise regret $\dR^{(t)}_{a^\star,a}$ satisfies the upper bound
\be\label{eq:Raa_bound_twoterms}
\dR^{(t)}_{a^\star,a}%(\ctxhat,\muhat,\covhat) 
\leq %CHECKED
\EE_{\ctxstar\sim\Pct}
\Big[\one(a(\ctxstar)=a^\star)
\Delta_a(\ctxstar)
\Big(\one\big(\Delta_a(\ctxstar)<\yy\ua+ \ub\big) + \frac{1}{2\yy}e^{-\yy^2} \Big)
\Big]
\ee
for any $\yy>0$, with probability at least $1-\deltaR$, where
\be\label{eq:delta_regret}
\deltaR:=2(\ctxdim\deltamu+\deltacov)+\deltactx.
\ee
\cyan{We have removed the arguments $(\muhat,\covhat)$ of regret, since the upper bound holds for any values of these arguments, and thus also bounds the expected regret, averaged over possible realizations of these estimators, $\dR^{(t)}_{a^\star,a}=\EE_{\muhat,\covhat}[\dR^{(t)}_{a^\star,a}(\muhat,\covhat)]$.}

\textit{Asymptotic limit of small errors.}

In the asymptotic, large-$T$ limit, we expect that $\ua$ and $\ub$ -- which scale linearly with the upper bounds on the errors $(\Umu,\Ucov,\Uctx)$ in $(\muhat,\covhat,\ctxhat)$ in Assumptions \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound} -- will converge towards zero. In this regime, the indicator function in the first term in Eq. \eqref{eq:Raa_bound_twoterms} will only be nonzero when the context-dependent reward gap between actions $a$ and $a^\star$ is very small, making it difficult to resolve the better action.
Defining $\epsbar:=\yy\ua+ \ub$ for brevity, the first term in Eq. \eqref{eq:Raa_bound_twoterms} can be evaluated as follows, by expressing the expectation over $\ctxstar$ as an integral over the parallel component $\ctx^{(a^\star,a)}_\parallel\sim P_\parallel^{(t)}(\cdot|a^\star,a)$
% (we don't need this:) \ctx^{(a^\star,a)}_\perp
introduced above in \eqref{eq:parallel_def}:
%renaming $\ctx_\parallel\rarr\xparallel$ to lighten notation
\begin{align}
    &\EE_{\ctx\sim\Pct}
    \Big[\one(a(\ctx)=a^\star)\Delta_a(\ctx)\cdot\one\big(\Delta_a(\ctx)<\epsbar\big) \Big]
    = %CHECKED
    \int_0^{\epsbar/\Delta_{a^\star,a}} d\xparallel P_\parallel^{(t)}(\xparallel|a^\star,a) x\Delta_{a^\star,a}
    \mathbb{P}(a(\ctx)=a^\star|\ctx_\parallel^{(a^\star,a)}=\xparallel) \nn \\
    & = %CHECKED
    \Delta_{a^\star,a}\int_0^{\epsbar/\Delta_{a^\star,a}} d\xparallel
    \cdot x\times\[\lim_{\xparallel\rarr0+}P_\parallel^{(t)}(\xparallel|a^\star,a)
    \cdot\mathbb{P}(a(\ctx)=a^\star|\ctx_\parallel^{(a^\star,a)}=\xparallel) 
    + O(\epsbar)\] \nn \\
    & = %CHECKED
    \frac{\epsbar^2}{2\Delta_{a^\star,a}} \[\frac{1}{\epsbar}\int_0^{\epsbar} dx P_\parallel^{(t)}(x|a^\star,a)
    \cdot\mathbb{P}(a(\ctx)=a^\star|\ctx_\parallel^{(a^\star,a)}=\xparallel) 
    + O(\epsbar) \] \nn \\
    & = %CHECKED
    \frac{\epsbar^2}{2\Delta_{a^\star,a}} \[\frac{1}{\epsbar}
    \mathbb{P}_{\ctx\sim\Pct}\(a(\ctx)=a^\star,\ctx_\parallel^{(a^\star,a)}<\epsbar\) + O(\epsbar) \]
\end{align}
In the first line, we have conditioned on the event ($a(\ctxstar)=a^\star$) that $a^\star$ is optimal by restricting $\xparallel\sim P_\parallel^{(t)}(\cdot|a^\star,a)$ to be positive.\footnote{Recall from Eq. \eqref{eq:parallel_def} that the sign of $\ctx_\parallel^{(a^\star,a)}$ specifies whether or not action $a^\star$ is preferred to $a$.}
In the second line, since $\xparallel<\epsbar/\Delta_{a^\star,a}$, we have written the integrand as its limit as the parallel component approaches zero, up to $O(\epsbar)$ corrections.
In the third line, we have evaluated the integral over $x$ and rewritten the limiting quantity in brackets as an integral, which is exact up to an additional $O(\epsbar)$ correction.
In the last line, we have rewritten the integral over the product of the marginal and conditional distributions as a joint probability.
In the $\epsbar\rarr0$ limit, recalling the definition of $\ctx_\parallel^{(a^\star,a)}$ in Eqs. \eqref{eq:parallel_def}-\eqref{eq:parallel_def_v2}, this final quantity in brackets is the limiting pairwise probability density $\rhoaa_{a^\star,a}^{(t)}$ defined above in Eq. \eqref{eq:probadv_def}.
Therefore, up to an additional $O(\epsbar)$ correction due to the change in this quantity away from its limit as $\epsbar\rarr0$, we have
\be
\EE_{\ctx\sim\Pct}
\Big[\one(a(\ctx)=a^\star)\Delta_a(\ctx)\cdot\one\big(\Delta_a(\ctx)<\epsbar\big) \Big]
= \frac{\rhoaa_{a^\star,a}^{(t)}}{2\Delta_{a^\star,a}}\epsbar^2 + O(\epsbar^3).
\label{eq:Raa_bound_firstterm}
\ee
While this limiting form obviously fails for large $\epsbar$, at late times we expect the error bounds $(\Umu,\Ucov,\Uctx)$ to become tight, and hence $\epsbar$ to approach zero.

The expectation in the second term in Eq. \eqref{eq:Raa_bound_twoterms} can also be upper bounded,
\be
\EE_{\ctxstar\sim\Pct}[\one(a(\ctxstar)=a^\star)\Delta_a(\ctxstar)]
\leq %CHECKED
\Delta_{a^\star,a}\cdot\EE_{\ctxstar\sim\Pct}[||\ctxstar||_2] \leq \Uctxnorm\Delta_{a^\star,a}. \label{eq:Raa_bound_secondterm}
\ee
Here, we have used $\one(a(\ctxstar)=a^\star)\leq1$, set $a(\ctxstar)=a^\star$, upper bounded the vector inner product $\Delta_a(\ctxstar)$ in terms of the Euclidean norms $||\ctxstar||_2$ and $||\mustarastar-\mustara||_2 = \Delta_{a^\star,a}$, used $||\ctxstar||_2\leq||\ctxstar||_1$, and used Assumption \ref{assumption_mu_bound} again.

Applying Eq. \eqref{eq:Raa_bound_firstterm} and Eq. \eqref{eq:Raa_bound_secondterm} in Eq. \eqref{eq:Raa_bound_twoterms}, $\dR_{a^\star,a}$ can be upper bounded as follows:
\be
\dR^{(t)}_{a^\star,a}%(\ctxhat,\muhat,\covhat)
\leq %CHECKED
\frac{\rhoaa_{a^\star,a}^{(t)}}{2\Delta_{a^\star,a}}(\yy\ua+ \ub)^2
+ \Uctxnorm\Delta_{a^\star,a} \frac{1}{2\yy}e^{-\yy^2}
+ O(\Uerror^3),
\ee
with probability at least $1-\deltaR$ %CHECKED
as specified in Eq. \eqref{eq:delta_regret}, where $O(\Uerror^3)$ indicates contributions which scale with the cube of the error bounds $(\Umu,\Ucov,\Uctx)$. This is in contrast with the leading terms, which scale quadratically with $\ua$ and $\ub$ and hence\footnote{Below, we will set the free parameter $\yy$ to an optimal value which scales only logarithmically with the error bounds.} with the error bounds. In the limit where all errors become very small, the $O(\Uerror^3)$ contribution will become negligible compared to the leading terms.

\textit{Optimization of the free parameter $\yy$.}

We are now in a position to optimize the free parameter $\yy$.
Noting that 
$$ %CHECKED
(\yy\ua + \ub)^2 \leq 2(\yy^2\ua^2 + \ub^2),
$$
and defining 
\be
\uu := \frac{2\rhoaa_{a^\star,a}^{(t)}}{\Delta_{a^\star,a}^2\Uctxnorm}\ua^2, \ \ \ \
\vv := \frac{1}{2}\Uctxnorm\Delta_{a^\star,a}
\ee
to simplify notation, we have
\be\label{eq:regret_optimize_yy}
\dR^{(t)}_{a^\star,a}%(\ctxhat,\muhat,\covhat) 
\leq
\frac{\rhoaa_{a^\star,a}^{(t)}}{\Delta_{a^\star,a}}\ub^2 + \vv\(\uu\yy^2 + \yy^{-1}e^{-\yy^2} \) + O(\Uerror^3)
\ee
Defining a rescaled variable $\tilde{\yy}:=\uu e^{\yy^2}$, the second term is
\be %CHECKED
\uu\vv\times\Big(\log(\tilde{\yy}/\uu) + \frac{1}{\tilde{\yy}\sqrt{\log(\tilde{\yy}/\uu)}} \Big). \nn
\ee
Setting $\tilde{\yy}=1$ for simplicity, so that $\yy^2=\log(1/\uu)$, it is straightforward to check that the first term in the parentheses, $\log(1/\uu)$, is larger than the second, $1/\sqrt{\log(1/\uu)}$, as long as $\uu<1/e\approx0.368$.
Under this assumption and with this choice of $\yy$, then, the second term in Eq. \eqref{eq:regret_optimize_yy} is 
$\leq2\uu\vv\log(1/\uu)$, %CHECKED
and hence
\be
\dR^{(t)}_{a^\star,a}
\leq %CHECKED
\frac{\rhoaa_{a^\star,a}^{(t)}}{\Delta_{a^\star,a}}
\(\ub^2 + 2\ua^2\log \(
\frac{\Delta_{a^\star,a}^2}{2\rhoaa_{a^\star,a}^{(t)}}\frac{\Uctxnorm}{\ua^2}
\) \) + O(\Uerror^3)
\ee
Finally, absorbing the terms in $\ua$ and $\ub$ -- see Eqs. \eqref{eq:u1_def}-\eqref{eq:u2_def} -- which scale quadratically with the error bounds $(\Umu,\Ucov,\Uctx)$ into the subleading $O(\Uerror^3)$ contribution, 
and summing over actions as in Eq. \eqref{eq:regret_timestep_pairwise}, 
we arrive at the final form of the instantaneous regret bound, Eq. \eqref{eq:regret_bound_timestep}.
%% [REMOVING THIS, SINCE THE BOUND IS STRONGER WHEN CONDITIONING ON A (PARTIAL) HISTORY] There, we have removed the arguments $(\ctxhat,\muhat,\covhat)$, since the upper bound does not depend on them, making it trivial to take the expectation over the history-dependent estimates $(\ctxhat,\muhat,\covhat)$ and apply the bound to the expected regret ...
(The condition that $\uu<1/e$ is given in Eq. \eqref{eq:uu_bound}, again with the higher order term in $\ua$ removed.)
\end{proof} %\square

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\iffalse
\cyan{[(optional) LinTS i.i.d. theorem]}
%\subsection{Error bounds: I.I.D. case}
%\cyan{[Retitle? move this subsection?]}
%$\ctx_i\ctx_j$
When Assumptions \ref{assumption_mu_bound} and \ref{assumption_mu_error_bound}-\ref{assumption_ctx_error_bound} are satisfied with estimation error upper bounds at time $t$ of the form
\begin{align}
...
\end{align}
for some $t_0\geq1$, \cyan{[do we need $t_0$ large to ensure $\uu<1/e$ condition of instantaneous regret?]}
and with
\begin{align}
    \deltamu(t) &= \deltamu(t_0)(t_0/t)^\Uexponent \\
    \deltacov(t) &= \deltacov(t_0)(t_0/t)^\Uexponent \\
    \deltactx(t) &= \deltactx(t_0)(t_0/t)^\Uexponent \\
\end{align}
for some $\Uexponent\geq1/2$ and 
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Regret bound for Linear Thompson Sampling}
\label{app:regret_linTS}

We now sum over timesteps in order to extend the per-timestep regret bound from the previous section into a cumulative regret bound.
In the following Lemma, we assume a generic form for the per-timestep bound, which will be partially specified in the subsequent Corollary, and fully specified using Lemma~\ref{lemma:regret_timestep} above in the following section.

\newcommand{\powtime}{\nu_1}
\newcommand{\powdelta}{\nu_2}
\newcommand{\powschedule}{\nu_\delta}

\begin{lemma}\label{lemma:regret_generic}
When Assumption \ref{assumption_mu_bound} is satisfied (such that $||\ctxstar||_2\leq\Uctxnorm$), 
and when the per-timestep regret at any time $t$ for a given algorithm satisfies the upper bound
\be\label{eq:regret_timestep_generic}
\dR^{(t)} \leq \frac{\UR}{t^{\powtime} \delta_t^{\powdelta}}
\ee
with probability at least $1-\delta_t$, for any $\delta_t\in(0,1)$ and for \cyan{$\powtime\in(0,1],\powdelta\in[0,1]$ with $1-\powtime/(1+\powdelta)>0$}, 
the corresponding cumulative regret $\R(T) = \sum_{t=1}^T\dR^{(t)}$ satisfies the upper bound
\be\label{eq:regret_generic}
\R(T) \leq %CHECKED
\UR^{1/(1+\powdelta)} (\Uctxnorm\cdot\max_{a^\star,a}\Delta_{a^\star,a})^{\powdelta/(1+\powdelta)}
% \cyan{\powdelta^{(...)}}
% ^^ upper bounded at 1 since we assume \powdelta\leq1
\frac{1+\powdelta}{1+\powdelta-\powtime}T^{1-\powtime/(1+\powdelta)}.
\ee
%where
%\Ugap := ... %\big( \EE[r_t|a_t=a^\star_t] - \EE[r_t|a_t=a] \big)
%is the maximum possible reward gap
\end{lemma}
\begin{proof}
At any given time, the per-timestep regret satisfies (with probability $1$) the bound
\be %CHECKED
\dR^{(t)} \leq \frac{\UR}{t^{\powtime} \delta_t^{\powdelta}} + \delta_t \times \max_{\ctxstar} \max_{a,a^\star}|(\ctxstar)^\top(\mustarastar-\mustara)|
\ee
for some $\powtime,\powdelta>0$. 
The second term conservatively bounds the worst-case regret incurred when the high-probability bound, Eq. \eqref{eq:regret_timestep_generic}, fails with probability $\leq\delta_t$.
We choose a power-law time schedule for the bound probability parameter $\delta_t$,
$$
\delta_t = \delta_0 / t^{\powschedule},
$$
with free parameters $\delta_0>0$ and $\powschedule>0$.
With this schedule, and using Assumption \ref{assumption_mu_bound} to bound $|(\ctxstar)^\top(\mustarastar-\mustara)|\leq\Uctxnorm||\mustarastar-\mustara||_2=\Uctxnorm\Delta_{a^\star,a}$, %CHECKED
we have
$$ %CHECKED
\dR^{(t)} \leq \frac{\UR}{\delta_0^{\powdelta}} t^{\powdelta\powschedule-\powtime} + \delta_0 t^{-\powschedule}\Uctxnorm\cdot\max_{a^\star,a}\Delta_{a^\star,a}.
$$
The sum over timesteps can be bounded with the continuous integral, $\sum_{t=1}^T t^{-\nu} \leq \int_0^T t^{-\nu} dt = \frac{1}{1-\nu} T^{1-\nu}$, as long as $\nu\in(0,1)$. %CHECKED % and $\nu\neq1$.
Assuming, then, that $\powschedule\in(0,1)$, $\powtime-\powdelta\powschedule\in(0,1)$,
we have
\be
\R(T) = \sum_{t=1}^T \dR^{(t)} \leq %CHECKED
\frac{\UR}{\delta_0^{\powdelta}(1-(\powtime-\powdelta\powschedule))} T^{1-(\powtime-\powdelta\powschedule)} + \frac{\delta_0}{1-\powschedule} T^{1-\powschedule}\Uctxnorm\cdot\max_{a^\star,a}\Delta_{a^\star,a}.
\ee
The free parameter $\powschedule$ controls the tradeoff between the growth rates in time of the two terms.
Equating these exponents,
$$
1-(\powtime-\powdelta\powschedule) = 1-\powschedule,
$$
leads to $\powschedule=\powtime/(\powdelta+1)\in(0,1)$.
Consequently,
\be\label{eq:regret_generic_1}
\R(T) = \sum_{t=1}^T \dR^{(t)} \leq %CHECKED
\(\frac{\UR}{\delta_0^{\powdelta}} + \delta_0 \Uctxnorm\cdot\max_{a^\star,a}\Delta_{a^\star,a}\)
\frac{1+\powdelta}{1+\powdelta-\powtime}T^{1-\powtime/(1+\powdelta)}.
\ee
The free parameter $\delta_0$ can be optimized by setting its derivative to zero, which yields
$$
\delta_0 = %CHECKED
\(\frac{\UR\powdelta}{\Uctxnorm\max_{a^\star,a}\Delta_{a^\star,a}} \)^{1/(\powdelta+1)}.
$$
Using this value of $\delta_0$ in Eq. \eqref{eq:regret_generic_1}, along with the assumed condition that $\powdelta\leq1$, we recover Eq. \eqref{eq:regret_generic}.
\end{proof}

While Eq. \eqref{eq:regret_timestep_generic} allows for a generic power-law time-dependence of the per-timestep regret bound and its probability of failure $\delta_t$, in practice the exponents $(\powtime,\powdelta)$ will take specific values. In particular, in the limit of approximately i.i.d. reward data, the error $||\muhat^{(a)}-\mustara||$ in reward estimators decreases as $1/\sqrt{t}$, and can be bounded (e.g. as shown in Appendix~\ref{app:estimator_bound}) for any reward distribution using Chebyshev's inequality, resulting in the following specific case of Lemma~\ref{lemma:regret_generic}:

\begin{corollary}\label{corollary:regret_generic_sqrtT}
When Assumption \ref{assumption_mu_bound} is satisfied,  
and when the per-timestep regret at any time $t$ for a given algorithm satisfies the upper bound
\be\label{eq:regret_timestep_sqrtT}
\dR^{(t)} \leq \frac{\UR}{t\cdot\delta_t}
\ee
with probability at least $1-\delta_t$, for any $\delta_t\in(0,1)$, 
the cumulative regret $\R(T) = \sum_{t=1}^T\dR^{(t)}$ satisfies the upper bound
\be\label{eq:regret_generic_sqrtT}
\R(T) \leq %CHECKED
2\(\UR\cdot\Uctxnorm\cdot\max_{a^\star,a}\Delta_{a^\star,a}\)^{1/2}
T^{1/2}.
\ee
\end{corollary}
\begin{proof}
Eq. \eqref{eq:regret_generic_sqrtT} is the special case of Eq. \eqref{eq:regret_generic} for $\powtime=\powdelta=1$.
\end{proof}

\subsection{Regret bound for latent linear Thompson sampling}
\label{app:regret_latent}

We are now in a position to apply the cumulative regret bound of the previous section, along with the specific form of the per-timestep regret for linear Thompson sampling from Lemma~\ref{lemma:regret_timestep} and the latent bandit error bound, Theorem~\ref{theorem:estimator}, to finally derive Theorem~\ref{theorem:regret_llTS}:

\textit{Proof of Theorem~\ref{theorem:regret_llTS}.}
In the latent bandit setting of Section~\ref{sec:setting_latent}, 
Theorem~\ref{theorem:estimator} guarantees that Assumption \ref{assumption_mu_error_bound} is satisfied, at time $t$, with
\be\label{eq:Umu_latent}
\Umu = %CHECKED
\frac{2Z^2}{\pi^2_{\rm min}\cdot\min_a\lambda_{\rm min}^{(a)}(t)} \sqrt{\frac{1}{\deltamu\cdot t}
\Big( \stdeveq^2 + \frac{4\Umunorm^2}{\gamma_{\phi^\star}}\big(1+\log\cdklstar\big) \Big)} + O(1/t^{3/2})
\ee
with probability at least $1-\deltamu$. Here, we have defined $1-\deltamu$ to be the probability of the bound as given in Eq. \eqref{eq:prob_bound_estimator}, and have Taylor expanded the $O(1/t)$ contribution in $\deltamu=\delta+O(1/t)$ from Eq. \eqref{eq:prob_bound_estimator} into an $O(1/t^{3/2})$ contribution to $\Umu$. %CHECKED
We have also used Assumption \ref{assumption_ctx_bound} to bound $||\mustara||_1<\Umunorm$.
(We remind the reader that the definitions of quantities in Eq. \eqref{eq:Umu_latent} are given in Theorem~\ref{theorem:estimator}.)

Likewise, Lemma~\ref{lemma:eigen_cov} (which was used to derive Theorem~\ref{theorem:estimator}) guarantees that Assumption \ref{assumption_eigen_bound} is satisfied under the same conditions, at time $t$, with
\be\label{eq:Ucov_latent_1}
(\Ucov)^2 = %CHECKED
\frac{\cb}{t\lambda_{\rm min}^{(a)}(t)}
\ee
with probability of failure $\deltacov\propto(\pi_{\min}-\cb^{-1})^{-2}/t$, %CHECKED
as shown in Eq. \eqref{eq:delta_lambda_def}.
Here we have converted the minimal eigenvalue lower bound of Eq. \eqref{eq:eigenvalue_bound_cov} into a maximal eigenvalue upper bound for the inverse matrix, $\covhat^{(a)}(t) = \frac{1}{t}(\covinv^{(a)}(t))^{-1}$.
\newcommand{\powcb}{\tilde{\nu}}

While $\lambda_{\rm min}^{(a)}(t)$ introduces time-dependence in Eqs. \eqref{eq:Umu_latent} and \eqref{eq:Ucov_latent_1}, this time-dependence can be ignored at late times, where all quantities converge to limiting asymptotic forms. 
Under the assumption of ergodicity of the latent Markov chain (used in Appendix~\ref{app:estimator_bound} for Theorem~\ref{theorem:estimator}), as $T\rarr\infty$ the latent state converges to an equilibrium distribution. Consequently, the generating distribution of context data $x_{t-\tau:t}$ approaches an asymptotic equilibrium distribution for any fixed $\tau$, as $t\rarr\infty$. Since the posterior probabilities $p^\star_t(z)=p(z_t=z|x_{1:t})$ are deterministic functions of context data, and furthermore \cyan{since the dependence on past data $x_{t'\ll t}$ becomes exponentially suppressed with decay rate $\gamma_{\phi^\star}$ (see Appendix~\ref{app:mixing_bounds})}, the distribution over these posterior probabilities will also converge exponentially quickly to an asymptotic equilibrium form at late times. Thus, setting $\ctxstar_t=p^\star_t$, we see that the distribution $\Pct$ over linear bandit context vectors converges to an asymptotic distribution, with differences \cyan{decaying exponentially in time with a decay rate $\gamma_{\phi^\star}$}.
Recall that the action-wise inverse covariance matrices
$$
\Beq^{(a)}(T) := \frac{1}{T}\sum_{t=1}^T \EE_{x_{1:t}}\[\one(a=a^\star_t)p^\star_t(p^\star_t)^\top\],
$$
defined in Eq. \eqref{eq:B_eq_element_def}, are sums of expectations over the posteriors $p^\star_t$.
%%% As $T\rarr\infty$, the average is dominated by terms with $t\rarr\infty$, for which $p^\star_t(z)=p(z_t=z|x_{1:t})$ with $x_{1:t}$
As $T\rarr\infty$, the contributions from $t<\sqrt{T}$ will decrease as $O(\sqrt{T}/T)=O(1/\sqrt{T})$, and will thus make an $O(1/\sqrt{T})$ contribution to the minimal eigenvalues $\lambda_{\rm min}^{(a)}(T)$. This can be absorbed into the $O(t^{-3/2})$ late-time corrections in Eq. \eqref{eq:Umu_latent}.
Furthermore, contributions to $\Beq^{(a)}(T)$ from $t>\sqrt{T}$ will be expectations over the limiting equilibrium distribution over $p^\star_t$, up to differences \cyan{decaying exponentially}, which can also be absorbed into the $O(t^{-3/2})$ corrections in Eq. \eqref{eq:Umu_latent}.\footnote{This relies on a strictly positive decay rate $\gamma_{\rm \phi^\star}$. However, when $\gamma_{\rm \phi^\star}\rarr0$, the upper bound $\Umu$ diverges and becomes vacuous anyway.}
Therefore, from now on, we set the minimal eigenvalues in Eqs. \eqref{eq:Umu_latent} and \eqref{eq:Ucov_latent_1} equal to their asymptotic values, which we define as
\be\label{eq:lambda_min_const_def}
\lambda_{\rm min}^{(a)} = \lim_{t\rarr\infty}\lambda_{\rm min}^{(a)}(t).
\ee
%%% [redunant with above:] Contributions to regret coming form the differences of these quantities from their $t\rarr\infty$ limits become exponentially small in time, and will thus result in contributions that are even smaller than the polynomial $O(1/t^{3/2})$ correction in Eq. \eqref{eq:Umu_latent}. 
Furthermore, we define $\lambda_{\rm min}:=\min_a\lambda_{\rm min}^{(a)}$ as the minimum eigenvalue over all action-wise inverse covariance matrices, $\Beq^{(a)}(t)$ as $t\rarr\infty$.

Lastly, under the assumption of Theorem~\ref{theorem:estimator} that the true posteriors $p^\star_t$ are used as context vectors, Assumption \ref{assumption_ctx_error_bound} is satisfied with $\Uctx=0$ and $\deltactx=0$.

Given these error bounds, and setting $\ctxdim=Z$ and $\Uctxnorm=1$ (since $||p^\star_t||_2\leq||p^\star_t||_1=\sum_z p^\star_t(z)=1$), Lemma~\ref{lemma:regret_timestep} takes the form
\be\label{eq:regret_timestep_hmm}
\dR^{(t)}%%(\ctxhat,\muhat,\covhat) 
\leq %(CHECKED)
\sum_{a^\star,a}
\frac{\rhoaa_{a^\star,a}^{(t)}}{\Delta_{a^\star,a}} \Big( %\big(||\mustarastar-\mustara||_1\Uctx + 
\big(2Z\Umu\big)^2 + 8\big(\Ucov\big)^2\log\zeta\Big) + 
O((\deltamu t)^{-3/2})
\ee
with probability at least $1-2Z\deltamu-\deltacov$, where $(\Umu,\Ucov)$ are given in Eqs. \eqref{eq:Umu_latent} and \eqref{eq:Ucov_latent_1}, and we have set $\Uctx=0$, 
and where $\zeta$ was defined in Eq. \eqref{eq:zeta_def}. 
Here, we have used the fact that $\Umu\propto(1/\sqrt{t\deltamu})$ in Eq. \eqref{eq:Umu_latent} in the $O(\Uerror^3)$ contributions in Eq. \eqref{eq:regret_bound_timestep}. 
(Note that in the context of linear Thompson sampling, the minimal probability $\pi_{\rm min}$ of selecting the optimal action can be \cyan{lower bounded at $1/K$ by initializing the empirical covariance matrices to allow for sufficiently broad posteriors over $\mustara$.})

The $\Ucov$ term only contributes subleading corrections to regret, as $t\rarr\infty$, for the following reason. 
At times $t$ when the bound on $\Ucov$ holds with probability $1-\deltacov$, the regret incurred scales as $1/t$.
Additional regret is incurred from times when the bound on $\Ucov$ fails. This occurs with probability $\deltacov\propto1/t$, yielding additional per-timestep expected regret that is also $O(1/t)$.

\newcommand{\Ufinal}{U}

Thus, with the $\Ucov$ term incorporated into the subleading corrections, and furthermore using Eq. \eqref{eq:lambda_min_const_def} in the $t\rarr\infty$ limit (as discussed above), we now have
\be\label{eq:regret_timestep_hmm_final}
\dR^{(t)}%%(\ctxhat,\muhat,\covhat) 
\leq %(CHECKED)
\Ufinal\times\sum_{a^\star,a}
\frac{\rhoaa_{a^\star,a}}{\Delta_{a^\star,a}} \ \  + O((\deltamu t)^{-3/2}) + O(1/t) + O(t^{-2})
\ee
with \cyan{probability\footnote{Recall that the probability $\deltacov$ of the covariance bound failing can be chosen to decay to zero as $1/t$.} at least $1-2Z\deltamu$,}
where
$$
\Ufinal := \frac{16Z^6}{\pi^4_{\rm min}\lambda^2_{\rm min}} \frac{1}{\deltamu\cdot t}
\Big( \stdeveq^2 + \frac{4\Umunorm^2}{\gamma_{\phi^\star}}\big(1+\log\cdklstar\big) \Big).
$$
We have also omitted the time index on $\rhoaa_{a^\star,a}$, which we define as the asymptotic limit
\be\label{eq:prob_adv_eq_def}
\rhoaa_{a^\star,a} := \lim_{t\rarr\infty} \rhoaa_{a^\star,a}^{(t)}.
\ee
This is because, since $\rhoaa_{a^\star,a}^{(t)}$ is also an expectation over the current distribution $\Pct$ of context vectors, it will converge to a fixed asymptotic value, with differences from its $t\rarr\infty$ limit \cyan{decaying exponentially}. As discussed above, these differences are smaller than the subleading corrections in Eq. \eqref{eq:regret_timestep_hmm_final}, so we omit them.

\iffalse
\dR^{(t)}%%(\ctxhat,\muhat,\covhat) 
\leq %CHECKED
\frac{Z^6}{\pi_{\rm min}^4\lambda_{\rm min}^2}
\frac{1}{\deltamu t}
\Big( \stdeveq^2 + \frac{4\Umunorm^2}{\gamma_{\phi^\star}}\big(1+\log\cdklstar\big) \Big)
\times\sum_{a^\star,a}
\frac{16\rhoaa_{a^\star,a}}{\Delta_{a^\star,a}}
+ O((\deltamu t)^{-3/2}) + O(t^{-2}),
\fi

In Eq. \eqref{eq:regret_timestep_hmm_final}, the $O(t^{-2})$ contribution comes from the $O(t^{-3/2})$ contribution to $\Umu$ in Eq. \eqref{eq:Umu_latent}.
Finally, defining the parameter $\delta_t$ in Eq. \eqref{eq:regret_timestep_sqrtT} as $\delta_t:=2Z\deltamu$, such that the per-timestep regret bound holds with probability $1-\delta_t$, we can apply Corollary~\ref{corollary:regret_generic_sqrtT}.
Plugging Eq. \eqref{eq:regret_timestep_hmm_final} into Eq. \eqref{eq:regret_timestep_sqrtT}, Eq. \eqref{eq:regret_generic_sqrtT} then recovers the final bound, Eq. \eqref{eq:regret_latent_final}. 

Note that: 
(1) The $O(T^{2/5})$ scaling of the subleading corrections arises from applying Lemma~\ref{lemma:regret_generic} to the $O((\deltamu t)^{-3/2})$ contribution in Eq. \eqref{eq:regret_timestep_hmm_final}, and setting $\powtime=\powdelta=3/2$ in Eqs. \eqref{eq:regret_timestep_generic}--\eqref{eq:regret_generic}. %(CHECKED)
(2) The $O(t^{-2})$ contribution to $\dR^{(t)}$ sums to a constant when summing over $t$, which is (asymptotically) smaller than the $O(T^{2/5})$ correction.
$\square$


\paragraph{Problem-dependent structure of Theorem~\ref{theorem:regret_llTS}.}

We end this section by reminding the reader of the key dependencies in Theorem~\ref{theorem:regret_llTS}, described in the main text.

In addition to these dependencies, 
the $Z$-dependence %(extra Z factors from requiring the error bound to hold for all z)
and dependences of $\gapadv$ in Theorem~\ref{theorem:regret_llTS} are inherited from Theorem~\ref{theorem:estimator}, with the regret at time $t$ being bounded proportional to the squared error, $||\muhat^{(a)}-\mustara||_2^2$. This dependence arises from the fact that increasing the error increases both (i) the size of the space of posterior beliefs $p^\star_t$ %context vector $\ctx_t$
for which the true reward gap $(p^\star_t)^\top(\mustarastar-\mustara)$ is too small to resolve relative to the error in estimating $\mustarastar-\mustara$, which increases the probability of a suboptimal action,
as well as (ii) the scale of the reward gap (regret incurred) when suboptimal actions are taken. 
In short, mistakes are made more frequently, and mistakes are more costly.

Furthermore, we note that regret is implicitly proportional to the number of actions $K$. This is because the inverse covariance $\Beq^{(a)}$ in Eq. \eqref{eq:B_eq_def} picks out only times when a given action $a$ is optimal, and thus scales as $1/K$, becoming small when there are many actions to choose from. Consequently, the corresponding eigenvalues $\lambda_{\rm min}^{(a)}$ also scale as $1/K$, leading to regret proportional to $K$. This captures the fact that when there are many actions to explore, it takes longer to reduce uncertainty (bounded by $\lambda_{\rm min}$) about all of them.

%Lastly, the lower bound $\pi_{\rm min}$ on the probability of choosing

\subsection{Bound on Gaussian Tail Probability Mass}
\label{app:tailbound}

The probability mass in the normal distribution $\mathcal{N}(0,\sigma^2)$ (with standard deviation $\sigma$) above threshold $x$ is
\be %CHECKED
\int_x^\infty dy \frac{1}{\sqrt{2\pi}\sigma}e^{-y^2/2\sigma^2} = \frac{1}{2}\(1-\erf(x/\sqrt{2}\sigma)\),
\ee
where $\erf()$ is the error function, which can be expanded for large argument values and bounded,
\be %CHECKED
\erf(z) > 1 - z^{-1}e^{-z^2}
\ee
for all $z>0$, with the bound capturing the correct asymptotic behavior (up to a constant factor) as $z\rarr\infty$.
Equivalently, the probability mass in the tail is bounded as
\be\label{eq:tailbound} %~CHECKED
\int_x^\infty dy \frac{1}{\sqrt{2\pi}\sigma}e^{-y^2/2\sigma^2} < \frac{\sigma}{\sqrt{2}x}e^{-x^2/2\sigma^2}
\ee
for $x>0$. 

\iffalse
We will also make use of the lower bound obtained by expanding the error function to the next order, which yields
\be\label{eq:tailboundlower} %~CHECKED
\int_x^\infty dy \frac{1}{\sqrt{2\pi}\sigma}e^{-y^2/2\sigma^2} > \frac{\sigma}{\sqrt{2}x}\(1-\frac{\sigma^2}{x^2}\)e^{-x^2/2\sigma^2}
\ee
for $x>\sigma$.
\fi

\end{document}
