% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments in
% reverse order, joined by the optional separator (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
\usepackage{xr}
\makeatletter
% \addFileDependency{<file name with extension>}
%   Announces <file> as a dependency of this document:
%     - writes "(<file>)" to the terminal/log
%       (presumably so that build tools which scan the log for opened files
%       pick it up -- TODO confirm against the build setup);
%     - passes <file> to the kernel's \@addtofilelist;
%     - writes "No file <file>." if the file cannot be found.
%   Every line ends with % so that the macro inserts no stray spaces,
%   regardless of where it is called.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother
% \myexternaldocument{<base name without extension>}
%   Makes the labels of <base name> available via xr's \externaldocument and
%   registers both <base name>.tex and <base name>.aux as dependencies.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% Make labels from nelson_697-supp (presumably the supplementary material --
% confirm) referenceable from this document.
\myexternaldocument{nelson_697-supp}
%\myexternaldocument{File2}

\usepackage[utf8]{inputenc}

\usepackage{amsmath,amssymb,amsthm}
\usepackage{xcolor}

\usepackage[ruled,vlined]{algorithm2e}
\usepackage{algpseudocode}

\usepackage{hyperref}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Useful Math
% Theorem-like environments.
%   - Environments declared with [theorem] share the theorem counter.
%   - assumption and remark each have their own independent counter.
%   - corollary is numbered within the current theorem.
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{hypothesis}[theorem]{Hypothesis}
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{definition}[theorem]{Definition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{example}[theorem]{Example}
\newtheorem{notation}[theorem]{Notation}
\newtheorem{trick}[theorem]{Trick Result}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{remark}{Remark}

% customthm: a "Theorem" whose printed number is supplied by hand, e.g.
%   \begin{customthm}{2'} ... \end{customthm}
% The argument replaces the printed counter (\theinnercustomthm) before the
% underlying innercustomthm environment is opened.
\newtheorem{innercustomthm}{Theorem}
\newenvironment{customthm}[1]
  {\renewcommand\theinnercustomthm{#1}\innercustomthm}
  {\endinnercustomthm}
  
% customassumption: an "Assumption" whose printed number is supplied by hand,
% e.g. \begin{customassumption}{1'} ... \end{customassumption}.
% The argument replaces the printed counter (\theinnercustomassumption) before
% the underlying innercustomassumption environment is opened.
\newtheorem{innercustomassumption}{Assumption}
\newenvironment{customassumption}[1]
  {\renewcommand\theinnercustomassumption{#1}\innercustomassumption}
  {\endinnercustomassumption}

\usepackage{apptools}
% Once the appendix starts, number the theorem and assumption counters within
% sections (environments sharing the theorem counter follow automatically).
\AtAppendix{%
  \counterwithin{theorem}{section}%
  \counterwithin{assumption}{section}%
}
%\AtAppendix{\counterwithin{lemma}{section}}
%\AtAppendix{\counterwithin{corollary}{section}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Elliot's definitions:

% Shorthands for opening/closing an equation environment.
\newcommand{\be}{\begin{equation}}
\newcommand{\ee}{\end{equation}}
%\newcommand{\ba}{\begin{align}}
%\newcommand{\ea}{\end{align}}
% Abbreviations for arrows and \nonumber.
\newcommand{\rarr}{\rightarrow}
\newcommand{\larr}{\leftarrow}
\newcommand{\nn}{\nonumber}
% Auto-sized delimiters.
% WARNING: these \def's overwrite LaTeX's standard \[ ... \] (display math)
% and \( ... \) (inline math) commands. From here on the four commands only
% produce \left/\right brackets and parentheses and must be used INSIDE math
% mode; enter math with $ ... $ or \be ... \ee instead.
\def\[{\left[}
\def\]{\right]}
\def\({\left(}
\def\){\right)}
% Angle brackets.
% NOTE(review): \def overwrites silently -- check that nothing loaded relies
% on an earlier meaning of \< or \>.
\def\<{\langle}
\def\>{\rangle}

% \bbone: the digit 1 set in the bbold font (encoding U, family bbold),
% wrapped in \text so that it can be used inside math.
\DeclareRobustCommand{\bbone}{\text{\usefont{U}{bbold}{m}{n}1}}
% Named math operators: upright regardless of the surrounding text font
% (e.g. inside italic theorem bodies) and with proper operator spacing.
% \argmax is deliberately unstarred so that subscripts stay at the side, as
% with the previous \text-based definition; the starred variant below would
% place them underneath in displayed formulas.
%\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator\erf{erf}
\DeclareMathOperator{\argmax}{argmax}

% Basic symbols: expectation, real numbers, normal distribution, and
% arrow-accented vectors.
\newcommand{\EE}{\mathbb{E}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\N}{\mathcal{N}} % standard normal distribution
\newcommand{\xv}{\vec{x}}
\newcommand{\av}{\vec{a}}
\newcommand{\rv}{\vec{r}}
\newcommand{\zv}{\vec{z}}

\newcommand{\R}{\mathcal{R}} % regret

% quantities for latent bandit problem setting
\newcommand{\A}{\mathcal{A}} % action space
\newcommand{\Z}{\mathcal{Z}} % latent space
\newcommand{\muhat}{\hat{\mu}} % estimated mean-reward parameters
\newcommand{\mustara}{\mu_\star^{(a)}} % true mean-reward vector for action a
\newcommand{\mustaraprime}{\mu_\star^{(a')}} % same, for action a'
\newcommand{\mustarastar}{\mu_\star^{(a^\star)}} % same, for the optimal action a^\star

% quantities for linear bandit problem setting
\newcommand{\ctx}{c} % context feature vector
\newcommand{\ctxspace}{\mathcal{C}} % presumably the space of contexts -- confirm against usage
\newcommand{\ctxdim}{d} % if we change this to Z, check usage...
\newcommand{\ctxstar}{\ctx^\star}
\newcommand{\ctxhat}{\hat{\ctx}}
%\newcommand{\ctxmatrix}{C}
%\newcommand{\ctxv}{\vec{\ctx}}
\newcommand{\Pct}{P_{\ctx}^{(t)}} % probability density function for \ctx at time t
% \pc and \pct both build on \rho; change \pc to switch the base symbol.
\newcommand{\pc}{\rho}
\newcommand{\pct}{\pc^{(t)}}

% quantities in estimator error bound derivation
% Upright labels in subscripts use \mathrm rather than the obsolete two-letter
% font switch \rm (same output in math mode, but does not depend on the class
% providing the legacy command).
\newcommand{\rhoeq}{\rho^{(\phi)}_{\mathrm{eq}}} % equilibrium (stationary) distribution for transition parameters \phi
\newcommand{\biasmatrix}{A}
\newcommand{\error}{g}
\newcommand{\dta}{\delta_t^{(a)}}
\newcommand{\dtpa}{\delta_{t'}^{(a)}}
\newcommand{\rstdev}{\sigma}
%\newcommand{\covreward}{\Sigma}
\newcommand{\cov}{\Omega}
\newcommand{\covhat}{\hat{\cov}}
\newcommand{\covinv}{B} %redefine as needed
%\newcommand{\covinveqa}{B^{(a)}_{\mathrm{eq}}}
%\newcommand{\fmu}{f_\mu}
\newcommand{\covinvmu}{\covinv} %\covinv_{\mu}
\newcommand{\Beq}{\bar{B}}
\newcommand{\udelta}{U_\delta} %%% TEMPORARY
\newcommand{\udeltaoverZ}{U_{\delta/Z}} %%% TEMPORARY
\newcommand{\Dphitau}{D_\phi(\tau_1)} %%% TEMPORARY

% quantities in regret bound
% Upright subscript labels use \mathrm rather than the obsolete \rm switch.
\newcommand{\gapadv}{\Delta_{\mathrm{likely}}}
\newcommand{\gapworst}{\Delta_{\mathrm{worst}}}
% quantities in regret bound derivation
\newcommand{\dR}{\delta\mathcal{R}}
\newcommand{\Rconst}{\mathcal{R}_0}
\newcommand{\dctx}{\delta\ctx}
\newcommand{\dmu}{\nu} % denotes a small perturbation to \mu
\newcommand{\threshold}{\dmu} % free parameter in the regret derivation
% \deltamu, \deltacov, \deltactx: the symbols \delta_1, \delta_2, \delta_3,
% used below as subscripts of \Umu, \Ucov and \Uctx respectively.
\newcommand{\deltamu}{\delta_1}
\newcommand{\deltacov}{\delta_2}
\newcommand{\deltactx}{\delta_3}
\newcommand{\deltaR}{\delta_\R}
\newcommand{\Umu}{U_{\deltamu}^{(\muhat)}}
\newcommand{\Ucov}{U_{\deltacov}^{(\covhat)}}
\newcommand{\Uctx}{U_{\deltactx}^{(\ctxhat)}}
\newcommand{\Uerror}{U}
\newcommand{\Uexponent}{\alpha}
\newcommand{\Umunorm}{u_{\mu}}
\newcommand{\Uctxnorm}{u_{\ctx}}
\newcommand{\UR}{U_{\mathcal{R}}}
\newcommand{\rhoat}{\rho^{(t)}_a}
\newcommand{\rhoaa}{\rho}
%\newcommand{\Ugap}{\Delta_{\rm max}}

% quantities for least-squares transition matrix estimation
%\newcommand{\CE}{H} %% in case we want to quickly replace H with some other notation

% other quantities
% Symbols for generic random variables.
\newcommand{\rand}{\eta} % a variable to denote random Gaussian noise ~N(0,1)
\newcommand{\randchi}{x} % chi-squared random variable

% UNCATEGORIZED quantities
% Upright subscript labels use \mathrm rather than the obsolete \rm switch
% (identical output in math mode).
\newcommand{\ppeq}{W}
\newcommand{\ca}{\kappa}
\newcommand{\cb}{\tilde{\kappa}}
%%% \newcommand{\cc}{\tilde{\kappa}}
\newcommand{\cdkl}{\zeta_{\phi}}
\newcommand{\cdklstar}{\zeta_{\phi^\star}}
\newcommand{\stdeveq}{\sigma_{\mathrm{eq}}}
\newcommand{\stdevz}{\sigma_z}
\newcommand{\stdevza}{\sigma^{(a)}_z}
\newcommand{\stdevzmax}{\sigma_{\mathrm{max}}}
\newcommand{\qtilde}{\tilde{q}}
\newcommand{\rhomin}{\rho_{\mathrm{min}}}
\newcommand{\coeff}{b}
\newcommand{\phat}{\hat{p}} % model (estimated) posterior
\newcommand{\pmin}{p_{\mathrm{min}}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\bzero}{\mathbf{0}}
\newcommand{\pluseq}{\mathrel{+}=}
% \eq is intentionally left as a braced old-style group: it has no sub- or
% superscript of its own, so it may be used in text as well as in math, and
% \mathrm would fail outside math mode.
\newcommand{\eq}{{\rm eq}}
\newcommand{\tp}{\top} % transpose superscript
\newcommand{\eps}{\epsilon}
\newcommand{\x}{\mathbf{x}}
\newcommand{\pb}{\mathbf{p}}
\newcommand{\rb}{\mathbf{r}}
\newcommand{\mub}{\boldsymbol\mu}
\newcommand{\Wb}{\mathbf{W}}
\newcommand{\Pb}{\mathbf{P}}
\newcommand{\Nv}{\vec{N}}
\newcommand{\Nmin}{N_{\mathrm{min}}}

% TEMPORARY / INTERMEDIATE:
% Named aliases for intermediate quantities, so that the printed symbol can be
% changed in one place.
\newcommand{\epsbar}{\bar{\epsilon}}
\newcommand{\epstemp}{\tilde{\epsilon}}
\newcommand{\yy}{y} % free parameter optimized in derivation
\newcommand{\ua}{u_1}
\newcommand{\ub}{u_2}
\newcommand{\uu}{u}
\newcommand{\vv}{v}
% NOTE(review): \xparallel prints the same symbol as \randchi (x) -- confirm
% that the two never appear in the same expression.
\newcommand{\xparallel}{x}

% colors:
% Inline text-colouring helpers (via \textcolor). Only \blue actually changes
% the colour; \cyan, \red, \gray and \new are all mapped to black, so text
% wrapped in them renders like ordinary text.
% NOTE(review): presumably neutralized for the final version -- confirm before
% restoring the original colours.
\newcommand{\blue}[1]{\textcolor{blue}{#1}}
\newcommand{\cyan}[1]{\textcolor{black}{#1}}
\newcommand{\red}[1]{\textcolor{black}{#1}}
\newcommand{\gray}[1]{\textcolor{black}{#1}}
\newcommand{\new}[1]{\textcolor{black}{#1}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Linearizing Contextual Bandits with Latent State Dynamics}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<enelson@ibm.com>?Subject=Your UAI 2022 paper}{Elliot~Nelson}{}}
\author[1]{Debarun~Bhattacharjya}
\author[1]{Tian~Gao}
\author[1]{Miao~Liu}
\author[1]{Djallel~Bouneffouf}
\author[2]{Pascal~Poupart}
% Add affiliations after the authors
\affil[1]{%
    IBM T. J. Watson Research Center\\
    Yorktown Heights, NY, USA
}
\affil[2]{%
    David R. Cheriton School of Computer Science\\
    University of Waterloo\\
    Waterloo, ON, Canada
}
  
\begin{document}

\maketitle

\begin{abstract}
 In many real-world applications of multi-armed bandit problems, %such as finance, healthcare, and personalized education, 
 both rewards and %the observable
 contexts are often influenced by confounding latent variables which
 %could potentially
 evolve stochastically over time. %confounder
%  Non-stationary bandit algorithms are capable of modeling dynamics in the reward distribution without incorporating latent structure, while other methods such as matrix factorization
  %algorithms -- not least matrix factorization methods for recommender systems -- 
%  model the causal influence of latent variables without dynamics. 
%  In this paper, we ....... 
  While the observed contexts and rewards are nonlinearly related, %%via the latent confounder,
  we show that prior knowledge of latent causal structure can be used to reduce the problem to the linear bandit setting. %%the linear relationship between rewards and latent state probabilities to reduce the problem to the linear bandit
  We develop two algorithms, Latent Linear Thompson Sampling (L$^2$TS) and Latent Linear UCB (L$^2$UCB), which use online EM algorithms for hidden Markov models to learn the latent transition model and maintain a posterior belief over the latent state, 
  %and latent-conditioned context distributions, 
  and then use the resulting posteriors as context features in a linear bandit problem.
  %% to build a least-squares reward estimator.
  %%estimate of mean rewards.
  %%mention Thompson sampling, LinTS ? … in which reward models are sampled 
  We upper bound the error in reward estimation in the presence of a dynamical latent state,
  %for both algorithms, 
  and derive \cyan{a novel problem-dependent regret bound for linear Thompson sampling with non-stationarity and unconstrained reward distributions}, which we apply to L$^2$TS under certain conditions.
  %% measure of an adversarial relationship between the model and environment. %% which is controlled by the degree to which the environment adversarially fools the model.
  Finally, we demonstrate the superiority of our algorithms over related bandit algorithms through experiments.
  %% We study empirically the convergence of our algorithm to optimality in environments %%nonstationary environments
\end{abstract}

\section{Introduction}
\label{sec:intro}

% Intro & motivate: latent var, nonstationarity, partial (causal) knowledge
Multi-armed bandits %, with or without contextual information,  
have been successfully applied in domains such as healthcare~\citep{pmlr-v85-durand18a,cmab-health}, finance~\citep{mab-finance}, and recommender systems~\citep{mab-application-rs}. 
%systems~\citep{mab-application-rs,mab-application-survey}. 
In this work, we are interested in contextual multi-armed bandit problems where the presence of a latent variable is crucial for predicting rewards.
Furthermore, it is typical in many real-world problems for additional complexity to arise in the form of %non-stationarity (dynamics) of the
latent variable non-stationarity (dynamics).
%with the following characteristics, since they are important for adequately modeling a host of real-world problems: 1) the presence of a latent variable that is crucial for determining reward, 2) non-stationarity in the latent variable, and 3) the potential to obtain partial knowledge from domain experts and/or prior offline data.
% Applications
Consider the following illustrative real-world applications:
%\vspace{-0.8em}
\begin{itemize}%[noitemsep,nolistsep,leftmargin=*]
    \setlength\itemsep{-0.2em}
    \item An interactive AI agent for personalized education chooses material to advance a student's evolving state of knowledge, using observations such as the time taken to answer questions. %with reward gauged by student performance
    \item A rover on a mission explores blocks of land, taking samples for information about the ore grade and choosing real-time mining strategies for each block.
    \item A recommender system selects items for users with evolving latent preferences or values, potentially using observable signals such as behavior patterns. % using some context such as demographic information
\end{itemize}
%\vspace{-0.8em}

%from personalized education as an illustrative application, which follows a typical human-AI interactive setting and can be modeled graphically as shown in Figure~\ref{fig:graphical_model}. An AI agent must choose educational material to help a student’s slowly evolving state of knowledge, where the reward is gauged by student performance and depends on their latent state of knowledge. The bandit problem may include a context $x$, such as the time taken to answer questions or absorb material, which could reveal information about the latent state of knowledge $z$. 

%As another application, consider a vendor making decisions about their level of engagement with a potential supplier. The amount of stock that is delivered and the related risk for the vendor to meet their demand depends on the latent state of well-being of the supplier, which may be evolving slowly due to external events that are infrequent. In the classic bandits application of recommender systems, our work is applicable for users with slowly evolving latent preferences/moods/goals.
% and the state of the environment or the preferences/goals of a user change slowly.

Such problems can be represented with the graphical model of Figure~\ref{fig:graphical_model}.
Here a decision-making agent must use additional side information or context data (denoted $x$) for inference of an unseen, time-dependent latent state (denoted $z$), in order to improve reward predictions.

%Our setting (see Figure \ref{fig:graphical_model}) applies generally to situations where explicitly modeling a latent variable is natural and beneficial, possibly due to a causal mechanism where the context is `caused’ by the latent variable. This often implies that the observed context contains useful information about the underlying latent variable, beyond what one could infer from rewards alone, making it possible to better predict rewards by inferring latent states from observations.}

Our approach to the non-stationary latent bandit problem of Figure~\ref{fig:graphical_model} focuses on leveraging prior knowledge of the graphical structure to apply simpler methods to a difficult problem, using a strategy of reduction to a known problem.
The linear multi-armed bandit setting \citep{auer2002,AbbasiYadkori2011} has been studied extensively, leading to many algorithms and related theoretical guarantees.
While complex real-world tasks generally involve nonlinear relationships between observed variables and target objectives (such as the nonlinear relationship between $x_t$ and $r_t$ in Figure~\ref{fig:graphical_model}), 
a key motivating observation for our work is that expected values, and in particular expected rewards, are linearly related to probabilities of unknown variables or parameters. This linear relationship can be exploited using algorithms and theoretical analyses for the linear bandit setting.
%[connect with the related work paragraph]

%as long as these probs ^^ can be modeled

\new{For the non-stationary bandit task of Figure~\ref{fig:graphical_model}, this requires maintaining posterior probabilities over the current latent state $z_t$.
%(with approx online Bayes inference)
%for non-stationary, to track these probs, we need something like EM:
Since Figure~\ref{fig:graphical_model} may be viewed as an extension of a hidden Markov model (HMM) \citep{rabiner1989hmm} into a multi-armed bandit task, we  leverage existing methods for online learning of HMMs.}
In particular, online expectation maximization (EM) is an established method which learns to perform approximate Bayesian inference over the latent state, when applied in our setting.
%and can be used to maintain an approximate posterior belief which is linearly related to expected rewards.

%\blue{tian: maybe highlight the differences with existing methods and how they cant handle our situations}

\paragraph{Contributions.}

We combine existing methods for hidden Markov models and linear bandit problems in a novel way, to make the following contributions:
(i) We identify conditions under which contextual multi-armed bandit problems with an evolving hidden state (Figure~\ref{fig:graphical_model}) can be mapped to a linear bandit problem. 
(ii) We introduce novel bandit algorithms for the setting of Figure~\ref{fig:graphical_model}, Latent Linear Thompson Sampling (L$^2$TS) and Latent Linear Upper Confidence Bounds (L$^2$UCB), which combine approximate online Bayesian inference over the latent state with linear bandit methods, 
%%learns to perform Bayesian inference over the latent state
and demonstrate superior performance compared to baseline algorithms. 
(iii) We derive a high-probability bound (Theorem~\ref{theorem:estimator}) on least-squares parameter estimation error in the setting of Figure~\ref{fig:graphical_model}.
(iv) We derive a novel problem-dependent regret bound for linear Thompson sampling with non-stationary and arbitrary reward distributions, and apply it to L$^2$TS
(Theorem~\ref{theorem:regret_llTS}).
%in the setting of Figure~\ref{fig:graphical_model}
%using Theorem~\ref{theorem:estimator}

In the next section, we discuss the advantages of our approach and limitations of existing multi-armed bandit approaches in settings where a time-evolving latent state influences contexts and rewards.

\section{Related Work}
\label{sec:relatedwork}

\textit{Linear Bandits.}
%As mentioned in Section~\ref{sec:intro}, 
Our work identifies a path for applying methods and analysis for the linear bandit framework \citep{auer2002,AbbasiYadkori2011} to a larger class of (nonlinear) contextual bandit problems.
We introduce algorithms which use the linear Thompson sampling algorithm of \cite{agrawal2013linTS} or the related LinUCB algorithm \citep{li2010linucb,chu2011contextual} as subroutines. %in a more general problem setting
While linear bandit methods have been applied in various settings, our approach of leveraging linearity with respect to posterior probabilities is novel, as is the application of the suite of linear bandit tools to latent bandit problems.

\textit{Non-Stationary Bandits.} 
The decision-making problem of Figure~\ref{fig:graphical_model} lies at the intersection of the (more general) class of contextual bandit problems, in which additional context information is available along with reward data, 
and the class of \emph{non-stationary bandit}~\citep{Auer2003,luo18a,Hartland2007,Garivier2008,Yu2009} problems, which introduce time-dependence into the reward distribution.
The bulk of existing work in non-stationary bandits focuses on detecting change in distributions or parameters \citep{luo18a}. 
In our setting, these methods are limited, as they cannot model the latent causal structure, which allows for improved modeling and prediction of distributional change over time.
%% The special case of piecewise stationary bandits~\citep{Hartland2007,Garivier2008,Yu2009} [MOVED THESE REFS ELSEWHERE] is applicable in our setting if latent states do not change for significant time periods. However, if latent state changes occur rapidly, these methods fall short due to their reliance on .... data from z_t

\textit{Latent Bandits.} 
A growing body of research on \emph{latent bandit}~\citep{maillard14,ZhouBrunskill2016} problems seeks to model reward distributions which are influenced by a latent state, as in Figure~\ref{fig:graphical_model}.
Most work in this area does not consider the case of dynamical state transitions.
The graphical structure of Figure~\ref{fig:graphical_model} is considered in \cite{Hong2021}, which in contrast to the present work, focuses on off-policy learning.
Other recent work \citep{Hong2020} (see also \cite{Hong2020a}) considers a closely related problem in which a dynamical hidden state influences rewards, but assuming a different graphical structure in which contexts are unaffected by the latent state (and thus cannot be leveraged for inference of $z$).
% [from previous version] studies Thompson sampling for a related non-stationary latent bandit problem in which a posterior over discrete latent-state histories is updated using rewards rather than context observations, which are assumed unaffected by the latent state.
Our approach is similar to that of~\cite{Hong2020}, in that we use Thompson sampling \citep{thompson1933,Chapelle2011,Russo2013} as an exploration heuristic.
\gray{However, their approach involves Thompson sampling of latent states as well as parameters.
In settings where latent state changes occur frequently, such exploration of the latent space may not yield significant information gain before the state changes again,\footnote{In contrast, information gained via exploration about fixed parameters will not become outdated.} and can thus under-exploit.
Furthermore, the practical algorithm proposed in \cite{Hong2020} uses particle filtering \citep{doucet2001smc}, which can struggle to scale to higher dimensions with a fixed number of particles.}
%Naive particle filtering generically results in weights for most particles falling to zero \cyan{[REF]}, and can struggle without a \cyan{[fill this in ]
In comparison, we sidestep the difficulties of approximating a high-dimensional posterior by selectively maintaining uncertainty over the most task-relevant unknowns. Moreover, in the asymptotic limit of long sequences, the cumulative log-likelihood $L(\mu_\star)=\sum_t\log p(r_t|a_t;\mu_\star)$ of reward data becomes large, causing the posterior $p(\mu_\star)\propto e^{L(\mu_\star)}$ over reward parameters $\mu_\star$ to satisfy the Laplace approximation and generally converge to a Gaussian form. We exploit this asymptotic property with linear Thompson sampling \citep{agrawal13a}, which uses a multivariate normal posterior. %and least-squares estimator with Gaussian asymptotics \citep{eiker1963}. [commenting out as I'm not sure this is relevant]
%sacrificing rewards unnecessarily.
%On the other hand, in the case of slow (enough) dynamics...sampling z could be advantageous

\textit{Recommender Systems.} The graphical structure of our problem, with a latent variable acting as a confounder of context observations and rewards, 
is shared with the literature on bandit algorithms for recommender systems \citep[e.g.,][]{Sen2017,Kawale2015}. In comparison to these works, which generally assume i.i.d.\ latent variables, our work is primarily an extension in the direction of non-stationarity.

\textit{Causal Bandits.}
Lastly, our work is also related to the burgeoning area of \emph{causal bandits}~\citep{Lattimore2016} where causal mechanisms are explicitly modeled. Confounding from a latent variable was considered in \cite{BareinboimFP15,Lee2018,Sen2017}, but under the assumption of i.i.d. data (no non-stationarity), 
and in an offline rather than online learning setting. % [LONGER:] \gray{Unlike \cite{BareinboimFP15} and related work, which considers confounding of contexts and actions and applies to offline learning from actions of other decision makers, we consider the situation where the latent variable is a confounder for the context and reward.}
%While some of the literature focuses on problems involving graphs where it is possible to intervene on many variables~\citep{Lattimore2016,yabe2018}, we consider the setting where there is a single confounding latent variable~\citep{BareinboimFP15,Sen2017,Lee2018}, because it is not possible to intervene on any variable other than the bandit arm in our setting.
%In comparison with the work of \cite{Yu2020}, which develops Thompson sampling algorithms for specific graphs with latent variables, we consider a slightly simpler graphical setting %%(since only the bandit arm can be intervened upon)
%but with more general conditional distributions as well as potential nonstationarity.

\textit{Theoretical Analysis.} Discussion of related work on the subject of regret bounds is deferred to Section~\ref{sec:regret}.

\section{Problem Setting}
\label{sec:setting}

In this section, we describe our contextual multi-armed bandit problem setting with a dynamical latent state (Section~\ref{sec:setting_latent}), 
%(ii) define in that setting a posterior belief which plays a central role in our algorithm,
describe a related linear bandit problem setting (Section~\ref{sec:linear}), and
%identify in the linear bandit problem setting several task-relevant functions of the context and reward distributions which will play a key role in our regret bound, and
show that the latent bandit setting of Section~\ref{sec:setting_latent} can be reduced to the linear bandit setting of Section~\ref{sec:linear} under certain conditions (Section~\ref{sec:reduction}).

\subsection{Non-stationary Latent Bandits}
% contextual (confounded?)
\label{sec:setting_latent}

We consider the non-stationary bandit environment of Figure~\ref{fig:graphical_model} in which a dynamical latent state $z$ acts as a confounder of observations (or contexts) $x$ and rewards $r$.
The figure is represented as an influence diagram, which is a graphical model for decision making under uncertainty~\citep{howard1984}. At any epoch, context $x$ is observed before selecting action $a$, and reward $r$ depends on 
$a$ and $z$.
%$a$ as well as the latent state $z$.
%In an influence diagram, decision and reward nodes are depicted as rectangles and diamonds (or polygons) respectively. 

\begin{figure}
    \centering
    \includegraphics[width=7.5cm]{graphical_model}
    %\includegraphics[width=\linewidth]
    \caption{An influence diagram representation of the non-stationary  version of our latent bandit setting. The latent state $z$ changes dynamically while context $x$ is observed at the time of choosing action $a$ (rectangle), represented by the informational arc from $x$ to $a$. Reward $r$ (diamond) is a function of $a$ and $z$.
    Shaded (white) nodes indicate observed (unobserved) variables.}
    %Black and red conditional edges denote known and unknown (or learned) conditional distributions respectively.
    \label{fig:graphical_model}
\end{figure}

While the context and reward may be either discrete or real-valued\footnote{We denote context as a scalar for simplicity but our work is equally applicable to settings with high-dimensional observations.}, the latent state $z\in\Z=\{1,...,Z\}$ and action 
$a\in\A=\{1,\dots,K\}$ are assumed to be discrete. The latent state $z$ evolves stochastically according to a transition matrix $\Phi^\star$ (assumed to be ergodic) with elements $\phi^\star_{z,z'} = p(z_t=z|z_{t-1}=z';\phi^\star)$, i.e., the first index labels the new state and the second the previous state.
The equilibrium distribution $\rhoeq(z)$ \new{for a given matrix $\Phi$} is the stationary distribution, $\Phi\rhoeq = \rhoeq$. (For any categorical distribution $p(z)$, we will denote by $p\in\RR^Z$ the vector whose elements are the probabilities $p(z)$.)
Given %a latent state 
$z$, an observed context $x$ %$x\in\mathbb{R}$ \elliot{denote $x$ as a scalar?} 
% Note from DB: Could leave domain unspecified
is generated from a conditional distribution $p(x|z;\theta^\star)$ with parameters $\theta^\star$. % which we assume to be known. 
%We assume access to a good but possibly imperfect approximation $\theta$ to $\theta^\star$, which may be available via domain expertise, or via offline samples $x_i\sim p(x|z)$ and accompanying labels $z$ of the generating distribution.
Lastly, rewards are generated from conditional distributions $p(r|z,a)$; %% and its parameters $\tilde{\theta}^\star$,
we denote their expected values as $(\mustara)_z:=\mathbb{E}[r|z,a]$, with $\mustara\in\RR^Z$ being an action-wise vector of means, and variance as ${\rm Var}[r|z,a]$.
% \footnote{Prior information about reward means may be availabile in the absence of the full reward distribution, as in \cite{sharma2020warm}.}
We collectively denote the action-wise parameter vectors as $\mu_\star := \{\mustara\}_{a=1}^K$.

Our algorithm relies on the estimation and use of a posterior belief, $p_t(z|x_{1:t}) := p(z_t=z|x_{1:t})$ 
over the current latent state, which is a categorical distribution represented as a $Z$-dimensional vector. Given a transition model $p(z'|z;\hat{\phi})$ and observation model $p(x|z;\hat{\theta})$, it can be updated every timestep with Bayes' rule:\footnote{We occasionally use $\propto$ to denote equality up to a normalizing constant.}
\be\label{eq:p_hat_def}
\phat_t(z|x_{1:t}) \propto \sum_{z'} \phat_{t-1}(z'|x_{1:t-1})
\hat{\phi}_{z,z'} %p(z_t=z|z_{t-1};\hat{\phi})
p(x_t|z;\hat{\theta})
\ee
where the hat notation denotes model estimates.  %% (REDUNDANT W/ COMMENT ABOVE:) (We generally assume $\hat{\theta}=\theta^\star$, and will later relax this assumption.) 
We will distinguish the model posterior $\hat{p}$ from the ``true'' posterior
\be\label{eq:p_star_def}
p_t^\star(z):=p(z_t=z|x_{1:t};\phi^\star,\theta^\star,\rho_0),
\ee
which uses ground truth parameters and the true prior, %distribution,
$p_0^\star(z):=\rho_0(z)$.

A policy $\pi$ is a mapping from partial histories $(x_{1:t},r_{1:t-1},a_{1:t-1})$ at any time $t$ to probabilities of selecting each action, $a_t=a$.
The optimal policy $\pi^\star$ is defined as the policy which selects, at every timestep, the action with highest expected reward, given the true parameters (but without accessing the true latent state), that is, $a_t^\star:=\argmax_a (p^\star_t)^\tp \mustara$. 
We will quantify performance with expected cumulative regret, defined -- for any policy $\pi$ -- as the loss in expected rewards after $T$ timesteps relative to the optimal policy: $\R_\pi(T) := \sum_{t\leq T} \(\EE_{\pi^\star}[r_t ] - \EE_\pi[ r_t ] \)$.

\subsection{Linear Bandits}
\label{sec:linear}

\newcommand{\noise}{\epsilon}

We will apply methods from the linear bandit setting to the contextual latent bandit setting of Section~\ref{sec:setting_latent}, in which observations $x_t$ and reward $r_t$ are nonlinearly related.
We work with a slightly modified linear bandit setting as compared to the typical setting in the literature \citep{agrawal2013linTS}:
At each timestep, a context feature vector $\ctx_t\in\RR^\ctxdim$ is observed, an action $a_t=a$ is selected from $K$ possible actions, and a reward \be\label{eq:linear_reward}
r_t = \ctx_t^\tp\mustara + \noise_t
\ee
with mean value $\ctx_t^\tp\mustara$ is observed. 
The random noise term $\noise_t\in\RR$ has mean zero by definition, $\EE[\noise_t]=0$, but need not satisfy any other conditions such as sub-Gaussianity or being i.i.d. across time.
In order to maximize returns, the agent must use the sequential context data $\ctx_{1:t}$ to learn the unknown mean reward parameters $\mustara\in\RR^\ctxdim$ for each action $a$.\footnote{\gray{In other variations of the linear bandit setting, the same parameters $\mu$ may be shared across actions, while a separate per-action context $\ctx_t^{(a)}$ may be observed.}}
Given the context $\ctx_t$, the corresponding optimal action is $a^\star_t := a^\star(\ctx_t) := \argmax_{a} \ctx_t^\tp \mustara$.  %%%\label{eq:linear_bandit_opt_action}

In Section~\ref{sec:llts}, we introduce algorithms which use linear Thompson sampling (LinTS) \citep{agrawal2013linTS} or LinUCB \citep{li2010linucb,chu2011contextual} as subroutines.
% [CUT:] \footnote{\cyan{See Appendix C of \cite{agrawal2013linTS} for the case of per-action reward parameters $\mu^{(a)}$.}}
LinUCB and LinTS use observed contexts and rewards to maintain (for each action) a least-squares estimator:
\be\label{eq:mu_hat}
\hat{\mu}^{(a)} = (B^{(a)})^{-1} f^{(a)},
\ee
where $f^{(a)}:=\sum_{t'=1}^t \one(a_{t'}=a) \ctx_{t'} r_{t'}$, with $\one(A)$ being the indicator function equal to $1$ ($0$) when $A$ is true (false), and $B^{(a)} := \lambda_\mu\bbone_{\ctxdim}+\sum_{t'=1}^t \one(a_{t'}=a) \ctx_{t'}\ctx_{t'}^\top$ is an empirical covariance matrix (we assume $\lambda_\mu>0$ to ensure invertibility). 
%% [ALTERNATE] (\ctxmatrix(t) \ctxmatrix^\tp(t)+\lambda_\mu\bbone_d)^{-1} \ctxmatrix(t) R(t)^\tp
 % $\hat{\cov}^{(a)}_t$
%% $(\ctxmatrix_a 
%% \ctxmatrix_a(t) := \sum_{t'=1}^t \one(a_{t'}=a)\ctx_t\ctx_t^\tp
%% $\ctxmatrix_a\in\RR^{\ctxdim\times t}$
%% $R_a\in\RR^t$ 
%% (with a subscripts:) $\hat{\mu}^{(a)}_t=(\ctx_a \ctx_a^\tp)^{-1} \ctx_a r_a^\tp$ and covariance matrix $\hat{\cov}^{(a)}_t=(\ctx_a \ctx_a^\tp)^{-1}$ given contexts $\ctx_a\in\RR^{\ctxdim\times t}$ and rewards $r_a\in\RR^t$ 
%% seen when choosing action $a_{t'}=a$ at times $t'\leq t$. 
LinUCB uses the estimator covariance to compute upper confidence bounds, while LinTS uses each estimator $\hat{\mu}^{(a)}$ to Thompson sample from a multivariate Gaussian posterior, 
$\mu^{(a)}\sim\N(\hat{\mu}^{(a)},(B^{(a)})^{-1})$,
%% $\mu^{(a)}\sim\N(\hat{\mu}_t^{(a)},\hat{\cov}_t^{(a)})$ for $a\in\A$, %%%\label{eq:linTS_posterior}
%%% \hat{\mu}^{(a)}(\ctx,\av,\rv)$
%%% $\hat{\cov}^{(a)}=\hat{\cov}^{(a)}(\ctx,\av,\rv)$
and selects at each timestep the corresponding optimal action:
$a_t=\argmax_a\ctx_t^\top\mu^{(a)}$.
%%% could add an equation expressing this as a policy pi(a|\ctx)

\subsection{Reduction to the Linear Bandit Problem}
\label{sec:reduction}

We now exploit the linear relationship between rewards and probabilities over the latent space to show that the latent bandit problem of Section~\ref{sec:setting_latent} can be reduced to the linear bandit setting of Section~\ref{sec:linear}.

\begin{lemma}\label{lemma:reduction}
When the true model parameters $(\theta^\star,\phi^\star)$ and initial latent state probabilities $\rho_0(z) = p(z_0=z)$ in the model from Figure~\ref{fig:graphical_model} are known, the latent bandit setting of Section~\ref{sec:setting_latent} reduces to the linear bandit setting of Section~\ref{sec:linear}. %% with the linear bandit context vector $\ctx_t$ at time $t$ being equated with the vector of posterior probabilities $p(z|x_t;\theta^\star,\phi^\star)$ for latent states $z$
\end{lemma}
%% Given access to $p^\star$ (i.e. knowledge of $\theta^\star$, $\phi^\star$)
%% given prior knowledge about transition probabilities $p(z'|z;\phi)$ and context likelihoods $p(x|z;\hat{\theta})$. 
%\vspace{-0.5cm}
\begin{proof}
Conditional on a sequence of observations $x_{1:t}$ in the latent bandit setting %of Section~\ref{sec:setting_latent}, 
and action $a_t=a$, the reward $r_t$ is generated from the mixture distribution
$$
p(r_t=r|a_t=a,x_{1:t};\theta^\star,\phi^\star)
= \sum_z (\ctx_t)_z p(r|z,a),
$$
where we have defined $\ctx_t\in\RR^Z$ as the vector with elements equal to the posterior probabilities
%\vspace{-0.2cm}
\be\label{eq:ctx_posterior}
(\ctx_t)_z := p(z_t=z|x_{1:t};\theta^\star,\phi^\star) = p^\star_t(z).
\ee
%\vspace{-0.3cm}
The expected reward at time $t$ is therefore
$$
\EE[r_t|a_t=a,x_{1:t};\theta^\star,\phi^\star] = \sum_z (\ctx_t)_z (\mustara)_z = \ctx_t^\top\mustara.
$$
Thus, the reward takes the form of Eq. \eqref{eq:linear_reward}, with $d=Z$ being the number of latent states, $\ctx_t$ defined in Eq. \eqref{eq:ctx_posterior}, and $\mustara\in\RR^Z$ being the vector of latent-conditioned mean rewards $(\mustara)_z$.
\end{proof}

Lemma~\ref{lemma:reduction} shows that the posterior belief over the current latent state $z_t$ can be viewed as a compression of the context history $x_{1:t}$ into a (nonlinearly) transformed context variable which is related linearly to rewards. 
Since Lemma~\ref{lemma:reduction} assumes access to the true parameters $(\theta^\star,\phi^\star)$, in general it will only apply in the asymptotic limit ($t\rarr\infty$) in which $(\theta^\star,\phi^\star)$ have been learned. 
Prior to this asymptotic regime, error in model estimates of these parameters will corrupt the context features $\ctx_t$ in the corresponding linear bandit problem with noise and/or systematic bias.
%commenting out the comment below, since the more general version would require a task distribution that matches the agent's Bayesian prior uncertainty, in order for the model posterior be the same as the true marginal generating distribution for z ... and if not, then you have bias/noise/error in c
%However, since Lemma~\ref{lemma:reduction} relies essentially on linearity with respect to probabilities, we expect a more general version to hold when the posterior probabilities $p_t(z)$ are obtained by marginalizing out uncertainty over other variables.

We end this section by noting that the space of context vectors $\ctx_t$, or equivalently posterior beliefs $p^\star_t$ (see Eq. \eqref{eq:ctx_posterior}), is partitioned into regions -- denoted $\mathcal{P}_{a^\star}$ -- within each of which action $a^\star$ is optimal, i.e. $a^\star=\argmax_a \ctx_t^\top\mustara$. In the following section, we will build on Lemma~\ref{lemma:reduction} to develop a latent bandit algorithm which estimates rewards, Eq. \eqref{eq:mu_hat}, with contexts $\ctx_t\rarr p^\star_t$ as in Eq. \eqref{eq:ctx_posterior}.

\section{Latent Linear Bandit Algorithms}
\label{sec:llts}

Since the non-stationary latent bandit problem of Section~\ref{sec:setting_latent} can be reduced to the linear bandit setting as long as an accurate posterior belief over the latent state $z$ can be maintained, 
algorithms for the latent bandit problem can be built by combining (i) methods for approximate inference over $z$ with (ii) linear bandit algorithms.
In this paper, we introduce two specific such algorithms, which combine (i) Online Expectation Maximization (EM) for learning the parameters $(\theta^\star,\phi^\star)$ of a hidden Markov model (and thus learning the ``true'' posteriors $p^\star_t(z)$ assumed in Lemma~\ref{lemma:reduction}), and (ii) either LinTS or LinUCB, 
into an end-to-end pipeline. %for the confounded and non-stationary latent bandit setting of Section~\ref{sec:setting_latent}. 

\textit{Latent State Inference.}
We use the online EM algorithm of \cite{mongillo2008online} (for categorical context data), and the related Algorithm 1 of
\cite{cappe2011online} (for continuous context data).
As indicated in Algorithms~\ref{alg:llts} and \ref{alg:llucb}, after observing $x_t$ these online EM algorithms recursively update (i) the vector estimate $\phat_t$ of latent state probabilities, (ii) 
%higher-dimensional
sufficient statistics $\hat{\psi}_t$, and (iii) parameter estimates $(\hat{\theta},\hat{\phi})$ (determined by $\hat{\psi}_t$).
Further details, including the form of sufficient statistics $\hat{\psi}_t$ for multinomial or Gaussian distributions, are provided in Appendix~\ref{app:online_em}.
Importantly, the approximate Bayes' update of the model posterior over the latent state, Eq. \eqref{eq:p_hat_def}, takes place as part of the online EM update.
After observing the reward $r_t$, the model posterior $\phat_t$ is again updated using a reward likelihood model $p(r|z,a;\muhat)$ which is either Bernoulli or Gaussian in our experiments (see Appendix~\ref{app:experiments}).
%\cyan{As indicated, this update is optional. While it can accelerate learning when $\muhat$ is close to $\mu_\star$, it also feeds back into the model posterior $\phat_t$ (and hence all parameter estimates) additional bias from $\muhat\neq\mu_\star$ or from a misspecified and over-confident (low-entropy) reward distribution $p(r|z,a;\muhat)$.}

\begin{algorithm}%[H]
    \caption{Latent Linear Thompson Sampling (L$^2$TS)}\label{alg:llts}
        \textbf{Input:}\\
        \ \ Prior over latent state, $\phat_0\in[0,1]^Z$\\
        \ \ Initial parameter estimates $(\hat{\theta},\hat{\phi})$\\
        \ \ Initial sufficient statistics $\hat{\psi}_0$\\
        \ \ $f^{(a)} = \bzero_{Z}$, $B^{(a)}=\lambda_\mu\bbone_Z$, for $a\in\A$; $\lambda_\mu>0$\\
        \ \ \blue{Likelihood standard deviation $\tilde{\sigma}_r>0$}\\
        \textbf{for }$t\leftarrow 1,2,...$\textbf{ do}\\
        \ \ Observe $x_t$;\\
        \ \ Update posterior $\phat_t$ and parameters $(\hat{\theta},\hat{\phi})$:\\
%        \ \ \ \ $\phat_t(z) \propto \sum_{z'} \phat_{t-1}(z')\hat{\phi}_{z,z'}p(x_t|z;\hat{\theta})$\\
        %\ \ Online HMM update:\\
        \ \ \ \ $(\hat{\theta},\hat{\phi},\phat_t,\hat{\psi}_t)=\text{OnlineEM}(x_t;\hat{\theta},\hat{\phi},\phat_{t-1},\hat{\psi}_{t-1})$\\
        %% \ \ Update posterior, $\phat_t\propto e^{-Y_t}\odot(\hat{\Phi}\phat_{t-1})$\\
        %% [OR] $\phat_t(z') \propto p(x_t|z';\hat{\theta})\sum_{z}\phat_{t-1}(z)\hat{\phi}_{z,z'}$\\%%using transition model,
        \ \ \blue{Sample $\mu^{(a)}\sim\N(\hat{\mu}^{(a)},\tilde{\sigma}_r^2(B^{(a)})^{-1})$ for $a\in\A$}\\
        \ \ \blue{Select action $a=\argmax_{a'} \phat_t^\top\mu^{(a')}$} \\
        \ \ Observe $r_t$\\
        \ \ Update mean reward estimates:\\ %%and covariance: \\
        \ \ \ \ $B^{(a)} \larr B^{(a)} + \phat_t \phat_t^\top$, \ \ $f^{(a)} \larr f^{(a)} + \phat_t r_t$ \\
        \ \ \ \ $\hat{\mu}^{(a)} = (B^{(a)})^{-1}f^{(a)}$\\
        \ \ Update posterior,  $\phat_t(z)\propto\phat_t(z)\,p(r_t|z,a;\hat{\mu})$
        %\text{     [optional]}
        %%% [OR] $\phat_t \propto \phat_t\odot\ell(r_t)$ 
        %%% where \cyan{$....=\exp[-(r-\hat{\mu}^{(a)}_z)^2/2(\tilde{\sigma}_r^{(a)})^2]$}.
\end{algorithm}

\textit{Thompson Sampling and UCB.}
As described in Section~\ref{sec:reduction}, we use the model posterior over the current latent state $\phat_t$ as a context feature vector in the linear bandit setting, $\ctx_t=\phat_t$, and \new{apply either linear Thompson Sampling \citep{agrawal2013linTS} (L$^2$TS, Algorithm~\ref{alg:llts}) or LinUCB \citep{li2010linucb,chu2011contextual} (L$^2$UCB, Algorithm~\ref{alg:llucb}) as exploration heuristics to select actions.}
Like L$^2$TS, L$^2$UCB treats the posterior beliefs $\phat_t$ as context vectors in a linear bandit problem, and uses the same reward estimators $\{\muhat^{(a)}\}$ and covariance matrices $B^{(a)}$.
\new{The differences between L$^2$TS and L$^2$UCB are highlighted in blue in Algorithms~\ref{alg:llts} and \ref{alg:llucb}.}
\new{Note that L$^2$UCB asymptotically selects the action with the highest expected reward $\phat_t^\top\hat{\mu}^{(a)}=\sum_z\phat_t(z)\hat{\mu}^{(a)}_z$ given the current posterior vector $\phat_t$, and assigns an exploration bonus to actions whose reward estimates $\hat{\mu}_z^{(a)}$ it is less certain of (in terms of the covariance $(B^{(a)})^{-1}$), for states $z$ that have high probability $\phat_t(z)$.}

We emphasize that while online EM only maintains point estimates $(\hat{\theta},\hat{\phi})$, L$^2$TS and L$^2$UCB use exploration heuristics which leverage uncertainty in reward parameters $\{\hat{\mu}^{(a)}\}$ and in the current latent state $z_t$. %\footnote{We leave to future work the incorporation of uncertainty estimates in $(\hat{\theta},\hat{\phi})$ into the (marginal) posterior vector $\phat_t$.}
\cyan{In comparison, the algorithm of~\cite{Hong2020}  
%the algorithm of Hong et. al. \cite{Hong2020} 
also maintains Bayesian uncertainty over the transition matrix, requiring a more computationally intensive particle filtering implementation. Our more computationally lightweight approach focuses on maintaining task-relevant uncertainty over $(z_t;\mu_\star)$ (see Section~\ref{sec:relatedwork}), and performed best empirically (Section~\ref{sec:experiments}).
%and thus avoids the difficulties of approximating a complex, higher-dimensional posterior)
}
\new{The computational complexity of L$^2$TS and L$^2$UCB is polynomial in the number of latent states $Z$ (due to the online EM updates shown in Appendix~\ref{app:online_em}; see \cite{cappe2011online} for further discussion) and independent of the time $t$, making these algorithms scale well in problems with very long time horizons and low-dimensional latent structure.}

\begin{algorithm}%[H]
    \caption{Latent Linear UCB (L$^2$UCB)}\label{alg:llucb}
        \textbf{Input:}\\
        \ \ Prior over latent state, $\phat_0\in[0,1]^Z$\\
        \ \ Initial parameter estimates $(\hat{\theta},\hat{\phi})$\\
        \ \ Initial sufficient statistics $\hat{\psi}_0$\\
        \ \ $f^{(a)} = \bzero_{Z}$, $B^{(a)}=\lambda_\mu\bbone_Z$, for $a\in\A$; $\lambda_\mu>0$\\
        \ \ \blue{Exploration parameter $\alpha_{\rm UCB}>0$}\\
        \textbf{for }$t\leftarrow 1,2,...$\textbf{ do}\\
        \ \ Observe $x_t$;\\
        \ \ Update posterior $\phat_t$ and parameters $(\hat{\theta},\hat{\phi})$:\\
        \ \ \ \ $(\hat{\theta},\hat{\phi},\phat_t,\hat{\psi}_t)=\text{OnlineEM}(x_t;\hat{\theta},\hat{\phi},\phat_{t-1},\hat{\psi}_{t-1})$\\
        \ \ \blue{Compute upper confidence bounds, \\
        \ \ \ \ $\pi_a = \phat_t^\top\hat{\mu}^{(a)}  + \alpha_{\rm UCB}\sqrt{\phat_t^\top (B^{(a)})^{-1}\phat_t}$} \\
        \ \ \blue{Select action $a=\argmax_{a'}\pi_{a'}$} \\
        \ \ Observe $r_t$\\
        \ \ Update reward estimator \& covariance:\\
        \ \ \ \ $B^{(a)} \larr B^{(a)} + \phat_t \phat_t^\top$, \ \ $f^{(a)} \larr f^{(a)} + \phat_t r_t$ \\
        \ \ \ \ $\hat{\mu}^{(a)} = (B^{(a)})^{-1}f^{(a)}$\\
        \ \ Update posterior,  $\phat_t(z)\propto\phat_t(z)\,p(r_t|z,a;\hat{\mu})$
        %\text{     [optional]}
\end{algorithm}

\section{Theoretical Analysis}

In this section, we 
(i) demonstrate that linear bandit reward estimation can be effectively applied to the non-stationary latent bandit setting from Figure~\ref{fig:graphical_model} by upper bounding the error of the reward estimators used by L$^2$TS and L$^2$UCB (Theorem~\ref{theorem:estimator}), 
and (ii) derive a high-probability regret bound for linear Thompson sampling, using Theorem~\ref{theorem:estimator} to apply it to L$^2$TS.

\subsection{Reward Estimation Error}
\label{sec:estimator_bound}

In the case of a dynamical latent state, the reduction to the linear bandit setting described in Section~\ref{sec:reduction} results in contexts $\ctx_{1:t}$ and rewards $r_{1:t}$ which are not i.i.d. across time. Here, we state a result which shows that reward estimation via reduction to the linear bandit setting will converge to the true reward parameters $\{\mustara\}$ given a sufficiently long time horizon:

\begin{customthm}{1}
\label{theorem:estimator}
Assuming that 
(i) the latent state Markov chain is ergodic and in equilibrium, $z_1\sim\rhoeq(\cdot)$, 
and when (ii) \cyan{the true parameters $(\theta^\star$, $\phi^\star)$ are known % i.e. $\phat_t\rarr p^\star_t$,
and are used to compute $\hat{\mu}^{(a)}$ 
[Eq. \eqref{eq:mu_hat} with $\ctx_t\rarr p^\star_t$, in Eq. \eqref{eq:ctx_posterior}]},
the error in $\hat{\mu}^{(a)}$ at time $t=T$ for any algorithm which selects the optimal action given $x_{1:T}$ with probability at least $\pi_{\rm min}$, is upper bounded,\footnote{Here, $||\mu||_\ell$ denotes the $\ell$-norm of a vector $\mu$.}
\begin{align}
& |\hat{\mu}^{(a)}_z - (\mustara)_z|
\label{eq:estimator_bound} \\
& < %CHECKED
\frac{2Z^2}{\pi^2_{\rm min}\lambda^{(a)}_{\rm min}} \sqrt{\frac{1}{\delta\cdot T}
\Big( \stdeveq^2 + ||\mustara||_1^2\frac{4}{\gamma_{\phi^\star}}\big(1+\log\cdklstar\big) \Big)} \nn
\end{align}
for any $z$ with probability at least
\be\label{eq:prob_bound_estimator} %CHECKED
1 - \delta - \frac{8Z^3}{\pi^2_{\rm min}\lambda^{(a)}_{\rm min}}\frac{1}{T\gamma_{\phi^\star}}(\ca+\log\log(1/\rhomin))
\ee
for any $\delta\in(0, 1)$.
% and any $T$ s.t. Eq. \eqref{} is positive
% and $T/(1-\delta)>8Z^3(\ca+\log\log(1/\rhomin))/(\lambda^{(a)}_{\rm min}\gamma_{\phi^\star})$.  
Here, $\ca\approx 6.8$,
$\cdklstar$ is a $\Phi^\star$-dependent numerical constant (see Appendix~\ref{app:numerator}),
$\rhomin := \min_z\rhoeq(z)$ is the equilibrium probability of the least probable latent state, 
$\stdeveq^2 := \max_a \sum_z\rhoeq(z){\rm Var}[r|z,a]$ % [OR:] \(\EE_{r\sim p(\cdot|z,a)}[r^2] - (\mustara)_z^2\)
is a measure of reward noise when the latent state is in equilibrium, 
$\lambda^{(a)}_{\rm min}=\lambda^{(a)}_{\rm min}(T)$
%=\lambda^{(a)}_{\rm min}}(\theta^\star,\phi^\star)
is the minimal eigenvalue of the action-wise asymptotic expected inverse covariance matrix\footnote{Recall that $\one (p_t\in\mathcal{P}_a)$ is the binary truth value of the statement that $a=\argmax_{a'} p_t^\tp\mustaraprime$ is the optimal action given the posterior belief $p_t$.}
\be\label{eq:B_eq_def}
\Beq^{(a)}(T) := % (\theta^\star,\phi^\star)
\frac{1}{T}\sum_{t=1}^T
\EE_{x_{1:t}\sim\rhoeq}[
\one(p^\star_t\in\mathcal{P}_a)
p^\star_t (p^\star_t)^\top],
\ee
averaged over histories generated from the equilibrium distribution, 
%(This can be viewed as an average over the context distribution $P(c)$ for where $c$ is now the posterior probability vector over the latent state.)
and $\gamma_\phi := \min_{z_1,z_2} \sum_z \min(\phi_{z,z_1},\phi_{z,z_2})$
is the minimal mixing rate of a transition matrix $\Phi$ \citep{boyenkoller1998}.
\end{customthm}
%\vspace{-0.6cm}
\begin{proof} [Proof (Outline)]
Appendix~\ref{app:estimator_bound} has the complete proof. The derivation relies primarily on a KL divergence contraction theorem for stochastic Markov processes from \cite{boyenkoller1998} to show that posterior probabilities used to compute the estimators $\hat{\mu}^{(a)}$ are approximately uncorrelated, $\EE[p^\star_t(z)p^\star_{t'}(z')]\approx\EE[p^\star_t(z)]\EE[ p^\star_{t'}(z')]$, over time separations $|t-t'|$ greater than the minimal mixing time $1/\gamma_{\phi^\star}$. Thus, the quantities $f^{(a)}$ and $\covinvmu^{(a)}$ in Eq. \eqref{eq:mu_hat} are sums of approximately independent random variables over blocks of at least $1/\gamma_{\phi^\star}$ timesteps.
We quantify this with upper bounds on the variances ${\rm Var}[f^{(a)}]$ and ${\rm Var}[\covinvmu^{(a)}]$ across context and reward histories, apply Chebyshev's inequality to obtain high-probability bounds on the deviation of $f^{(a)}$ and $\covinvmu^{(a)}$ from their expected values at large $T$, and derive an eigenvalue bound for the inverse matrix $(\covinvmu^{(a)})^{-1}$ in order to upper bound the deviation of the product $\hat{\mu}^{(a)} = (\covinvmu^{(a)})^{-1}f^{(a)}$ from $\mustara$.
%%  inverse covariance $\ctxmatrix\ctxmatrix^\tp$ which quantifies uncertainty in the estimators.
\end{proof}

%\vspace{-0.3cm}

Theorem~\ref{theorem:estimator} describes the effect of the latent dynamics and resulting posterior beliefs $p^\star_t$ on reward parameter estimation. At times $T$ sufficiently large compared to the mixing time $t_{\phi^\star}:=1/\gamma_{\phi^\star}$, correlations between posterior beliefs (i.e. the independent variables in linear regression estimation of $\mustara$) at different times are small, and reward data are close to i.i.d., allowing for a $1/\sqrt{T}$ error reduction.
%(Note that the mixing time becomes large when the spectral gap $1-\lambda_2$ is small, corresponding to rare transitions to new states, or when $\rhomin$ is small due to a rarely sampled state. When mixing occurs rapidly, the reward variance $\sigma_a^2$ is dominant, as in the i.i.d. case.) 
The dependence on $\lambda^{(a)}_{\rm min}$ in Eq. \eqref{eq:estimator_bound}, which approaches a fixed asymptotic value in the $t\rarr\infty$ limit where posterior vectors $p_t$ are generated from a fixed asymptotic distribution, captures the benefit of more diverse posterior beliefs $p^\star_t$. When observations $x_t\sim p(\cdot|z^\star_t)$ contain little information about the true state $z^\star_t$, posterior beliefs will be more uncertain, decreasing $\lambda^{(a)}_{\rm min}$, which falls to zero in the limit where posteriors $p^\star_t$ fail to span the space of possible beliefs (e.g. if some latent states are indistinguishable), making $\covinv^{(a)}_\eq$ no longer full rank, and hence singular.\footnote{Furthermore, if an action $a$ is rarely or never optimal, $\covinv_{\rm eq}^{(a)}$ will approach the zero matrix, and again $\lambda_{\rm min}^{(a)}\rarr0$ and the bound becomes weak due to %a deficit in 
less data for action $a$.} 
The $Z$-dependence in Eq. \eqref{eq:estimator_bound} indicates that reward estimation is easier when the latent space is lower dimensional, in which case prior knowledge of the latent structure is more valuable.
Lastly, note that the bound probability, Eq. \eqref{eq:prob_bound_estimator}, reduces to $1-\delta$ as $T\rarr\infty$, but falls to zero at early times.

\subsection{Regret Bound}
\label{sec:regret}

%[Linear TS regret:]
%\begin{customthm}{2a}
%\label{theorem:regret_linTS}
%\end{customthm}

Since Algorithm~\ref{alg:llts} uses Thompson sampling with multivariate normal posteriors centered around the estimators $\muhat^{(a)}$ whose errors are bounded at large $t$ in Theorem~\ref{theorem:estimator}, we expect that as $t\rarr\infty$ and these posteriors become sharply peaked, sampled parameters $\mu^{(a)}$ will approach the true values $\mustara$, resulting in low regret.

Theorem~\ref{theorem:regret_llTS} below demonstrates this, and depends on two important quantities:
(1) We define the pairwise reward gap
\be
\Delta_{a^\star,a}=||\mustarastar-\mustara||_2
\ee
as the Euclidean norm of the difference of mean reward parameters for actions $a^\star$ and $a$.
%\be\label{eq:prob_a_opt_asym_def}
%\pc_{a^\star}^{(t)} :=
%\EE_{x_{1:t}\sim\rhoeq}[\one(p^\star_t\in\mathcal{P}_{a^\star})]
%%\mathbb{P}(a_t^\star=a|x_{1:t};\theta^\star,\phi^\star,\mu_\star)
%\ee
%Furthermore, we define ... as the probability that 
%%% In addition to the pairwise gaps $\Delta_{a^\star,a}$, our bound is controlled by the
(2) We define the limiting pairwise probability density
\begin{align}
    & \rhoaa_{a^\star,a}^{(t)}
    := \lim_{\eps\rarr0+} \frac{1}{\eps}\cdot
    \mathbb{P}_{x_{1:t}\sim\rhoeq}
    \Big(p_t^\star\in\mathcal{P}_{a^\star}, \nn \\
    & \ \ \ \ \ \ \ \
    (p_t^\star)^\top(\mustarastar-\mustara)<\eps
    ||\mustarastar-\mustara||_2 %%\Delta_{a^\star,a}
    \Big),
    % \big|\theta^\star,\phi^\star,\mu_\star
    \label{eq:probadv_def_latent}
\end{align}
which is the probability density that (i) action $a^\star$ is optimal at time $t$, i.e. $a^\star_t:=a^\star(p^\star_t)=a^\star$, 
and (ii) the reward gap between action $a^\star$ and $a$ is infinitesimally small.
This quantifies the probability that the sequence of context data $x_{1:t}$ (generated with parameters $\theta^\star,\phi^\star,\mu_\star$) will determine a posterior $p^\star_t$ for which the optimal action is very difficult to resolve. \cyan{We denote the $t\rarr\infty$ limit of Eq. \eqref{eq:probadv_def_latent} -- which is well-defined due to the ergodicity and asymptotic equilibration of the latent state -- as $\rhoaa_{a^\star,a}:=\lim_{t\rarr\infty}\rhoaa_{a^\star,a}^{(t)}$.}

With these definitions, we state our main result:

\begin{customthm}{2}\label{theorem:regret_llTS}
Under the same conditions as Theorem~\ref{theorem:estimator} (i.e. $\phat_t\rarr p^\star_t$), 
%Assuming that 
%(i) the latent state Markov chain is ergodic and in equilibrium, $z_1\sim\rhoeq(\cdot)$, 
%and when (ii) \cyan{the true parameters $(\theta^\star$, $\phi^\star)$ are known [i.e. $\phat_t\rarr p^\star_t$]}, 
and when reward parameter vectors satisfy $||\mustara||_1<\Umunorm\in\mathbb{R}^+$ for all $a$, 
the expected regret incurred by Algorithm~\ref{alg:llts} (with $\tilde{\sigma}_r=1$)
%%%%\footnote{\cyan{We assume initializations ensure that Algorithm~\ref{alg:llts} asympotically selects the optimal }}
%linear Thompson sampling 
after $T$ timesteps satisfies the upper bound
\be\label{eq:regret_latent_final}
\R(T) \leq \frac{8Z^3}{\pi_{\rm min}^2}\sqrt{\gapadv\gapworst T} + O(T^{2/5}),
\ee
where $\gapworst :=\max_{a^\star,a}\Delta_{a^\star,a}$ is the worst-case reward gap
and
\be\label{eq:Delta_likely_def} %CHECKED
\gapadv = 2Z\lambda_{\rm min}^{-2}
\Big( \stdeveq^2 + \frac{4\Umunorm^2}{\gamma_{\phi^\star}}\big(1+\log\cdklstar\big) \Big)
\sum_{a^\star,a}\frac{\rhoaa_{a^\star,a}}{\Delta_{a^\star,a}},
\ee
with $\rhoaa_{a^\star,a}$ defined above, and other quantities defined in Theorem~\ref{theorem:estimator}.
%where $\Rconst$ is a $T$-independent constant (see Appendix~\ref{app:regret_linTS}).
\end{customthm}

\begin{proof} [Proof (Outline)]
Appendix~\ref{app:regret} has the complete proof, and follows several steps:
(1) We derive a bound (Lemma~\ref{lemma:prob_action}) on the probability of a suboptimal action $a_t\neq a^\star_t$ for linear Thompson sampling, under the assumption of an upper bound on the estimation error $|\hat{\mu}^{(a)}-\mustara|$.
(2) We extend (1) into a high-probability bound on the regret incurred at timestep $t$ (Lemma~\ref{lemma:regret_timestep}), by taking an expectation over linear bandit context vectors. %% $\ctx$. %% [confusing:] in terms of $\rhoaa_{a^\star,a}$.
(3) We sum over timesteps to bound the cumulative regret (Corollary~\ref{corollary:regret_generic_sqrtT}), by decomposing the regret at time $t$ into the ``likely'' regret $\propto\gapadv$ when the per-timestep regret bound holds and a worst-case regret $\gapworst$ when it fails with probability $\delta(t)$. We optimize the time-dependent function $\delta(t)$, which reduces regret by a factor of $\sqrt{\gapadv/\gapworst}$ relative to the worst-case.
(4) We use the specific estimator bound from Theorem~\ref{theorem:estimator} to apply the linear Thompson sampling regret bound, Corollary~\ref{corollary:regret_generic_sqrtT}, to the latent bandit setting, where linear bandit contexts are posterior beliefs, $\ctx_t=p^\star_t$.
%are generated as the ``true'' posterior probability vectors used in Algorithm~\ref{alg:llts}.
\end{proof}

\textit{Structure of Theorem~\ref{theorem:regret_llTS}.} The dependence on $\rhoaa_{a^\star,a}/\Delta_{a^\star,a}$ captures the fact that the ``likely'' regret increases when there is more probability mass for posterior beliefs $p^\star_t$ %context vector $\ctx_t$
for which the optimal action is hard to resolve, 
and that decreasing the reward gap $\Delta_{a^\star,a}$ makes the optimal action still harder to resolve when such posterior beliefs %context vectors
occur.
We discuss dependence on the number of actions $K$, as well as the scaling of $\gapadv$ with the squared estimation error, Eq. \eqref{eq:estimator_bound}, at the end of Appendix~\ref{app:regret_latent}.

\textit{Related bandit literature.} 
Theorem~\ref{theorem:regret_llTS} is a bound for linear Thompson sampling, applied to the case where the context vectors are posterior probability vectors in a latent bandit problem, $\ctx_t=p^\star_t$ (see Lemma~\ref{lemma:reduction}).
%Algorithm~\ref{alg:llts} reduces to linear Thompson sampling with contexts $\ctx_t=p^\star_t$.
While this limits its applicability in the latent bandit setting to the case where $(\theta^\star,\phi^\star)$ are known, the bound is novel in relation to existing regret bounds, in three ways:
%\vspace{-0.25cm}
\setlist[itemize]{leftmargin=4.0mm}
\begin{itemize}
    \item \textit{Problem-dependence.} Eq. \eqref{eq:Delta_likely_def} describes the influence of task parameters $(\theta^\star,\phi^\star,\mu_\star)$ on regret, \gray{via their influence on %the distribution of 
    posterior beliefs $p^\star_t$ (in $\rhoaa_{a^\star,a}$), and resulting reward uncertainty ($\lambda_{\rm min}^{-1}$).}
    %(also reward gaps, although that's clear)
    %(via the dynamical mixing rate, \cyan{distinguishability of latent states}, etc.)
    \item \textit{Heavy-tailed reward distributions.}  Theorem~\ref{theorem:regret_llTS} makes no assumptions (e.g. sub-Gaussianity) about the reward distribution.
    \new{This is because the derivation of
    Theorem~\ref{theorem:estimator} relies mainly on Chebyshev's inequality --
    %(used in Appendix D.2 and D.3)
    which only assumes a finite variance for reward distributions -- to bound the estimator error and covariance.}\footnote{In the case of sub-Gaussian rewards, we expect Theorem~\ref{theorem:estimator} to hold with \new{higher probability than Eq. \eqref{eq:prob_bound_estimator}}, as outlier rewards are exponentially rare. This will improve the $O(\sqrt{T})$ scaling of Theorem~\ref{theorem:regret_llTS}.}
    \item \textit{Non-stationarity.} Our result applies in a non-stationary linear bandit setting where contexts $\ctx_t=p^\star_t$ are correlated across time (via latent state dynamics).\footnote{The limiting case where contexts $\ctx_t$ are i.i.d. can effectively be obtained by making the mixing rate large, $\gamma_{\phi^\star}\rarr\infty$, \gray{in which case intrinsic reward noise dominates, $\R(T)\propto\stdeveq$.} %(This does not reduce to an existing regret bound; the problem-dependent structure is still novel.)
    }
\end{itemize}
%\vspace{-0.25cm}
Our results compare to notable existing works as follows:
%\vspace{-0.25cm}
\begin{itemize}
    \item \textit{Linear Thompson sampling.} The problem-dependent regret bound for linear Thompson sampling \citep{agrawal13a} is $O(\log T)$, but this and most subsequent works assume sub-Gaussian (and i.i.d.) rewards.
    \item \textit{Heavy-tailed reward distributions.} Some works \citep{medina16heavytail,xue2020heavytail} have obtained problem-independent $O(T^{1/2+\eps})$ regret bounds with heavy-tailed rewards. \new{In comparison, our bound captures problem-dependent structure %for Thompson sampling
    in a more general setting with non-stationarity and latent variables.}
    %and their stochastic LB setting is different from our LB setting
    \item \textit{Non-stationary bandits.} Existing bounds \citep{luo18a,Hong2020} depend on the number of change-points; when distribution changes occur at a constant rate (e.g. due to latent state changes), these bounds are $\Omega(\sqrt{T})$ or linear, in contrast to our $O(\sqrt{T})$ bound.
    \item \textit{Latent bandits.} The Thompson sampling regret bounds of \cite{Hong2020} are complementary to ours, in that (i) they are problem-independent and assume sub-Gaussian rewards, but more importantly (ii) they assume an alternative definition of regret, relative to an oracle policy which sees the true latent state. (Note that, in contrast to our $O(\sqrt{T})$ regret, regret relative to such an oracle cannot be sublinear as long as latent state changes occur at a constant rate.)
    % [OR] \footnote{This leads to regret which grows linearly in time, as long as latent state changes occur at a constant rate.}
\end{itemize}
%\vspace{-0.25cm}
%(In comparison, our regret bound is more directly comparable to existing ...)
% [MEH, optional] Lastly, we emphasize that, in the limit \cyan{other TS(UCB) regret bounds should apply, when we know some true params and the bounds don't assume i.i.d.}
\cyan{Extending our regret bound to the case where $(\theta^\star,\phi^\star)$ are learned would be straightforward\footnote{Lemmas~\ref{lemma:regret_timestep} and \ref{lemma:regret_generic} bound the regret of linear Thompson sampling in the case where the agent's contexts $\ctxhat_t=\phat_t$ deviate from the true contexts $\ctx_t=p^\star_t$ which determine expected rewards.}
given a bound on the posterior error $|\phat_t-p^\star_t|$. (We are not aware of such convergence guarantees for online EM applied to HMMs.)}

\section{Experiments}
\label{sec:experiments}

In order to demonstrate the strong performance of our algorithms, we conduct experiments to compare the L$^2$TS and L$^2$UCB algorithms\footnote{\new{Code will be made available at \href{https://github.com/elliotnelson/hmm-bandits}{github.com/elliotnelson/hmm-bandits}.}} with relevant baselines on (i) discrete latent bandit tasks with synthetic data, and (ii) a Gaussian latent  bandit problem for a mining application involving real data.
\new{In all cases, the true initial state distribution $p^\star_0(z)$ differs at random from the model initial state distribution $p_0(z)$ (see Appendix~\ref{app:experiments}).}

\begin{figure}[t!]
    \centering
    \includegraphics[width=7.0cm]{regret_discrete_1.png}
    \includegraphics[width=7.0cm]{regret_discrete_cluster.png}
    \caption{\textbf{Top:} Mean cumulative regret for a synthetic task with discrete categorical variables (Problem 1). Shaded regions show uncertainty in the mean over 10 episodes. \textbf{Bottom:} Results for a synthetic task with clustered contexts (Problem 2).}
    \label{fig:regret_discrete}
\end{figure}

\begin{figure}[t!]
    \centering
    \includegraphics[width=7.0cm]{regret_gaussian_fast.png}
    \includegraphics[width=7.0cm]{regret_gaussian_slow.png}
    \caption{\textbf{Top:} Mean cumulative regret in a Gaussian-variable rover mining task. Shaded regions show uncertainty in the mean over 10 episodes. \textbf{Bottom:} %Results on the same task, 
    As above but with a rarely changing latent state (nearly diagonal $\Phi^\star$).}
    \label{fig:regret_gaussian}
\end{figure}

%\vspace{-0.4cm}

\paragraph{Multinomial Context and Reward Distributions.}
\textit{Problem 1.} In this problem, \cyan{$Z = 2$, $K = 2$, and $x_t\in\{1,\dots,X\}$ with $X=4$}, and with
$
\Phi^\star=
  \left(\begin{smallmatrix}
   0.9 & 0.1 \\
   0.1 & 0.9
  \end{smallmatrix}\right)
$.
We used 5 offline samples $x\sim p(x|z)$ for each $z$ to improve the initial estimate $\hat{\theta}$ at $t=0$ for both L$^2$TS and L$^2$UCB. %We found that convergence was unreliable with a completely uninformed initialization.
\textit{Problem 2.} In this problem, $(Z,X,K)=(4,12,8)$, with Bernoulli reward probabilities sampled uniformly in $(0,1)$, $\phi^\star_{z,z}=0.75$ on-diagonal and uniform off-diagonal, and contexts clustered into groups, each of which is emitted by only a single latent state. (See Appendix~\ref{app:experiment_discrete} for more details on both problems.)

%\vspace{-0.4cm}

\paragraph{Mining Application.}

We consider an application  %mentioned in Section~\ref{sec:intro}: a rover exploring and mining oxide ore.
where a rover explores and mines for oxide ore.
The rover travels over various blocks of land taking x-ray fluorescent meter samples (context $x$), which provide information about the oxide grade, which in turn depends on the presence of one of three latent geological classes (latent state $z$).
Non-stationarity in this application arises from spatial dependence between adjacent blocks of land. We assume the rover chooses between two mining strategies for different minerals (actions $a$), such that there are varying reward probabilities depending on uncertain revenue from the mined ore as well as fixed and variable costs.
%An action is deemed successful if the profit from a block is positive, i.e. estimated revenues outweigh costs.
We provide numerical details about the latent bandit model  parameters in Appendix~\ref{app:experiment_mining}, highlighting in particular how the context distribution $p(x|z)$ is obtained using real-world geological data  \citep{eidsvik_mukerji_bhattacharjya_2015}.
% Cite Eidsvik and Ellefmo paper?

\paragraph{Baselines.}

We compare L$^2$TS and L$^2$UCB with three baselines (see Appendix~\ref{app:experiments} for all parameter settings): 
(1) Uncertain Model Thompson Sampling (umTS): We adapt Algorithm 3 of \cite{Hong2020} 
-- which uses particle filtering to maintain a posterior over reward models, latent states, and latent transition matrices --
to our setting by using oracle knowledge of $p(x_t|z;\theta^\star)$ for additional posterior updates, which we denote in Figures~\ref{fig:regret_discrete} and \ref{fig:regret_gaussian} with the label umTS$^\star$. (In the graphical setting of \cite{Hong2020}, the latent state only influences rewards, and not contexts.)
%\cyan{(Discussion?: Could comment that umTS also explore (TS) to learn the latent state ... more applicable when $z$ changes more slowly ... if $z$ changing quickly you don't have much time to explore before it changes, so less to gain over just purely exploiting)}
%'context'? that's what we modify
(2) Exp4.P \citep{exp4p}: We use expert advisor classifiers trained (with varying latent state distributions) to label contexts $x$ according to corresponding optimal actions, as detailed in Appendix~\ref{app:experiments}, and modify the weight update of Exp4.P to discount the influence of old context data on current weights assigned to experts, and use the true dynamics timescale to set the discount factor.
(3) Discounted Thompson Sampling (dTS) \citep{raj2017taming}: We extend dTS to maintain success ($r=1$) and failure ($r=0$) counts for each discrete context-action pair $(x,a)$, and (like Exp4.P) allow dTS to use the true dynamics timescale to set the discount factor $\gamma$. (We only include dTS in the experiment with discrete context variables.)

%\vspace{-0.1cm}

We also compare to oracle variants of L$^2$TS \new{and L$^2$UCB} which use the true posterior $p^\star_t$ (i.e., condition on the true parameters $\theta^\star,\phi^\star,\mu_\star$) instead of the estimate $\phat_t$. As such, the oracle variants are simply linear Thompson sampling \new{and LinUCB} with uncorrupted or unbiased vectors $\ctx_t=p^\star_t$. (For this reason, the L$^2$TS oracle satisfies the conditions for Theorems~\ref{theorem:estimator} and \ref{theorem:regret_llTS}.)
\new{Lastly, in the rover mining experiment, we also compare to linear Thompson sampling using the raw contexts $x_t$ (instead of posteriors $\phat_t$ or $p^\star_t$).}

%\vspace{-0.3cm}

\paragraph{Results.}

Figures~\ref{fig:regret_discrete} and \ref{fig:regret_gaussian} show the cumulative regret for all algorithms, averaged over 10 episodes, for (respectively) the categorical-variable synthetic tasks and the Gaussian-variable rover mining tasks.
L$^2$TS significantly outperforms baselines. %% with regret \cyan{comment on asymptotic scaling}. %% () due to its ability to efficiently perform 
While umTS models the true latent structure and is given additional prior knowledge of $\theta^\star$, \cyan{it struggles relative to our algorithms except in the low-dimensional task (Problem 1), possibly due to challenges of scaling particle filtering to higher dimensions.}
%we found that it \red{struggled to converge to the true reward parameters.}
% suffers from asymptotically linear regret due to a failure to converge to the true parameters $\mu_\star$.
% \footnote{\gray{This is partly due to the particle filtering method used, which resamples from existing particles. Without the possibility of updating the positions (parameter values) of particles as well as their weights (e.g. \cite{Chen2019}) when resampling, naive particle filtering is limited by the initial positions assigned to particles.}}
% [OR] \footnote{We expect this performance to improve with stronger particle resampling methods %% which use the reward model $p(r|z,a)$ which resample by interpolating between existing particles, or update particle positions as well as weights \citep{Chen2019}.}
%% (INITIAL NOTES:) umTS does poorly, likely b/c of naive particle resampling\footnote{We experimented in a simplified environment with improved resampling methods which interpolate between existing particles to maintain diversity between particles, but still found it challenging to prevent all particles from collapsing to identical values $\theta\neq\theta^\star$ instead of converging asymptotically to $\theta^\star$.} ... we expect it to potentially do better with a more sophisticated method for particle/weight updating, especially when reward data is useful for latent state inference ... but in practice it is computationally intensive and getting the particles to behave requires effort
Exp4.P suffers from asymptotically linear regret due to its inability to model the underlying latent dynamics.\footnote{\gray{While Exp4.P can leverage statistical correlation between contexts and rewards that is modeled by its expert advisors, it cannot learn the temporal structure of this correlation, which is governed by the latent state.}}
%%\footnote{Exp4.P ... implicitly? b/c if expert i predicts well for some (x), it will be given more weight for other (x)
%% and leverage information about the hidden state to  ... 
%% and thus can only model the correlation between rewards and recent contexts 
Discounted TS performs most poorly \new{in Figure~\ref{fig:regret_discrete}} due to its inability to model the latent space or to transfer information gained across different discrete contexts. %%% (LONGER:) Discounted Thompson sampling (dTS) suffers from linear regret because it lacks knowledge of the underlying latent structure, and bases reward predictions for each context-action $(x,a)$ pair only on the past reward data from the same pair. In contrast, the other algorithms are able to generalize across contexts and actions by performing inference over the Markovian hidden state.
\new{The poor performance of linear Thompson sampling relative to L$^2$TS in Figure~\ref{fig:regret_gaussian} shows the benefit of using the (history-dependent) posterior probabilities $p_t$ as contexts for linear reward estimation, instead of the directly observed contexts $x_t$.}
In most cases, the asymptotic performance of L$^2$TS \new{and L$^2$UCB is comparable to that of their respective oracle variants} %\footnote{Note that the L$^2$TS oracle variant is not an upper bound on the performance of L$^2$UCB, which uses a different exploration heuristic.}
(differing mainly in the overhead cost incurred at early times), 
indicating \new{that approximation error in the learned transition probabilities and context distributions is under control. (See Appendix~\ref{app:experiment_mining} for additional results on parameter estimation error.)}
%asymptotic convergence of the learned transition probabilities and context distributions.

%\cyan{While we have not compared directly to change-point detector algorithms for non-stationary settings (e.g. as in \cite{Hong2020}), we expect these approaches to suffer from linear regret (like dTS and Exp4.P). While retroactive detection of a shift in reward distributions can be effective if latent state changes are rare, change-point detection per se cannot learn the underlying transition probabilities, which can be used to predict future state changes and reward probabilities.}

\section{Conclusion \& Discussion}

In this paper, we have developed a novel multi-armed bandit algorithm for environments with a dynamical latent state influencing both observations (contexts) and rewards. Our algorithm uses prior knowledge of latent graphical structure to transform a nonlinear and non-stationary contextual bandit problem into a linear bandit problem, exploiting the linearity between rewards and posterior probabilities over the latent state. %Online EM
While we considered a specific method (Online EM)
to learn the latent transition matrix and context distributions, with specific linear bandit algorithms (LinTS, LinUCB), the high-level approach of treating a posterior belief over latent variables (or over unknown parameters) as context information %using Bayesian inference to reduce to a linear bandit problem 
is %much more 
general; it can be applied with any method for sequential Bayesian inference, and with other sequential decision-making algorithms. 
Our theoretical analysis underscores the influence of the %timescale of 
latent dynamics and %problem-specific
distributional structure of the environment on task difficulty. 
Directions for future work include online learning of the latent space dimensionality, application of HMM learning convergence guarantees \citep{hsu2012spectral} to non-stationary bandit problems, 
and extensions of our methodology to partially observable Markov decision process (POMDP) settings %%in which actions can influence latent state transitions,
or to more complex graphical models.
% or learned Bayes-adaptive policies which act on beliefs about latent variables in order to explore strategically. 
% sub-Gaussian rewards -> stronger regret bound

%\begin{contributions}
%\end{contributions}

\begin{acknowledgements}
We are grateful to Karthikeyan Shanmugam for conversations regarding regret analysis and algorithm development.
\end{acknowledgements}

\bibliography{nelson_697}

\appendix

\end{document}
