% !TEX root = main21neurips-ssp.tex

\section{The \ssp~Algorithm}
\label{sec: algorithm}
In this section, we propose the Posterior Sampling Reinforcement Learning (\ssp) algorithm (Algorithm~\ref{alg: posterior sampling}) for the SSP model. The input of the algorithm is the prior distribution $\mu_1$. At time $t$, the agent maintains the posterior distribution $\mu_t$ on the unknown parameter $\theta_*$ given by $\mu_t(\Theta) = \mathbb{P}(\theta_* \in \Theta | \calF_t)$ for any set $\Theta \subseteq \Theta_{\B}$. Here $\calF_t$ is the information available at time $t$ (i.e., the sigma algebra generated by $s_1, a_1, \cdots, s_{t-1}, a_{t-1}, s_t$). Upon observing state $s_t'$ by taking action $a_t$ at state $s_t$, the posterior can be updated according to 
\begin{align}
\label{eq: update rule}
\mu_{t+1}(d\theta) = \frac{\theta(s_t'|s_t, a_t)\mu_t(d\theta)}{\int \theta'(s_t'|s_t, a_t)\mu_t(d\theta')}.
\end{align}

The \ssp~algorithm proceeds in epochs $\ell = 1, 2, 3, \cdots$. Let $t_\ell$ denote the start time of epoch $\ell$. In the beginning of epoch $\ell$, parameter $\theta_\ell$ is sampled from the posterior distribution $\mu_{t_\ell}$ and the actions within that epoch are chosen according to the optimal policy with respect to $\theta_\ell$. Each epoch ends if either of the two stopping criteria is satisfied. The first criterion is triggered if the number of visits to the goal state during the current epoch (denoted by $K_\ell$) exceeds that of the previous epoch. This ensures that $K_\ell \leq K_{\ell-1}+1$ for all $\ell$. The second criterion is triggered if the number of visits to any of the state-action pairs is doubled compared to the beginning of the epoch. This guarantees that $n_t(s, a) \leq 2n_{t_\ell}(s, a)$ for all $(s, a)$ where $n_t(s, a) = \sum_{\tau=1}^{t-1} \one_{\{s_\tau=s, a_\tau=a\}}$ denotes the number of visits to state-action pair $(s, a)$ before time $t$. 

The second stopping criterion is similar to that used by \citet{jaksch2010near,rosenberg2020near,agrawal2017optimistic}, and is one of the two stopping criteria in the posterior sampling algorithm (\texttt{TSDE}) for the infinite-horizon average-cost MDPs \citep{ouyang2017learning}. This stopping criterion is crucial since it allows the algorithm to switch policies if the generated policy is improper and cannot reach the goal. Note that updating the policy only at the beginning of an episode (as done in the posterior sampling for finite-horizon MDPs \citep{osband2013more}) does not work for SSP models, because if the generated policy in the beginning of the episode is improper, the goal is never reached and the regret is infinity. 

The first stopping criterion is novel. A similar stopping criterion used in the posterior sampling for infinite-horizon MDPs \citep{ouyang2017learning} is based on the length of the epochs, i.e., a new epoch starts if the length of the current epoch exceeds the length of the previous epoch. This leads to a bound of $\order(\sqrt{T_K})$ on the number of epochs which translates to a final regret bound of $\order(K^{2/3})$ in SSP models. However, our first stopping criterion allows us to bound the number of epochs by $\order(\sqrt K)$ rather than $\order(\sqrt{T_K})$ (see Lemma~\ref{lem: number of epochs}). This is a key step in avoiding dependency on $\cmin^{-1}$ (where $\cmin$ is a lower bound on the cost function) and achieving a final regret bound of $\order(\sqrt{K})$.
\begin{remark}
\ssp~only requires knowledge of the prior distribution $\mu_1$. Unlike in \citet{cohen2021minimax}, knowledge of $\B$ and $\T$ (the latter being an upper bound on the expected time the optimal policy takes to reach the goal) is not needed.
\end{remark}

\begin{remark}
Computing the posterior can be done through conjugate distributions. For a fixed state-action pair $(s, a)$, the likelihood distribution of the next state follows a categorical distribution. Thus, the Dirichlet distribution should be chosen as the conjugate prior.
\end{remark}

\begin{remark}
\ssp~can easily deal with unknown cost functions in the exact same way as \cite{osband2013more} has done with only a constant overhead for the regret. More precisely, one can maintain a posterior distribution on both the cost function and the transition kernel separately (by choosing a normal-gamma distribution for the cost function and Dirichlet distribution for the transition kernel). Then, at the time of sampling, the algorithm samples both the transition kernel and the cost function from the posterior and computes the optimal policy for the sampled SSP. Our known cost function assumption is just for simplicity of explanation and is not a limitation of the algorithm, or its analysis.
\end{remark}

%\begin{algorithm}[h]
%\caption{\textsc{\texttt{SSP-PSRL}}}
%\label{alg: posterior sampling}
%\textbf{Input: } $\mu_1$\\
%\textbf{Initialization: }$t \gets 1, t_0 \gets 0, \ell \gets 0, K_{-1} \gets 0$\\
%\For{ {\normalfont episodes} $k=1, 2, \cdots, K$}{
%	$s_t \gets \sinit$ \\
%	\While{$s_t \neq g$}{
%		\If{$k > k(t_{\ell}) + K_{\ell-1}$ or $n_{t}(s, a) > 2 n_{t_\ell}(s, a)$ for some $(s, a) \in \calS \times \calA$}{
%			$\ell \gets \ell + 1$ \\			
%			$K_{\ell-1} \gets k - k(t_{\ell})$\\
%			$t_\ell \gets t$\\
%			Generate $\theta_\ell \sim \mu_{t_\ell}(\cdot)$ and compute $\pi_\ell(\cdot) = \pi^*(\cdot;\theta_\ell)$ according to \eqref{eq: Bellman equation} \\	
%		}
%		Choose action $a_t = \pi_\ell(s_t)$ and observe $s_{t+1} \sim \theta_*(\cdot | s_t, a_t)$\\
%		Update $\mu_{t+1}$ according to \eqref{eq: update rule}\\
%		$t \gets t+1$	
%	}
%}
%\end{algorithm}

\begin{algorithm}[t]
\caption{\textsc{\ssp}}
\label{alg: posterior sampling}
\textbf{Input: } $\mu_1$\\
\textbf{Initialization: }$t \gets 1, \ell \gets 0, K_{-1} \gets 0, t_0 \gets 0, k_{t_0} \gets 0$\\
\For{ {\normalfont episodes} $k=1, 2, \cdots, K$}{
	$s_t \gets \sinit$ \\
	\While{$s_t \neq g$}{
		\If{$k - k_{t_\ell} > K_{\ell-1}$ or $n_{t}(s, a) > 2 n_{t_\ell}(s, a)$ for some $(s, a) \in \calS \times \calA$}{
			$K_{\ell} \gets k - k_{t_\ell}$\\
			$\ell \gets \ell + 1$ \\
			$t_\ell \gets t$\\			
			$k_{t_\ell} \gets k$\\	
			Generate $\theta_\ell \sim \mu_{t_\ell}(\cdot)$ and compute $\pi_\ell(\cdot) = \pi^*(\cdot;\theta_\ell)$ according to \eqref{eq: Bellman equation} \\	
		}
		Choose action $a_t = \pi_\ell(s_t)$ and observe $s_t' \sim \theta_*(\cdot | s_t, a_t)$ \\
		Update $\mu_{t+1}$ according to \eqref{eq: update rule}\\
		$s_{t+1} \gets s_t'$\\
		$t \gets t+1$ \\
	}
}
\end{algorithm}

\paragraph*{Main Results.} Our first result considers the case where the cost function is strictly positive for all state-action pairs. Subsequently, we extend the result to the general case by adding a small positive perturbation to the cost function and running the algorithm with the perturbed costs. We first make a standard assumption for SSP models:
\begin{assumption}
	\label{ass: cmin}
	There exists $\cmin > 0$, such that $c(s, a) \geq \cmin$ for all state-action pairs $(s, a)$.
\end{assumption}
This assumption allows us to bound the total time spent in $K$ episodes with the total cost, i.e., $\cmin T_K \leq C_K$, where $C_K := \sum_{t=1}^{T_K} c(s_t, a_t)$ is the total cost during the $K$ episodes. To facilitate the presentation of the results, we assume that $S \geq 2$, $A \geq 2$, and $K \geq S^2A$. The first main result is as follows.
\begin{theorem}
	\label{thm1}
	Suppose Assumptions~\ref{ass: class of ssp} and~\ref{ass: cmin} hold. Then, the regret of \ssp~is upper bounded as
	\begin{align*}
		R_K = \order\rbr{\B S \sqrt{KA}L^2 + S^2A \sqrt{\frac{{\B}^3}{\cmin}}L^2},
	\end{align*}
	where $L = \log (\B SAK\cmininv)$.
\end{theorem}
Note that when $K \gg \B S^2A\cmininv$, the regret bound scales as $\otil(\B S \sqrt{KA})$. A crucial point about the above result is that the dependency on $\cmininv$ is only in the lower order term. This allows us to extend the $\order(\sqrt{K})$ bound to the general case where Assumption~\ref{ass: cmin} does not hold by using the perturbation technique of \cite{rosenberg2020near} (see Theorem~\ref{thm2}). We avoid dependency on $\cmininv$ in the main term by use of a Bernstein-type confidence set in the analysis inspired by \cite{rosenberg2020near}. We note that using a Hoeffding-type confidence set in the analysis as in \citet{ouyang2017learning} gives a regret bound of $\order(\sqrt{K/\cmin})$ which results in $\order(K^{2/3})$ regret bound if Assumption~\ref{ass: cmin} is violated.
\begin{theorem}
	\label{thm2}
	Suppose Assumption~\ref{ass: class of ssp} holds and let $\tilde L := \log (K\B\T SA)$. Running \ssp~with costs $c_\epsilon(s, a) := \max \{c(s, a), \epsilon\}$ for $\epsilon = (S^2A/K)^{2/3}$ yields
	\begin{align*}
		R_K &= \order\Big(\B S \sqrt{KA}\tilde{L}^2 + (S^2A)^\frac{2}{3}K^\frac{1}{3}(\B^\frac{3}{2}\tilde{L}^2 + \T) \\
		&\qquad+ S^2A\T^\frac{3}{2}\tilde{L}^2\Big).
	\end{align*}
	 %and $\T$ is an upper bound on the expected time the optimal policy takes to reach the goal from any initial state.
\end{theorem}

Note that when $K \gg S^2A(\B^3 + \T(\T/\B)^6)$, the regret bound scales as $\otil(\B S \sqrt{KA})$. These results have similar regret bounds as the \texttt{Bernstein-SSP} algorithm \citep{rosenberg2020near}, and have a gap of $\sqrt{S}$ with the lower bound of $\Omega(\B\sqrt{SAK})$.