% !TEX root = main21neurips-ssp.tex

\section{Theoretical Analysis}
\label{sec: analysis}
In this section, we prove Theorem~\ref{thm1}. The proof of Theorem~\ref{thm2} can be found in the appendix.

A key property of posterior sampling is that conditioned on the information at time $t$, $\theta_*$ and $\theta_t$ have the same distribution if $\theta_t$ is sampled from the posterior distribution at time $t$ \citep{osband2013more}. Since \ssp~samples $\theta_\ell$ at the stopping time $t_\ell$, we use the stopping time version of the posterior sampling property stated as follows.
\begin{lemma}[Adapted from Lemma 2 of \cite{ouyang2017learning}]
\label{lem: property of ps}
Let $t_\ell$ be a stopping time with respect to the filtration $(\calF_t)_{t=1}^\infty$, and $\theta_\ell$ be the sample drawn from the posterior distribution at time $t_\ell$. Then, for any measurable function $f$ and any $\calF_{t_\ell}$-measurable random variable $X$, we have $\E[f(\theta_\ell, X)|\calF_{t_\ell}] = \E[f(\theta_*, X)|\calF_{t_\ell}]$.
\end{lemma}

We now sketch the proof of Theorem~\ref{thm1}. Let $0 < \delta < 1$ be a parameter to be chosen later. We distinguish between \textit{known} and \textit{unknown} state-action pairs. A state-action pair $(s, a)$ is \textit{known} if the number of visits to $(s, a)$ is at least $\alpha \cdot \frac{\B S}{\cmin}\log \frac{\B SA}{\delta \cmin}$ for some large enough constant $\alpha$ (to be determined in Lemma~A.6), and \textit{unknown} otherwise. We divide each epoch into \textit{intervals}. The first interval starts at time $t = 1$. Each interval ends if any of the following conditions hold: (i) the total cost during the interval is at least $\B$; (ii) an unknown state-action pair is met; (iii) the goal state is reached; or (iv) the current epoch completes. The idea of introducing intervals is that after all state-action pairs are known, the cost accumulated during an interval is at least $\B$ (ignoring conditions (iii) and (iv)), which allows us to bound the number of intervals with the total cost divided by $\B$. Introducing intervals and distinguishing between known and unknown state-action pairs is only in the analysis and thus knowledge of $\B$ is not required.

Instead of bounding $R_K$, we bound $R_M$ defined as
\begin{align*}
R_M &:= \E\sbr{\sum_{t=1}^{T_M} c(s_t, a_t) - KV(\sinit;\theta_*)},
\end{align*}
for any number of intervals $M$ as long as $K$ episodes are not completed. Here, $T_M$ is the total time of the first $M$ intervals. Let $C_M$ denote the total cost of the algorithm after $M$ intervals and define $L_M$ as the number of epochs in the first $M$ intervals. Observe that the numbers of times conditions (i), (ii), (iii), and (iv) trigger the start of a new interval are bounded by $C_{M}/{\B}$, $\order(\frac{\B S^2A}{\cmin}\log \frac{\B SA}{\delta \cmin})$, $K$, and $L_M$, respectively. Thus, the number of intervals is bounded as
\begin{align}
\label{eq: bound on m}
M \leq \frac{C_{M}}{\B} + K + L_M + \order(\frac{\B S^2A}{\cmin}\log \frac{\B SA}{\delta \cmin}).
\end{align}
Moreover, since the cost function is lower bounded by $\cmin$, we have $\cmin T_M \leq C_M$. Our argument proceeds as follows.\footnote{Lower order terms are neglected.} We bound $R_M \lesssim \B S\sqrt{MA}$, which implies $\E[C_M] \lesssim K\E[V(\sinit;\theta_*)] + \B S\sqrt{MA}$. By the definition of intervals, once all the state-action pairs are known, the cost accumulated within each interval is at least $\B$ (ignoring intervals that end when the epoch or episode ends). This allows us to bound the number of intervals $M$ with $C_M/\B$ (or $\E[C_M]/\B$). Solving for $\E[C_M]$ in the quadratic inequality $\E[C_M] \lesssim K\E[V(\sinit;\theta_*)] + \B S\sqrt{MA} \lesssim K\E[V(\sinit;\theta_*)] + S\sqrt{\E[C_M]\B A}$ implies that $\E[C_M] \lesssim K\E[V(\sinit;\theta_*)] + \B S\sqrt{AK}$. Since this bound holds for any number of intervals $M$ as long as the $K$ episodes are not completed, it holds for $\E[C_K]$ as well. Moreover, since $\cmin > 0$, this implies that the $K$ episodes eventually terminate and proves the final regret bound.


\textbf{Bounding the Number of Epochs.} Before proceeding with bounding $R_M$, we first prove that the number of epochs is bounded as $\order(\sqrt{KSA\log T_M})$. Recall that the length of the epochs is determined by two stopping criteria. If we ignore the second criterion for a moment, the first stopping criterion ensures that the number of episodes within each epoch grows at a linear rate, which implies that the number of epochs is bounded by $\order(\sqrt{K})$. If we ignore the first stopping criterion for a moment, the second stopping criterion triggers at most $\order(SA\log T_M)$ times. The following lemma shows that the number of epochs remains of the same order even if these two criteria are considered simultaneously.
\begin{lemma}
\label{lem: number of epochs}
The number of epochs is bounded as $L_M \leq  \sqrt{2SAK\log T_M} + SA\log T_M$.
\end{lemma}

We now provide the proof sketch for bounding $R_M$. With abuse of notation let $t_{L_M+1} := T_M+1$ and write
\begin{align*}
&R_M := \E\sbr{\sum_{t=1}^{T_M} c(s_t, a_t) - KV(\sinit;\theta_*)}\\
&= \E\sbr{\sum_{\ell = 1}^{L_M}\sum_{t=t_\ell}^{t_{\ell+1}-1} c(s_t, a_t)} - K\E\sbr{V(\sinit;\theta_*)}.
\end{align*}
Note that within epoch $\ell$, action $a_t$ is taken according to the optimal policy with respect to $\theta_\ell$. Thus, with the Bellman equation we can write
\begin{align*}
c(s_t, a_t) = V(s_t;\theta_\ell) - \sum_{s'}\theta_\ell(s'|s_t, a_t)V(s';\theta_\ell).
\end{align*}
Substituting this, and adding and subtracting $V(s_{t+1};\theta_\ell)$ and $V(s'_t;\theta_\ell)$, decomposes $R_M$ as
\begin{align*}
R_M = R_M^1 + R_M^2 + R_M^3,
\end{align*}
where
\begin{align*}
R_M^1 &:= \E\sbr{\sum_{\ell = 1}^{L_M}\sum_{t=t_\ell}^{t_{\ell+1}-1}\sbr{V(s_t;\theta_\ell) - V(s_{t+1};\theta_\ell)}}, \\
R_M^2 &:= \E\Bigg[\sum_{\ell = 1}^{L_M}\sum_{t=t_\ell}^{t_{\ell+1}-1}\Bigg[V(s_{t+1};\theta_\ell) \\
&\qquad\qquad - V(s'_t;\theta_\ell)\Bigg]\Bigg] - K\E\sbr{V(\sinit;\theta_*)},  \\
R_M^3 &:= \E\Bigg[\sum_{\ell = 1}^{L_M}\sum_{t=t_\ell}^{t_{\ell+1}-1}\Bigg[V(s'_t;\theta_\ell) \\
&\qquad - \sum_{s'}\theta_\ell(s'|s_t, a_t)V(s';\theta_\ell)\Bigg]\Bigg].
\end{align*}
We proceed by bounding these terms separately. The proofs of the following lemmas can be found in the supplementary material. $R_M^1$ is a telescoping sum and can be bounded as stated in the following lemma.
\begin{lemma}
\label{lem: bounding R1}
The first term $R_M^1$ is bounded as $R_M^1 \leq \B \E[L_M]$.
\end{lemma}
To bound $R_M^2$, recall that $s'_t \in \calS^+$ is the next state of the environment after applying action $a_t$ at state $s_t$, and that $s'_t = s_{t+1}$ for all time steps except the last time step of an episode (right before reaching the goal). In the last time step of an episode, $s'_t = g$ while $s_{t+1} = \sinit$. Since the value function is zero at the goal state, this implies that the inner sum of $R_M^2$ can be written as $V(\sinit;\theta_\ell)K_\ell$, where $K_\ell$ is the number of visits to the goal state during epoch $\ell$. Using $K_\ell \leq K_{\ell-1}+1$ and the property of posterior sampling completes the proof. This is formally stated in the following lemma.
\begin{lemma}
\label{lem: r2}
The second term $R_M^2$ is bounded as $R_M^2 \leq \B \E[L_M]$.
\end{lemma}
The rest of the proof proceeds to bound the third term $R_M^3$ which contributes to the dominant term of the final regret bound. The formal statement is given in Lemma~\ref{lem: r3}, and the detailed proof can be found in the appendix. Here we provide the proof sketch. $R_M^3$ captures the difference between $V(\cdot;\theta_\ell)$ at the next state $s_t' \sim \theta_*(\cdot|s_t, a_t)$ and its expectation with respect to the sampled $\theta_\ell$. Applying the Hoeffding-type concentration bounds \citep{weissman2003inequalities}, as used by \cite{ouyang2017learning}, yields a regret bound of $\order(K^{2/3})$, which is sub-optimal. To achieve the optimal dependency on $K$, we use a technique based on the Bernstein concentration bound inspired by the work of \cite{rosenberg2020near}. This requires a more careful analysis. Let $n_{t_\ell}(s, a, s')$ be the number of visits to state-action pair $(s, a)$ followed by state $s'$ before time $t_\ell$. For a fixed state-action pair $(s, a)$, define the Bernstein confidence set using the empirical transition probability $\hatthetal(s'|s, a) := \frac{n_{t_\ell}(s, a, s')}{n_{t_\ell}(s, a)}$ as
\begin{align}
\label{eq: bernstein confidence set}
&B_\ell(s, a) := \Bigg\{\theta(\cdot|s, a) : \abs{\theta(s'|s, a) - \hatthetal(s' | s, a)} \nonumber \leq \\
&4\sqrt{\hatthetal(s'|s, a)\Alsa} + 28\Alsa, \forall s' \in \calS^+\Bigg\}.
\end{align}
Here $\Alsa := \Aldef$ and $n_\ell^+(s, a) := \max \{n_{t_\ell}(s, a), 1\}$. This confidence set is similar to the one used by \citet{rosenberg2020near} and contains the true transition probability $\theta_*(\cdot|s, a)$ with high probability (see Lemma~A.2). Note that $B_\ell(s, a)$ is  $\calF_{t_\ell}$-measurable which allows us to use the property of posterior sampling (Lemma~\ref{lem: property of ps}) to conclude that $B_\ell(s, a)$ contains the sampled transition probability $\theta_\ell(\cdot|s, a)$ as well with high probability. With some algebraic manipulation, $R_M^3$ can be written as (with abuse of notation $\ell := \ell(t)$ is the epoch at time $t$)
\begin{align*}
R_M^3 &= \E\Bigg[\sum_{t=1}^{T_M}\sumsp\sbr{\theta_*(s'|s_t, a_t) - \thetalm(s'|s_t, a_t)}\Bigg(V(s';\thetalm) \\
&\qquad- \sumsdp \theta_*(s'' | s_t, a_t)V(s''; \thetalm)\Bigg)\Bigg].
\end{align*}
Under the event that both $\theta_*(\cdot|s_t, a_t)$ and $\theta_\ell(\cdot|s_t, a_t)$ belong to the confidence set $B_\ell(s_t, a_t)$, the Bernstein bound can be applied to obtain
\begin{align*}
R_M^3 &\approx \order\rbr{\E\sbr{\sum_{t=1}^{T_M}\sqrt{S\Alstat\Vlstat}}} \\
&= \order\rbr{\summ\E\sbr{\sumtm\sqrt{S\Alstat\Vlstat}}},
\end{align*}
where $t_m$ denotes the start time of interval $m$ and $\mathbb{V}_\ell$ is the empirical variance defined as
\begin{align}
\label{eq: empirical variance}
\Vlstat &:= \sumsp \theta_*(s'|s_t, a_t)\Bigg(V(s';\thetalm) \nonumber \\
&- \sumsdp\theta_*(s''|s_t, a_t)V(s''; \thetalm)\Bigg)^2. 
\end{align}
Applying the Cauchy--Schwarz inequality to the inner sum twice implies that
\begin{align*}
R_M^3 \approx \order\Bigg(\summ \Bigg(&\sqrt{S\E\sbr{\sumtm\Alstat }} \\
&\cdot \sqrt{\E\sbr{\sumtm\Vlstat}}\Bigg)\Bigg).
\end{align*}
Using the fact that all the state-action pairs $(s_t, a_t)$ within an interval except possibly the first one are known, and that the cumulative cost within an interval is at most $2\B$, one can show that $\E\sbr{\sumtm\Vlstat} = \order(\B^2)$ (see Lemma~A.5 for details). Applying the Cauchy--Schwarz inequality once more implies
\begin{align*}
R_M^3 &\approx \order\rbr{\B \sqrt{MS \E\sbr{\sum_{t=1}^{T_M}\Alstat}}} \\
&\approx \order\rbr{\B S\sqrt{MA}}.
\end{align*}
This argument is formally presented in the following lemma.
\begin{lemma}
\label{lem: r3}
The third term $R_M^3$ can be bounded as
\begin{align*}
R_M^3 &\leq 288\B S \sqrt{MA \log^2\frac{SA\E[T_M]}{\delta}} \\
&+ 1632 \B S^2A\log^2\frac{SA\E[T_M]}{\delta} + 4S\B \delta \E[L_M].
\end{align*}
\end{lemma}

Detailed proofs of all lemmas and the theorem can be found in the appendix in the supplementary material.