% !TEX root = main21neurips-ssp.tex

\section{Proofs}
\label{app: proofs}


\subsection{Proof of Lemma~\ref{lem: number of epochs}}\label{app:proof:lem: number of epochs}
\textbf{Lemma} (restatement of Lemma~\ref{lem: number of epochs})\textbf{.} The number of epochs is bounded as $L_M \leq  \sqrt{2SAK\log T_M} + SA\log T_M$.
\begin{proof}
Define macro epoch $i$ with start time $t_{u_i}$ given by $t_{u_1} = t_1$, and
\begin{align*}
t_{u_{i+1}} = \min \cbr{t_\ell > t_{u_i} : n_{t_\ell}(s, a) > 2n_{t_{\ell-1}}(s, a) \text{ for some } (s, a)}, \qquad i = 1, 2, \cdots.
\end{align*}
A macro epoch starts when the second criterion of determining epoch length triggers. Let $N_M$ be a random variable denoting the total number of macro epochs by the end of interval $M$ and define $u_{N_M + 1} := L_M + 1$. 

Recall that $K_\ell$ is the number of visits to the goal state in epoch $\ell$. Let $\tilde{K}_i := \sum_{\ell=u_i}^{u_{i+1}-1}K_\ell$ be the number of visits to the goal state in macro epoch $i$. By definition of macro epochs, all the epochs within a macro epoch except the last one are triggered by the first criterion, i.e., $K_\ell = K_{\ell-1}+1$ for $\ell = u_i, \cdots, u_{i+1}-2$. Thus,
\begin{align*}
\tilde{K_i} = \sum_{\ell=u_i}^{u_{i+1}-1}K_\ell = K_{u_{i+1}-1} + \sum_{j=1}^{u_{i+1}-u_i - 1}(K_{u_i-1}+j) \geq \sum_{j=1}^{u_{i+1}-u_i - 1}j = \frac{(u_{i+1}-u_i - 1)(u_{i+1}-u_i)}{2}.
\end{align*}
Solving for $u_{i+1}-u_i$ implies that $u_{i+1}-u_i \leq 1 + \sqrt{2\tilde{K_i}}$. We can write
\begin{align*}
L_M = u_{N_M+1} - 1 = \sum_{i=1}^{N_M}\rbr{u_{i+1}-u_i} &\leq \sum_{i=1}^{N_M}\rbr{1 + \sqrt{2\tilde{K_i}}} = N_M + \sum_{i=1}^{N_M}\sqrt{2\tilde{K_i}} \\
&\leq N_M + \sqrt{2N_M \sum_{i=1}^{N_M}\tilde{K_i}} = N_M + \sqrt{2N_M K},
\end{align*}
where the second inequality follows from Cauchy-Schwarz. It suffices to show that the number of macro epochs is bounded as $N_M \leq SA\log T_M$. Let $\calT_{s, a}$ be the set of all time steps at which the second criterion is triggered for state-action pair $(s, a)$, i.e.,
\begin{align*}
\calT_{s, a} := \cbr{t_\ell \leq T_M : n_{t_\ell}(s, a) > 2 n_{t_{\ell-1}}(s, a)}.
\end{align*}
We claim that $\abr{\calT_{s, a}} \leq \log n_{T_M+1}(s, a)$. To see this, assume by contradiction that $\abr{\calT_{s, a}} \geq 1 + \log n_{T_M+1}(s, a)$, then
\begin{align*}
n_{t_{L_M}}(s, a) &= \prod_{t_\ell \leq T_M, n_{t_{\ell-1}}(s, a) \geq 1}\frac{n_{t_\ell}(s, a)}{n_{t_{\ell-1}}(s, a)} \geq \prod_{t_\ell \in \calT_{s, a}, n_{t_{\ell-1}}(s, a) \geq 1}\frac{n_{t_\ell}(s, a)}{n_{t_{\ell-1}}(s, a)} \\
&> 2^{\abr{\calT_{s, a}} - 1} \geq n_{T_M+1}(s, a),
\end{align*}
which is a contradiction. Thus, $\abr{\calT_{s, a}} \leq \log n_{T_M+1}(s, a)$ for all $(s, a)$. In the above argument, the first inequality is by the fact that $n_t(s, a)$ is non-decreasing in $t$, and the second inequality is by the definition of $\calT_{s, a}$. Now, we can write
\begin{align*}
N_M &= 1 + \sum_{s, a}\abr{\calT_{s, a}} \leq 1 + \sum_{s, a} \log n_{T_M+1}(s, a) \\
&\leq 1 + SA\log \frac{\sum_{s, a}n_{T_M+1}(s, a)}{SA} = 1 + SA \log \frac{T_M}{SA} \leq SA\log T_M,
\end{align*}
where the second inequality follows from Jensen's inequality.
\end{proof}











\subsection{Proof of Lemma~\ref{lem: bounding R1}}\label{app:proof:lem: bounding R1}
\textbf{Lemma} (restatement of Lemma~\ref{lem: bounding R1})\textbf{.} The first term $R_M^1$ is bounded as $R_M^1 \leq \B \E[L_M]$.
\begin{proof}
Recall that
\begin{align*}
R_M^1 = \E\sbr{\sum_{\ell = 1}^{L_M}\sum_{t=t_\ell}^{t_{\ell+1}-1}\sbr{V(s_t;\theta_\ell) - V(s_{t+1};\theta_\ell)}}.
\end{align*}
Observe that the inner sum is a telescopic sum, thus
\begin{align*}
R_M^1 = \E\sbr{\sum_{\ell = 1}^{L_M}\sbr{V(s_{t_\ell};\theta_\ell) - V(s_{t_{\ell+1}};\theta_\ell)}} \leq \B \E[L_M],
\end{align*}
where the inequality is by Assumption~\ref{ass: class of ssp}.
\end{proof}











\subsection{Proof of Lemma~\ref{lem: r2}}\label{sec:proof:lem: r2}
\textbf{Lemma} (restatement of Lemma~\ref{lem: r2})\textbf{.} The second term $R_M^2$ is bounded as $R_M^2 \leq \B \E[L_M]$.
\begin{proof}
Recall that $K_\ell$ is the number of times the goal state is reached during epoch $\ell$. By definition, the only time steps at which $s'_t \neq s_{t+1}$ are those right before reaching the goal. Thus, with $V(g;\theta_\ell) = 0$, we can write
\begin{align*}
R_M^2 &= \E\sbr{\sum_{\ell = 1}^{L_M}\sum_{t=t_\ell}^{t_{\ell+1}-1}\sbr{V(s_{t+1};\theta_\ell) - V(s'_t;\theta_\ell)}} - K\E\sbr{V(\sinit;\theta_*)} \\
&= \E\sbr{\sum_{\ell = 1}^{L_M}V(\sinit;\theta_\ell)K_\ell} - K\E\sbr{V(\sinit;\theta_*)} \\
&= \sum_{\ell = 1}^{\infty}\E\sbr{\one_{\{m(t_\ell) \leq M\}}V(\sinit;\theta_\ell)K_\ell} - K\E\sbr{V(\sinit;\theta_*)},
\end{align*}
where the last step is by Monotone Convergence Theorem. Here $m(t_\ell)$ is the interval at time $t_\ell$. Note that from the first stopping criterion of the algorithm we have $K_\ell \leq K_{\ell-1}+1$ for all $\ell$. Thus, each term in the summation can be bounded as
\begin{align*}
\E\sbr{\one_{\{m(t_\ell) \leq M\}}V(\sinit;\theta_\ell)K_\ell} \leq \E\sbr{\one_{\{m(t_\ell) \leq M\}}V(\sinit;\theta_\ell)(K_{\ell-1} + 1)}.
\end{align*}
$\one_{\{m(t_\ell) \leq M\}}(K_{\ell-1} + 1)$ is $\calF_{t_\ell}$ measurable. Therefore, applying the property of posterior sampling (Lemma~\ref{lem: property of ps}) implies
\begin{align*}
\E\sbr{\one_{\{m(t_\ell) \leq M\}}V(\sinit;\theta_\ell)(K_{\ell-1} + 1)} = \E\sbr{\one_{\{m(t_\ell) \leq M\}}V(\sinit;\theta_*)(K_{\ell-1} + 1)}.
\end{align*}
Substituting this into $R_M^2$, we obtain
\begin{align*}
R_M^2 &\leq \sum_{\ell = 1}^{\infty}\E\sbr{\one_{\{m(t_\ell) \leq M\}}V(\sinit;\theta_*)(K_{\ell-1}+1)} - K\E\sbr{V(\sinit;\theta_*)} \\
&= \E\sbr{\sum_{\ell = 1}^{L_M}V(\sinit;\theta_*)(K_{\ell-1}+1)} - K\E\sbr{V(\sinit;\theta_*)} \\
&= \E\sbr{V(\sinit;\theta_*)\rbr{\sum_{\ell = 1}^{L_M}K_{\ell-1} - K}} + \E\sbr{V(\sinit;\theta_*)L_M} \leq \B\E[L_M].
\end{align*}
In the last inequality we have used the fact that $0 \leq V(\sinit;\theta_*) \leq \B$ and $\sum_{\ell = 1}^{L_M}K_{\ell-1} \leq K$.
\end{proof}














\subsection{Proof of Lemma~\ref{lem: r3}}\label{app:proof:lem: r3}
\textbf{Lemma} (restatement of Lemma~\ref{lem: r3})\textbf{.} The third term $R_M^3$ can be bounded as
\begin{align*}
R_M^3 \leq 288\B S \sqrt{MA \log^2\frac{SA\E[T_M]}{\delta}} + 1632 \B S^2A\log^2\frac{SA\E[T_M]}{\delta} + 4S\B \delta \E[L_M].
\end{align*}
\begin{proof}
With abuse of notation let $\ell := \ell(t)$ denote the epoch at time $t$ and $m(t)$ be the interval at time $t$. We can write
\begin{align*}
&R_M^3 = \E\sbr{\sum_{t=1}^{T_M}\sbr{V(s'_t;\theta_\ell) - \sum_{s'}\thetalm(s'|s_t, a_t)V(s';\thetalm)}} \\
&= \E\sbr{\sum_{t=1}^{\infty}\one_{\{m(t) \leq M\}}\sbr{V(s'_t;\theta_\ell) - \sum_{s'}\thetalm(s'|s_t, a_t)V(s';\thetalm)}} \\
&= \sum_{t=1}^{\infty}\E\sbr{\one_{\{m(t) \leq M\}}\E\sbr{V(s'_t;\theta_\ell) - \sum_{s'}\thetalm(s'|s_t, a_t)V(s';\thetalm)\Big| \calF_t, \theta_*, \theta_\ell}}.
\end{align*}
The last equality follows from Dominated Convergence Theorem, tower property of conditional expectation, and that $\one_{\{m(t) \leq M\}}$ is measurable with respect to $\calF_t$. Note that conditioned on $\calF_t$, $\theta_*$ and $\theta_\ell$, the only random variable in the inner expectation is $s'_t$. Thus, $\E[V(s'_t;\theta_\ell) | \calF_t, \theta_*, \theta_\ell] = \sum_{s'}\theta_*(s'|s_t, a_t)V(s';\theta_\ell)$. Using Dominated Convergence Theorem again implies that
\begin{align}
&R_M^3 = \E\sbr{\sum_{t=1}^{T_M}\sumsp\sbr{\theta_*(s'|s_t, a_t) - \thetalm(s'|s_t, a_t)}V(s';\thetalm)} \nonumber \\
&= \E\sbr{\sum_{t=1}^{T_M}\sumsp\sbr{\theta_*(s'|s_t, a_t) - \thetalm(s'|s_t, a_t)}\rbr{V(s';\thetalm) - \sumsdp \theta_*(s'' | s_t, a_t)V(s''; \thetalm)}}, \label{eq: pf lem rr3 tmp1}
\end{align}
where the last equality is due to the fact that $\theta_*(\cdot|s_t, a_t)$ and $\thetalm(\cdot|s_t, a_t)$ are probability distributions and that $\sumsdp \theta_*(s'' | s_t, a_t)V(s''; \thetalm)$ is independent of $s'$.

Recall the Bernstein confidence set $B_\ell(s, a)$ defined in \eqref{eq: bernstein confidence set} and let $\Omega^\ell_{s, a}$ be the event that both $\theta_*(\cdot|s, a)$ and $\theta_\ell(\cdot|s, a)$ are in $B_\ell(s, a)$. If $\Omega^\ell_{s, a}$ holds, then the difference between $\theta_*(\cdot|s, a)$ and $\theta_\ell(\cdot|s, a)$ can be bounded by the following lemma.
\begin{lemma}
Denote $\Alsa = \Aldef$. If $\Omega^\ell_{s, a}$ holds, then 
\begin{align*}
\abr{\theta_*(s'|s, a) - \theta_\ell(s'|s, a)} \leq 8\sqrt{\theta_*(s'|s, a)\Alsa} + 136\Alsa.
\end{align*}
\begin{proof}
Since $\Omega^\ell_{s, a}$ holds, by \eqref{eq: bernstein confidence set} we have that
\begin{align*}
\hatthetal(s' | s, a) - \theta_*(s'|s, a) \leq 4\sqrt{\hatthetal(s'|s, a)\Alsa} + 28\Alsa.
\end{align*}
Using the elementary inequality that $x^2 \leq ax + b$ implies $x \leq a + \sqrt b$ with $x = \sqrt{\hatthetal(s' | s, a)}$, $a = 4\sqrt{\Alsa}$, and $b = \theta_*(s'|s, a) + 28\Alsa$, we obtain
\begin{align*}
\sqrt{\hatthetal(s' | s, a)} \leq 4\sqrt{\Alsa} + \sqrt{\theta_*(s'|s, a) + 28\Alsa} \leq \sqrt{\theta_*(s'|s, a)} + 10\sqrt{\Alsa},
\end{align*}
where the last inequality is by subadditivity of the square root. Substituting this bound into~\eqref{eq: bernstein confidence set} yields
\begin{align*}
\abs{\theta_*(s'|s, a) - \hatthetal(s' | s, a)} \leq 4\sqrt{\theta_*(s'|s, a)\Alsa} + 68\Alsa.
\end{align*}
Similarly,
\begin{align*}
\abs{\theta_\ell(s'|s, a) - \hatthetal(s' | s, a)} \leq 4\sqrt{\theta_*(s'|s, a)\Alsa} + 68\Alsa.
\end{align*}
Using the triangle inequality completes the proof.
\end{proof}
\label{lem: theta_star minus theta_l}
\end{lemma}
Note that if either of $\theta_*(\cdot|s_t, a_t)$ or $\thetalm(\cdot|s_t, a_t)$ is not in $B_\ell(s_t, a_t)$, then the inner term of \eqref{eq: pf lem rr3 tmp1} can be bounded by $2S\B$ (note that $|\calS^+| \leq 2S$ and $V(\cdot;\thetalm) \leq \B$). Thus, applying Lemma~\ref{lem: theta_star minus theta_l} implies that
\begin{align*}
&\sumsp\sbr{\theta_*(s'|s_t, a_t) - \thetalm(s'|s_t, a_t)}\rbr{V(s';\thetalm) - \sumsdp \theta_*(s'' | s_t, a_t)V(s''; \thetalm)} \\
&\leq 8 \sumsp \sqrt{\Alstat\theta_*(s'|s_t, a_t)\rbr{V(s';\thetalm) - \sumsdp\theta_*(s''|s_t, a_t)V(s''; \thetalm)}^2}\one_{\Omega^\ell_{s_t, a_t}}\\
&\qquad + 136 \sumsp \Alstat \abr{V(s';\thetalm) - \sumsdp\theta_*(s''|s_t, a_t)V(s''; \thetalm)}\one_{\Omega^\ell_{s_t, a_t}} \\
&\qquad + 2S\B\rbr{\one_{\{\theta_*(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}} + \one_{\{\thetalm(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}}} \\
&\leq 16\sqrt{S\Alstat\Vlstat}\one_{\Omega^\ell_{s_t, a_t}} + 272S\B \Alstat\one_{\Omega^\ell_{s_t, a_t}} \\
&\qquad+ 2S\B\big(\one_{\{\theta_*(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}} + \one_{\{\thetalm(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}}\big).
\end{align*}
where $\Alsa = \Aldef$ and $\mathbb{V}_\ell(s, a)$ is defined in \eqref{eq: empirical variance}. Here the last inequality follows from Cauchy-Schwarz, $|\calS^+| \leq 2S$, $V(\cdot;\thetalm) \leq \B$ and the definition of $\mathbb{V}_\ell$. Substituting this into \eqref{eq: pf lem rr3 tmp1} yields
\begin{align}
R_M^3 &\leq 16\sqrt S\E\sbr{\sum_{t=1}^{T_M}\sqrt{\Alstat\Vlstat}\oneomega} \label{eq: pf lem R3 tmp1}\\
&\qquad + 272S\B \E\sbr{\sum_{t=1}^{T_M}\Alstat\oneomega}  \label{eq: pf lem R3 tmp2}\\
&\qquad + 2S\B\E\sbr{\sum_{t=1}^{T_M} \rbr{\one_{\{\theta_*(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}} + \one_{\{\thetalm(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}}}}. \label{eq: pf lem R3 tmp3}
\end{align}
The inner sum in \eqref{eq: pf lem R3 tmp2} is bounded by $6SA \log^2 (SAT_M/\delta)$ (see Lemma~\ref{lem: sumt of Alstat}). To bound \eqref{eq: pf lem R3 tmp3}, we first show that $B_\ell(s, a)$ contains the true transition probability $\theta_*(\cdot|s, a)$ with high probability: 
\begin{lemma}
For any epoch $\ell$ and any state-action pair $(s, a) \in \calS \times \calA$, $\theta_*(\cdot|s, a) \in B_\ell(s, a)$ with probability at least $1 - \frac{\delta}{2SAn^+_\ell(s, a)}$.
\label{lem: high prob bernstein}
\end{lemma}
\begin{proof}
Fix $(s, a, s') \in \calS\times\calA\times\calS^+$ and $0 < \delta' < 1$ (to be chosen later). Let $(Z_i)_{i=1}^\infty$ be a sequence of random variables drawn from the probability distribution $\theta_*(\cdot|s, a)$. Apply Lemma~\ref{lem: D.3 of cohen} below with $X_i = \one_{\{Z_i = s'\}}$ and $\delta_t = \frac{\delta'}{4St^2}$ to a prefix of length $t$ of the sequence $(X_i)_{i=1}^\infty$, and apply union bound over all $t$ and $s'$ to obtain
\begin{align*}
\abr{\hat\theta_\ell(s'|s, a) - \theta_*(s'|s, a)} \leq 2\sqrt{\frac{\hat\theta_\ell(s'|s, a)\log\frac{8S{n^+_\ell}^2(s, a)}{\delta'}}{n^+_\ell(s, a)}} + \frac{7\log \frac{8S{n^+_\ell}^2(s, a)}{\delta'}}{n^+_\ell(s, a)}
\end{align*}
with probability at least $1 - \delta'/2$ for all $s' \in \calS^+$ and $\ell \geq 1$, simultaneously. Choose $\delta'=\delta/(SAn^+_{\ell}(s, a))$ and use $S \geq 2$, $A \geq 2$ to complete the proof.
\end{proof}
\begin{lemma}[Theorem D.3 (Anytime Bernstein) of \cite{rosenberg2020near}]
Let $(X_n)_{n=1}^\infty$ be a sequence of independent and identically distributed random variables with expectation $\mu$. Suppose that $0 \leq X_n \leq B$ almost surely. Then with probability at least $1 - \delta$, the following holds for all $n \geq 1$ simultaneously:
\begin{align*}
\abr{\sum_{i=1}^n(X_i - \mu)} \leq 2\sqrt{B\sum_{i=1}^nX_i \log\frac{2n}{\delta}} + 7B\log\frac{2n}{\delta}.
\end{align*}
\label{lem: D.3 of cohen}
\end{lemma}

Now, by rewriting the sum in \eqref{eq: pf lem R3 tmp3} over epochs, we have
\begin{align*}
&\E\sbr{\sum_{t=1}^{T_M} \rbr{\one_{\{\theta_*(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}} + \one_{\{\thetalm(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}}}} \\
&= \E\sbr{\suml \sumtl \rbr{\one_{\{\theta_*(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}} + \one_{\{\thetalm(\cdot|s_t, a_t) \notin B_\ell(s_t, a_t)\}}}} \\
&= \sum_{s, a}\E\sbr{\suml \sumtl \one_{\{s_t=s, a_t=a\}}\rbr{\one_{\{\theta_*(\cdot|s, a) \notin B_\ell(s, a)\}} + \one_{\{\thetalm(\cdot|s, a) \notin B_\ell(s, a)\}}}} \\
&= \sum_{s, a}\E\sbr{\suml \rbr{n_{t_{\ell+1}}(s, a) - n_{t_\ell}(s, a)}\rbr{\one_{\{\theta_*(\cdot|s, a) \notin B_\ell(s, a)\}} + \one_{\{\thetalm(\cdot|s, a) \notin B_\ell(s, a)\}}}}.
\end{align*}
Note that $n_{t_{\ell+1}}(s, a) - n_{t_\ell}(s, a) \leq n_{t_\ell}(s, a)+1$ by the second stopping criterion. Moreover, observe that $B_\ell(s, a)$ is $\calF_{t_\ell}$ measurable. Thus, it follows from the property of posterior sampling (Lemma~\ref{lem: property of ps}) that $\E[\one_{\{\thetalm(\cdot|s, a) \notin B_\ell(s, a)\}}|\calF_{t_\ell}] = \E[\one_{\{\theta_*(\cdot|s, a) \notin B_\ell(s, a)\}}|\calF_{t_\ell}] = \mathbb{P}(\theta_*(\cdot|s, a) \notin B_\ell(s, a)|\calF_{t_\ell}) \leq \delta/(2SAn_\ell^+(s, a))$, where the inequality is by Lemma~\ref{lem: high prob bernstein}. Using Monotone Convergence Theorem and that $\one_{\{m(t_\ell) \leq M\}}$ is $\calF_{t_\ell}$ measurable, we can write
\begin{align*}
&\sum_{s, a}\E\sbr{\suml \rbr{n_{t_{\ell+1}}(s, a) - n_{t_\ell}(s, a)}\rbr{\one_{\{\theta_*(\cdot|s, a) \notin B_\ell(s, a)\}} + \one_{\{\thetalm(\cdot|s, a) \notin B_\ell(s, a)\}}}} \\
&\leq \sum_{s, a}\sum_{\ell=1}^\infty\E\sbr{ \one_{\{m(t_\ell) \leq M\}}  \rbr{n_{t_\ell}(s, a)+1}\E\sbr{\one_{\{\theta_*(\cdot|s, a) \notin B_\ell(s, a)\}} + \one_{\{\thetalm(\cdot|s, a) \notin B_\ell(s, a)\}}| \calF_{t_\ell}}} \\
&\leq \sum_{s, a}\sum_{\ell=1}^\infty\E\sbr{ \one_{\{m(t_\ell) \leq M\}}  \rbr{n_{t_\ell}(s, a)+1}\frac{\delta}{SAn_\ell^+(s, a)}} \\
&\leq 2\delta \E[L_M],
\end{align*}
where the last inequality is by $n_{t_\ell}(s, a)+1 \leq 2n_\ell^+(s, a)$ and Monotone Convergence Theorem.
%This completes the proof of the lemma.
%We proceed by bounding \eqref{eq: pf lem R3 tmp1}. Denote by $k(m)$ the episode associated with interval $m$. By monotone convergence theorem and that $\onekm$ is measurable with respect to $\calF_{t_m}$, we have
%\begin{align*}
%&\E\sbr{\summ\sumtm\sqrt{\Alstat\Vlstat}\oneomega} \\
%&= \sum_{m=1}^\infty\E\sbr{\onekm\E\sbr{\sumtm\sqrt{\Alstat\Vlstat}\oneomega \Big| \calF_{t_m}}}.
%\end{align*}

We proceed by bounding \eqref{eq: pf lem R3 tmp1}. Denote by $t_m$ the start time of interval $m$, define $t_{M+1} := T_M+1$, and rewrite the sum in \eqref{eq: pf lem R3 tmp1} over intervals to get
\begin{align*}
\E\sbr{\sum_{t=1}^{T_M}\sqrt{\Alstat\Vlstat}\oneomega} &= \summ\E\sbr{\sumtm\sqrt{\Alstat\Vlstat}\oneomega}.
\end{align*}
Applying Cauchy-Schwarz twice on the inner expectation implies
\begin{align*}
&\E\sbr{\sumtm\sqrt{\Alstat\Vlstat}\oneomega } \\
&\leq \E\sbr{\sqrt{\sumtm\Alstat}\cdot \sqrt{\sumtm\Vlstat\oneomega} } \\
&\leq \sqrt{\E\sbr{\sumtm\Alstat }} \cdot \sqrt{\E\sbr{\sumtm\Vlstat\oneomega }} \\
&\leq 7\B \sqrt{\E\sbr{\sumtm\Alstat }},
\end{align*}
where the last inequality is by Lemma~\ref{lem: sum of variance}. Summing over $M$ intervals and applying Cauchy-Schwarz, we get
\begin{align*}
\sum_{m=1}^{M}\E&\sbr{\sumtm\sqrt{\Alstat\Vlstat}\oneomega } \leq 7\B \sum_{m=1}^{M} \sqrt{\E\sbr{\sumtm\Alstat }} \\
&\leq 7\B \sqrt{M  \summ\E\sbr{\sumtm\Alstat }} \\
&= 7\B \sqrt{M  \E\sbr{\sum_{t=1}^{T_M}\Alstat }} \\
&\leq 18 \B \sqrt{MSA  \E\sbr{\log^2\frac{SAT_M}{\delta}}},
\end{align*}
where the last inequality follows from Lemma~\ref{lem: sumt of Alstat}.
Substituting these bounds into \eqref{eq: pf lem R3 tmp1}, \eqref{eq: pf lem R3 tmp2}, and \eqref{eq: pf lem R3 tmp3}, and applying Jensen's inequality together with the concavity of $\log^2 x$ for $x \geq 3$, completes the proof.




\begin{lemma}
$\sum_{t=1}^{T_M} \Alstat \leq 6SA \log^2 (SAT_M/\delta).$
%2. For any interval $m$, $\sumtm \Alstat \leq 5\log (SAt_\ell/\delta)$.
\label{lem: sumt of Alstat}
\end{lemma}
\begin{proof}
Recall $\Alsa = \Aldef$. Let $L := \log(SAT_M/\delta)$, which is an upper bound on the numerator of $\Alstat$. We have
\begin{align*}
\sum_{t=1}^{T_M}\Alstat &\leq \sum_{t=1}^{T_M}\frac{L}{n_\ell^+(s_t, a_t)} = L\sum_{s, a}\sum_{t=1}^{T_M}\frac{\one_{\{s_t=s, a_t=a\}}}{n_\ell^+(s, a)} \\
&\leq 2L\sum_{s, a}\sum_{t=1}^{T_M}\frac{\one_{\{s_t=s, a_t=a\}}}{n_t^+(s, a)} = 2L\sum_{s, a}\one_{\{n_{T_M+1}(s, a) > 0\}} + 2L\sum_{s, a} \sum_{j=1}^{n_{T_M+1}(s, a)-1}\frac{1}{j} \\
&\leq 2LSA + 2L\sum_{s, a}(1 + \log n_{T_M+1}(s, a)) \\
&\leq 4LSA + 2LSA \log T_M \leq 6LSA \log T_M.
\end{align*}
Here the second inequality is by $n_\ell^+(s, a) \geq 0.5 n_t^+(s, a)$ (the second criterion in determining the epoch length), the third inequality is by $\sum_{x=1}^n1/x \leq 1 + \log n$, and the fourth inequality is by $n_{T_M+1}(s, a) \leq T_M$. The proof is complete by noting that $\log T_M \leq L$.
%
%2. By the definition of interval, all the state-action pairs in an interval except possibly the first one $(s_{t_m}, a_{t_m})$ are known. Since for a known state-action pair $(s, a)$ we have $n_\ell^+(s, a) \geq \alpha \cdot \frac{\B S}{\cmin} \log \frac{\B SAT_M}{\delta\cmin}$ for some large enough $\alpha$, and that $\log (x)/x$ is decreasing, we can write
%\begin{align*}
%\sumtm \Alstat &= \frac{\log(S^2A^2t_\ell^3n_\ell^+(s_{t_m}, a_{t_m})/\delta)}{n_\ell^+(s_{t_m}, a_{t_m})} + \sum_{t=t_m+1}^{t_{m+1}-1} \frac{\log(S^2A^2t_\ell^3n_\ell^+(s_t, a_t)/\delta)}{n_\ell^+(s_t, a_t)} \\
%&\leq \log (S^2A^2t_\ell^3/\delta) + \frac{\cmin(t_{m+1}-t_m)}{\B} \\
%&\leq 3\log (SAt_\ell/\delta) + 2 \\
%&\leq 5\log (SAt_\ell/\delta),
%\end{align*}
%where the second inequality is by $t_{m+1}-t_m \leq 2\B/\cmin$.
\end{proof}


%\begin{lemma}[Lemma D.4. of \citet{rosenberg2020near}]
%\label{lem: d4 of cohen}
%Let $(X_i)_{i=1}^\infty$ be a sequence of random variables adapted to the filtration $(\calF_i)_{i=0}^\infty$. Suppose that $0 \leq X_i \leq B$ almost surely. Then, with probability at least $1 - \delta$, the following holds for all $n \geq 1$ simultaneously:
%\begin{align*}
%\sum_{i=1}^n \E\sbr{X_i | \calF_{i-1}} \leq 2\sum_{i=1}^nX_i + 4 B\log \frac{2n}{\delta}.
%\end{align*}
%\end{lemma}









\begin{lemma}
For any interval $m$, $\E[\sumtm \Vlstat \one_{\Omega^\ell}] \leq 44\B^2$.
\label{lem: sum of variance}
\end{lemma}
\begin{proof}
To proceed with the proof, we need the following two technical lemmas.
\begin{lemma}
Let $(s, a)$ be a known state-action pair and $m$ be an interval. If $\Omega^\ell_{s, a}$ holds, then for any state $s' \in \calS^+$,
\begin{align*}
\abs{\theta_*(s'|s, a) - \theta_\ell(s'|s, a)} \leq \frac{1}{8} \sqrt{\frac{\cmin \theta_*(s'|s, a)}{S\B}} + \frac{\cmin}{4S\B}.
\end{align*}
\label{lem: known state-action}
\end{lemma}
\begin{proof}
From Lemma~\ref{lem: theta_star minus theta_l}, we know that if $\Omega^\ell_{s, a}$ holds, then
\begin{align*}
\abr{\theta_*(s'|s, a) - \theta_\ell(s'|s, a)} \leq 8\sqrt{\theta_*(s'|s, a)\Alsa} + 136\Alsa,
\end{align*}
with $\Alsa = \Aldef$. The proof is complete by noting that $\log (x)/x$ is decreasing, and that $n_\ell^+(s, a) \geq \alpha \cdot \frac{\B S}{\cmin} \log \frac{\B SA}{\delta\cmin}$ for some large enough constant $\alpha$ since $(s, a)$ is known.
\end{proof}
\begin{lemma}[Lemma B.15. of \citet{rosenberg2020near}]
Let $(X_t)_{t=1}^\infty$ be a martingale difference sequence adapted to the filtration $(\calF_t)_{t=0}^\infty$. Let $Y_n = (\sum_{t=1}^nX_t)^2 - \sum_{t=1}^n\E[X_t^2|\calF_{t-1}]$. Then $(Y_n)_{n=0}^\infty$ is a martingale, and in particular if $\tau$ is a stopping time such that $\tau \leq c$ almost surely, then $\E[Y_\tau] = 0$.
\label{lem: b15 of cohen}
\end{lemma}
By the definition of the intervals, all the state-action pairs within an interval except possibly the first one are known. Therefore, we split off the first time step of the interval and write
\begin{align*}
\E\sbr{\sumtm \Vlstat \oneomega \Big| \calF_{t_m}} = \E\sbr{\mathbb{V}_\ell(s_{t_m}, a_{t_m})\oneomega | \calF_{t_m}} + \E\sbr{\sumtmplus \Vlstat \oneomega \Big| \calF_{t_m}}.
\end{align*}
The first summand is upper bounded by $\B^2$. To bound the second term, define $Z_\ell^t := [V(s'_{t};\thetalm) - \sum_{s' \in \calS}\theta_*(s'|s_t, a_t)V(s';\thetalm)]\oneomega$. Conditioned on $\calF_{t_m}, \theta_*$ and $\thetalm$, $(Z_\ell^t)_{t \geq t_m}$ constitutes a martingale difference sequence with respect to the filtration $(\calF^m_{t+1})_{t \geq t_m}$, where $\calF^m_t$ is the sigma algebra generated by $\{(s_{t_m}, a_{t_m}), \cdots, (s_t, a_t)\}$. Moreover, $t_{m+1}-1$ is a stopping time with respect to $(\calF^m_{t+1})_{t \geq t_m}$ and is bounded by $t_m + 2\B/\cmin$. Therefore, Lemma~\ref{lem: b15 of cohen} implies that
\begin{align}
\label{eq: pf lem bounded v tmp0}
\E\sbr{\sumtmplus \Vlstat \oneomega \Big| \calF_{t_m}, \theta_*, \thetalm} = \E\sbr{\rbr{\sumtmplus Z_\ell^t \oneomega}^2 \Big| \calF_{t_m}, \theta_*, \thetalm}.
\end{align}
We proceed by bounding $\abs{\sumtmplus Z_\ell^t \oneomega}$ in terms of $\sumtmplus \Vlstat \oneomega$ and combine with the left hand side to complete the proof. We have
\begin{align}
&\abr{\sumtmplus Z_\ell^t \oneomega} = \abr{\sumtmplus \sbr{V(s'_{t};\thetalm) - \sum_{s' \in \calS}\theta_*(s'|s_t, a_t)V(s';\thetalm)}\oneomega} \nonumber \\
&\leq \abr{\sumtmplus \sbr{V(s'_{t};\thetalm) - V(s_{t};\thetalm)}} \label{eq: pf lem bounded v tmp1}\\
&+ \abr{\sumtmplus \sbr{V(s_{t};\thetalm) - \sum_{s' \in \calS}\thetalm(s'|s_t, a_t)V(s';\thetalm)}} \label{eq: pf lem bounded v tmp2}\\
& + \abr{\sumtmplus \sumsp \sbr{\thetalm(s'|s_t, a_t) - \theta_*(s'|s_t, a_t)}\rbr{V(s';\thetalm) - \sumsdp \theta_*(s''|s_t, a_t)V(s'';\thetalm)}\oneomega}. \label{eq: pf lem bounded v tmp3}
\end{align}
where \eqref{eq: pf lem bounded v tmp3} is by the fact that $\thetalm(\cdot|s_t, a_t), \theta_*(\cdot|s_t, a_t)$ are probability distributions and $\sumsdp \theta_*(s''|s_t, a_t)V(s'';\thetalm)$ is independent of $s'$ and $V(g;\thetalm) = 0$. \eqref{eq: pf lem bounded v tmp1} is a telescopic sum (recall that $s_{t+1} = s'_t$ if $s'_t \neq g$) and is bounded by $\B$. It follows from the Bellman equation that \eqref{eq: pf lem bounded v tmp2} is equal to $\sumtmplus c(s_t, a_t)$. By definition, the interval ends as soon as the cost accumulates to $\B$ during the interval. Moreover, since $V(\cdot; \thetalm) \leq \B$, the algorithm does not choose an action with instantaneous cost more than $\B$. This implies that $\sumtmplus c(s_t, a_t) \leq 2\B$. To bound \eqref{eq: pf lem bounded v tmp3} we use the Bernstein confidence set, but taking into account that all the state-action pairs in the summation are known, we can use Lemma~\ref{lem: known state-action} to obtain
\begin{align*}
& \sumsp \rbr{\thetalm(s'|s_t, a_t) - \theta_*(s'|s_t, a_t)}\rbr{V(s';\thetalm) - \sumsdp \theta_*(s''|s_t, a_t)V(s'';\thetalm)}\oneomega \\
&\leq  \sumsp \frac{1}{8} \sqrt{\frac{\cmin \theta_*(s'|s_t, a_t)\rbr{V(s';\thetalm) - \sumsdp \theta_*(s''|s_t, a_t)V(s'';\thetalm)}^2\oneomega}{S\B}} \\
&\qquad + \sumsp \frac{\cmin}{4S\B}\abr{V(s';\thetalm) - \sumsdp \theta_*(s''|s_t, a_t)V(s'';\thetalm)} \\
&\leq \frac{1}{4}\sqrt{\frac{\cmin \Vlstat \oneomega}{\B}} + \frac{c(s_t, a_t)}{2}.
\end{align*}
The last inequality follows from Cauchy-Schwarz inequality, $|\calS^+| \leq 2S$, $|V(\cdot;\thetalm)| \leq \B$, and $\cmin \leq c(s_t, a_t)$. Summing over the time steps in interval $m$ and applying Cauchy-Schwarz, we get
\begin{align*}
\sumtmplus \sbr{\frac{1}{4}\sqrt{\frac{\cmin \Vlstat \oneomega}{\B}} + \frac{c(s_t, a_t)}{2}} &\leq \frac{1}{4}\sqrt{(t_{m+1} - t_m)\frac{\cmin \sumtmplus\Vlstat \oneomega}{\B}} \\
&\qquad+ \frac{\sumtmplus c(s_t, a_t)}{2} \\
&\leq \frac{1}{4}\sqrt{2\sumtmplus \Vlstat \oneomega} + \B.
\end{align*}
The last inequality follows from the fact that duration of interval $m$ is at most $2\B/\cmin$ and its cumulative cost is at most $2\B$. Substituting these bounds into \eqref{eq: pf lem bounded v tmp0} implies that
\begin{align*}
\E\sbr{\sumtmplus \Vlstat \oneomega \Big| \calF_{t_m}, \theta_*, \thetalm} &\leq \E\sbr{\rbr{4\B + \frac{1}{4}\sqrt{2\sumtmplus \Vlstat \oneomega}}^2 \Big| \calF_{t_m}, \theta_*, \thetalm} \\
&\leq 32\B^2 + \frac{1}{4} \E\sbr{\sumtmplus \Vlstat \oneomega \Big| \calF_{t_m}, \theta_*, \thetalm},
\end{align*}
where the last inequality is by $(a+b)^2 \leq 2(a^2 + b^2)$ with $b =\frac{1}{4}\sqrt{2\sumtmplus \Vlstat \oneomega}$ and $a = 4\B$. Rearranging implies that $\E\sbr{\sumtmplus \Vlstat \oneomega | \calF_{t_m}, \theta_*, \thetalm} \leq 43\B^2$ and the proof is complete.
\end{proof}
\end{proof}









\subsection{Proof of Theorem~\ref{thm1}}\label{app:proof:thm1}
\textbf{Theorem} (restatement of Theorem~\ref{thm1})\textbf{.}
Suppose Assumptions~\ref{ass: class of ssp} and~\ref{ass: cmin} hold. Then, the regret of the \ssp~algorithm is bounded as
	\begin{align*}
		R_K = \order\rbr{\B S \sqrt{KA}L^2 + S^2A \sqrt{\frac{{\B}^3}{\cmin}}L^2},
	\end{align*}
	where $L = \log (\B SAK\cmininv)$.
\begin{proof}
Denote by $C_M$ the total cost after $M$ intervals. Recall that 
\begin{align*}
&\E[C_M] = K\E[V(\sinit;\theta_*)] + R_M = K\E[V(\sinit;\theta_*)] + R_M^1 + R_M^2 + R_M^3.
\end{align*}
Using Lemmas~\ref{lem: bounding R1}, \ref{lem: r2}, and \ref{lem: r3} with $\delta=1/K$ yields
\begin{align}
\E[C_M]&\leq K\E[V(\sinit;\theta_*)] \nonumber \\
&+ \order\rbr{\B\E[L_M] + \B S \sqrt{MA \log^2(SAK\E[T_M])} + \B S^2A\log^2(SAK\E[T_M])}. \label{eq: pf thm1 tmp 1}
\end{align}
Recall that $L_M \leq  \sqrt{2SAK\log T_M} + SA\log T_M$. Taking expectations of both sides and using Jensen's inequality gives $\E[L_M] \leq \sqrt{2SAK \log \E[T_M]} + SA \log \E[T_M]$.
Moreover, taking expectations of both sides of \eqref{eq: bound on m}, plugging in the bound on $\E[L_M]$, and using the concavity of $\log(x)$ yields
\begin{align*}
M \leq \frac{\E[C_{M}]}{\B} + K + \sqrt{2SAK \log \E[T_M]} + SA \log \E[T_M] + \order\rbr{\frac{\B S^2A}{\cmin}\log \frac{\B KSA}{\cmin}}.
\end{align*}
Substituting this bound in \eqref{eq: pf thm1 tmp 1}, using subadditivity of the square root, and simplifying yields
\begin{align*}
\E[C_M] &\leq K\E[V(\sinit;\theta_*)] + \order\Bigg(\B S \sqrt{KA\log^2 (SAK\E[T_M])} +  S \sqrt{\B \E[C_M]A \log^2 (SAK\E[T_M])}\\
&+ \B S^\frac{5}{4}A^\frac{3}{4}K^\frac{1}{4} \log^\frac{5}{4}(SAK\E[T_M]) + S^2A\sqrt{\frac{\B^3}{\cmin}\log^3 \frac{\B SAK \E[T_M]}{\cmin}}\Bigg).
\end{align*}
Solving for $\E[C_M]$ (by using the elementary inequality that $x \leq a \sqrt{x} + b$ implies $x \leq (a + \sqrt b)^2$ for $a, b > 0$), using $K \geq S^2A$, $V(\sinit;\theta_*) \leq \B$, and simplifying the result gives
\begin{align}
\label{eq: pf thm1 tmp2}
&\E[C_M] \leq \Bigg(\order\rbr{S \sqrt{\B A \log^2 (SAK\E[T_M])}} \nonumber \\
&+ \sqrt{K\E[V(\sinit;\theta_*)] + \order\rbr{\B S \sqrt{KA\log^{2.5} (SAK\E[T_M])} + S^2A\sqrt{\frac{\B^3}{\cmin}\log^3 \frac{\B SAK \E[T_M]}{\cmin}}}} \Bigg)^2 \nonumber \\
&\leq \order\rbr{\B S^2A\log^2 (SAK\E[T_M])} \nonumber \\
&\quad+ K\E[V(\sinit;\theta_*)] + \order\Bigg(\B S \sqrt{KA\log^{2.5} (SAK\E[T_M])} + S^2A\sqrt{\frac{\B^3}{\cmin}\log^3 \frac{\B SAK \E[T_M]}{\cmin}} \nonumber \\
&\quad+ \B S \sqrt{KA\log^{4} (SAK\E[T_M])} + S^2A\rbr{\frac{{\B}^5}{\cmin}\log^7 \frac{\B SAK\E[T_M]}{\cmin}}^\frac{1}{4}\Bigg) \nonumber \\
&\leq K\E[V(\sinit;\theta_*)] + \order\rbr{\B S \sqrt{KA\log^{4} (SAK\E[T_M])} + S^2A \sqrt{\frac{\B^3}{\cmin}\log^4 \frac{\B SAK\E[T_M]}{\cmin}}}.
\end{align}
Note that by simplifying this bound, we can write $\E[C_M] \leq \order\rbr{\sqrt{{\B}^3S^4A^2K^2\E[T_M]/\cmin}}$. On the other hand, we have that $\cmin T_M \leq C_M$, which implies $\E[T_M] \leq \E[C_M]/\cmin$. Combining these two inequalities and solving for $\E[T_M]$ gives $\E[T_M] \leq \order\rbr{{\B}^3S^4A^2K^2/c^3_{\text{min}}}$. Substituting this bound into \eqref{eq: pf thm1 tmp2} yields
\begin{align*}
&\E[C_M] \leq K\E[V(\sinit;\theta_*)] + \order\rbr{\B S \sqrt{KA\log^{4} \frac{\B SAK}{\cmin}} + S^2A \sqrt{\frac{\B^3}{\cmin}\log^4 \frac{\B SAK}{\cmin}}}.
\end{align*}
We note that this bound holds for any number of intervals $M$ as long as the $K$ episodes have not elapsed. Since $\cmin > 0$, this implies that the $K$ episodes eventually terminate and the claimed bound of the theorem for $R_K$ holds.
\end{proof}






















\subsection{Proof of Theorem~\ref{thm2}}\label{sec:proof:thm2}
\textbf{Theorem} (restatement of Theorem~\ref{thm2})\textbf{.}
Suppose Assumption~\ref{ass: class of ssp} holds. Running the \ssp~algorithm with costs $c_\epsilon(s, a) := \max \{c(s, a), \epsilon\}$ for $\epsilon = (S^2A/K)^{2/3}$ yields
	\begin{align*}
		R_K = \order\rbr{\B S \sqrt{KA}\tilde{L}^2 + (S^2A)^\frac{2}{3}K^\frac{1}{3}(\B^\frac{3}{2}\tilde{L}^2 + \T) + S^2A\T^\frac{3}{2}\tilde{L}^2},
	\end{align*}
	where $\tilde L := \log (K\B\T SA)$ and $\T$ is an upper bound on the expected time the optimal policy takes to reach the goal from any initial state.
\begin{proof}
Denote by $T_K^\epsilon$ the time to complete $K$ episodes if the algorithm runs with the perturbed costs $c_\epsilon(s, a)$ and let $V_\epsilon(\sinit;\theta_*)$, $V^\pi_\epsilon(\sinit;\theta_*)$ be the optimal value function and the value function for policy $\pi$ in the SSP with cost function $c_\epsilon(s, a)$ and transition kernel $\theta_*$. We can write
\begin{align}
R_K&= \E\sbr{\sum_{t=1}^{T_K^\epsilon}c(s_t, a_t) - KV(\sinit;\theta_*)} \nonumber \\
&\leq \E\sbr{\sum_{t=1}^{T_K^\epsilon}c_\epsilon(s_t, a_t) - KV(\sinit;\theta_*)} \nonumber \\
&=  \E\sbr{\sum_{t=1}^{T_K^\epsilon}c_\epsilon(s_t, a_t) - KV_\epsilon(\sinit;\theta_*)} + K\E\sbr{V_\epsilon(\sinit;\theta_*) - V(\sinit;\theta_*)}. \label{eq: thm2 tmp1}
\end{align}
Theorem~\ref{thm1} implies that the first term is bounded by
\begin{align*}
\E\sbr{\sum_{t=1}^{T_K^\epsilon}c_\epsilon(s_t, a_t) - KV_\epsilon(\sinit;\theta_*)} = \order\rbr{\B^\epsilon S \sqrt{KA}L_\epsilon^2 + S^2A \sqrt{\frac{{\B^\epsilon}^3}{\epsilon}}L_\epsilon^2},
\end{align*}
with $L_\epsilon = \log (\B^\epsilon SAK/\epsilon)$ and $\B^\epsilon \leq \B + \epsilon \T$ (to see this note that $V_\epsilon(s;\theta_*) \leq V^{\pi^*}_\epsilon(s;\theta_*) \leq \B + \epsilon \T$).
To bound the second term of \eqref{eq: thm2 tmp1}, we have
\begin{align*}
V_\epsilon(\sinit;\theta_*) \leq V^{\pi^*}_\epsilon(\sinit;\theta_*) \leq V(\sinit;\theta_*) + \epsilon\T.
\end{align*}
Combining these bounds, we can write
\begin{align*}
R_K &= \order\rbr{\B S \sqrt{KA}L_\epsilon^2 + \epsilon\T S \sqrt{KA}L_\epsilon^2 + S^2A \sqrt{\frac{(\B + \epsilon \T)^3}{\epsilon}}L_\epsilon^2 + K \T\epsilon}.
\end{align*}
Substituting $\epsilon = (S^2A/K)^{2/3}$, and simplifying the result with $K \geq S^2A$ and $\B \leq \T$ (since $c(s, a) \leq 1$) implies
\begin{align*}
R_K = \order\rbr{\B S \sqrt{KA}\tilde{L}^2 + (S^2A)^\frac{2}{3}K^\frac{1}{3}(\B^\frac{3}{2}\tilde{L}^2 + \T) + S^2A\T^\frac{3}{2}\tilde{L}^2},
\end{align*}
where $\tilde L = \log (K\B\T SA)$. This completes the proof.
\end{proof}
