In this section, we show that the behavior policies of algorithm \ref{alg:UniSREP} achieve anytime sub-linear regret without exploiting the UniSOFT property. On a high level, this ensures that the algorithm plays optimal actions often enough, such that the MLE constrained oracle eventually selects UniSOFT features, which we leverage in subsequent sections to improve upon the baseline result. We note that the analysis in this section is purely based on known results and provided for completeness.

We start by providing two important results, first introduced by \cite{uehara2021representation}, which we will use to link the bonus of the learned features to the elliptical potential function of the true features. This allows us to track the progress of our algorithm through the standard elliptical potential lemma \ref{lemma:elliptical_potential}.

\begin{lemma}\label{lemma:one_step_back_true}(One-step back inequality in the true model)
   Consider a set of functions \(\{g_{h}\}_{h=1}^{H}\) that satisfies \(g_{h}:\mathcal{S}\times\mathcal{A}\to\mathbb{R}\) such that \(\Vert g_{h} \Vert_{\infty}\leq B\) for all \(h\in[H]\). Then, for all \(t\in\mathbb{N}\), \(h>1\) and any \(\pi\), 

    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[g_{h}(s,a)] \\  &\quad\leq \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\gamma_{t, h-1}, \phi^{\star}_{h-1}}^{-1}}]\sqrt{t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d}
    \end{align*}
\end{lemma}

\begin{proof}
    For \(h=2, ..., H\) we have,
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[g_{h}(s,a)] \\ &= \mathbb{E}_{(\tilde{s},\tilde{a})\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}, s\sim \mathcal{P}^{\star}_{h-1}(\cdot|\tilde{s}, \tilde{a}), a\sim \pi_{h}(\cdot|s)}[g_{h}(s,a)] \\
        &= \mathbb{E}_{(\tilde{s},\tilde{a})\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\langle\phi^{\star}_{h-1}(\tilde{s}, \tilde{a}), \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\mu^{\star}_{h-1}(s)\pi_{h}(a|s)g_{h}(s,a)\rangle] \\
        &\stackrel{(i)}{\leq} \mathbb{E}_{(\tilde{s},\tilde{a})\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(\tilde{s},\tilde{a})\Vert_{\Sigma_{\gamma_{t, h-1}, \phi^{\star}_{h-1}}^{-1}} \Vert \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\mu^{\star}_{h-1}(s)\pi_{h}(a|s)g_{h}(s,a)\Vert_{\Sigma_{\gamma_{t, h-1}, \phi^{\star}_{h-1}}}],
    \end{align*}
    where \((i)\) follows from the symmetry of the regularized covariance matrix and an application of the Cauchy-Schwarz inequality. Further we have for \(h=2, ..., H\),
    \begin{align*}
        &\Vert \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\mu^{\star}_{h-1}(s)\pi_{h}(a|s)g_{h}(s,a)\Vert_{\Sigma_{\gamma_{t, h-1}, \phi^{\star}_{h-1}}}^{2} \\
        &\stackrel{(i)}{\leq} t\mathbb{E}_{(\tilde{s}, \tilde{a})\sim\gamma_{t,h-1}}[(\sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\langle\phi^{\star}_{h-1}(\tilde{s}, \tilde{a}),\mu^{\star}_{h-1}(s)\rangle\pi_{h}(a|s)g_{h}(s,a))^{2}] + B^{2}\lambda_{t}d \\
        &= t\mathbb{E}_{(\tilde{s}, \tilde{a})\sim\gamma_{t,h-1}}[\mathbb{E}_{s\sim\mathcal{P}^{\star}_{h-1}(\cdot|\tilde{s}, \tilde{a}), a\sim\pi_{h}(\cdot|s)}[g_{h}(s,a)]^{2}] + B^{2}\lambda_{t}d \\
        &\leq t\mathbb{E}_{s \sim\rho_{t,h}, a\sim \pi_{h}(\cdot| s)}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d \\
        &\stackrel{(ii)}{\leq} t\max_{s,a}\frac{\rho_{t,h}(s)\pi_{h}(a|s)}{\rho_{t,h}(s)\bar{\pi}_{t,h}(a|s)}\mathbb{E}_{(s,a) \sim\rho_{t,h}}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d \\
        &\leq t\frac{1}{\frac{1}{t}\sum_{i=0}^{t-1}(\xi_{i}\cdot\frac{1}{|\mathcal{A}|})}\mathbb{E}_{(s,a) \sim\rho_{t,h}}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d \\
        &\stackrel{(iii)}{\leq} t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d,
    \end{align*}
    where, \((i)\) is by assumptions \(\Vert g_{h}(s,a)\Vert_{\infty}\leq B\) and \(\Vert\int_{\mathcal{S}}\mu^{\star}(s)h(s)p(s)\Vert_{2} \leq \sqrt{d}\) for any \(h:\mathcal{S}\to[0,1]\) (realizability, Assumption \ref{ass:realizability}), \((ii)\) is by importance sampling and \((iii)\) follows from $\xi_{t}$ being decreasing.
\end{proof}

\begin{lemma}\label{lemma:one_step_back_learned}(One-step back inequality in the learned model)
    Consider a set of functions \(\{g_{h}\}_{h=1}^{H}\) that satisfies \(g_{h}:\mathcal{S}\times\mathcal{A}\to\mathbb{R}\) such that \(\Vert g_{h} \Vert_{\infty}\leq B\) for all \(h\in[H]\). Then, given that the event \(\mathcal{E}\) occurs, for all \(t\in\mathbb{N}\), \(h>1\) and any \(\pi\), 

    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi}}[g_{h}(s,a)] \\  &\quad\leq \mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}}[\Vert\hat{\phi}_{t, h-1}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \hat{\phi}_{t, h-1}}^{-1}}]\sqrt{2t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}'}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d + 2t\frac{|\mathcal{A}|}{\xi_{t}}B^{2}\zeta_{t}}
    \end{align*}
        
    
\end{lemma}

\begin{proof}
    Let \(t\in\mathbb{N}\) be arbitrary. For all \(h=2, ..., H\) we have,
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi}}[g_{h}(s,a)] \\ &= \mathbb{E}_{(\tilde{s},\tilde{a})\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}, s\sim \hat{\mathcal{P}}_{t, h-1}(\cdot|\tilde{s}, \tilde{a}), a\sim \pi_{h}(\cdot|s)}[g_{h}(s,a)] \\
        &= \mathbb{E}_{(\tilde{s},\tilde{a})\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}}[\langle\hat{\phi}_{t,h-1}(\tilde{s}, \tilde{a}), \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\hat{\mu}_{t,h-1}(s)\pi_{h}(a|s)g_{h}(s,a)\rangle] \\
        &\stackrel{(i)}{\leq} \mathbb{E}_{(\tilde{s},\tilde{a})\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}}[\Vert\hat{\phi}_{t, h-1}(\tilde{s},\tilde{a})\Vert_{\Sigma_{\rho_{t, h-1}, \hat{\phi}_{t, h-1}}^{-1}} \Vert \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\hat{\mu}_{t,h-1}(s)\pi_{h}(a|s)g_{h}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \hat{\phi}_{t, h-1}}}],
    \end{align*}
    where \((i)\) follows from the symmetry of the covariance matrix and an application of the Cauchy-Schwarz inequality. Further we have for all \(h=2, ..., H\),
    \begin{align*}
        &\Vert \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\hat{\mu}_{t,h-1}(s)\pi_{h}(a|s)g_{h}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \hat{\phi}_{t, h-1}}}^{2} \\
        &\stackrel{(i)}{\leq} t\mathbb{E}_{(\tilde{s}, \tilde{a})\sim\rho_{t,h-1}}[(\sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\langle\hat{\phi}_{t,h-1}(\tilde{s}, \tilde{a}),\hat{\mu}_{t,h-1}(s)\rangle\pi_{h}(a|s)g_{h}(s,a))^{2}] + B^{2}\lambda_{t}d \\
        &= t\mathbb{E}_{(\tilde{s}, \tilde{a})\sim\rho_{t,h-1}}[\mathbb{E}_{s\sim\hat{\mathcal{P}}_{t,h-1}(\cdot|\tilde{s}, \tilde{a}), a\sim\pi_{h}(\cdot|s)}[g_{h}(s,a)]^{2}] + B^{2}\lambda_{t}d \\
        &\stackrel{(ii)}{\leq} 2t\mathbb{E}_{(\tilde{s}, \tilde{a})\sim\rho_{t,h-1}}[\mathbb{E}_{s\sim\mathcal{P}^{\star}_{h-1}(\cdot|\tilde{s}, \tilde{a}), a\sim\pi_{h}(\cdot|s)}[g_{h}(s,a)]^{2}] + B^{2}\lambda_{t}d + 2tB^{2}\zeta_{t} \\
        &\leq 2t\mathbb{E}_{s \sim\rho_{t,h}', a\sim \pi_{h}(\cdot| s)}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d + 2t\frac{|\mathcal{A}|}{\xi_{t}}B^{2}\zeta_{t} \\
        &\stackrel{(iii)}{\leq} 2t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}'}[g_{h}(s,a)^{2}] + B^{2}\lambda_{t}d + 2t\frac{|\mathcal{A}|}{\xi_{t}}B^{2}\zeta_{t},
    \end{align*}
    where, \((i)\) is by assumptions \(\Vert g_{h}(s,a)\Vert_{\infty}\leq B\) and \(\Vert\int_{\mathcal{S}}\hat{\mu}(s)h(s)p(s)\Vert_{2} \leq \sqrt{d}\) for any \(h:\mathcal{S}\to[0,1]\) (realizability, Assumption \ref{ass:realizability}), \((ii)\) follows from \((a+b)^{2} \leq 2a^{2} + 2b^{2}\), importance sampling and the event \(\mathcal{E}\) and \((iii)\) is again by importance sampling.
\end{proof}

The following lemma exploits the one-step back inequalities to relate the bonus and the estimation error to elliptical potential functions. The formulation of the statement is inspired by Lemma 3 of \cite{cheng2023improved}.

\begin{lemma}\label{lemma:bonus_relations}(Bonus relations)
    Given that the event \(\mathcal{E}\) occurs, for all  \(t\in\mathbb{N}\), \(h>1\) and any \(\pi\),
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi}}[f_{t,h}(s,a)] \leq \alpha_{t}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}}[\Vert\hat{\phi}_{t,h-1}(s,a)\Vert_{\Sigma_{\rho_{t,h-1}, \hat{\phi}_{t,h-1}}^{-1}}], \\
         &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[f_{t,h}(s,a)] \leq \alpha_{t}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\rho_{t,h-1}, \phi^{\star}_{h-1}}^{-1}}], \\
        &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[\hat{b}_{t,h}(s,a)] \leq \beta_{t}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\gamma_{t,h-1}, \phi^{\star}_{h-1}}^{-1}}],
    \end{align*}
    where \(\alpha_{t}=\sqrt{4t\zeta_{t}\frac{|\mathcal{A}|}{\xi_{t}} + \lambda_{t}d}\) and \(\beta_{t} = \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}40\alpha_{t}^{2}d + \lambda_{t}d}\). In particular, for \(h=1\),
    \begin{align*}
        \mathbb{E}_{s \sim d_{1}, a \sim \pi_{1}(\cdot|s)}[f_{t,1}(s,a)] \leq \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}}, \qquad \mathbb{E}_{s \sim d_{1}, a \sim \pi_{1}(\cdot|s)}[\hat{b}_{t,1}(s,a)] \leq 15\alpha_{t}\sqrt{\frac{d|\mathcal{A}|}{t\xi_{t}}}.
    \end{align*}
\end{lemma}
\begin{proof}
    Let \(t\in\mathbb{N}\) be arbitrary. For all \(h>1\) we have,
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi}}[f_{t,h}(s,a)] \\ &\stackrel{(i)}{\leq} \mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}}[\Vert\hat{\phi}_{t, h-1}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \hat{\phi}_{t, h-1}}^{-1}}]\sqrt{2t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}'}[f_{t,h}(s,a)^{2}] + \lambda_{t}d + 2t\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} \\
    &\stackrel{(ii)}{\leq} \alpha_{t}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h-1}^{\pi}}[\Vert\hat{\phi}_{t, h-1}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \hat{\phi}_{t, h-1}}^{-1}}],
    \end{align*}
    where \((i)\) is by Lemma \ref{lemma:one_step_back_learned} and \(\Vert f_{t,h}\Vert_{\infty}\leq 1\) and \((ii)\) follows from the event $\mathcal{E}$. Similarly, for all \(h>1\),
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[f_{t,h}(s,a)] \\ &\stackrel{(i)}{\leq} \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \phi^{\star}_{h-1}}^{-1}}]\sqrt{2t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}'}[f_{t,h}(s,a)^{2}] + \lambda_{t}d} \\
    &\stackrel{(ii)}{\leq} \alpha_{t}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\rho_{t, h-1}, \phi^{\star}_{h-1}}^{-1}}],
    \end{align*}
    where \((i)\) is by Lemma \ref{lemma:one_step_back_true} and \(\Vert f_{t,h}\Vert_{\infty}\leq 1\) and \((ii)\) follows from the event $\mathcal{E}$. For \(h=1\) we have,
    \begin{align*}
        \mathbb{E}_{s \sim d_{1}, a \sim \pi_{1}(\cdot| s)}[f_{t,1}(s,a)] &\stackrel{(i)}{\leq} \sqrt{\frac{|\mathcal{A}|}{\xi_{t}} \mathbb{E}_{(s,a)\sim \rho_{t,1}}[f_{t,1}(s,a)^{2}]} \leq \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}},
    \end{align*}
    where \((i)\) is by importance sampling and Jensen's inequality. For \(h>1\), we can bound the bonus by
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[\hat{b}_{t,h}(s,a)] \\ &\leq \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\gamma_{t, h-1}, \phi^{\star}_{h-1}}^{-1}}]\sqrt{t\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a) \sim\rho_{t,h}}[\hat{b}_{t,h}(s,a)^{2}] + \lambda_{t}d},
    \end{align*}
    which follows from Lemma \ref{lemma:one_step_back_true} and \(\Vert \hat{b}_{t,h}\Vert_{\infty}\leq 1\). Further,
    \begin{align*}
        &t\mathbb{E}_{(s,a) \sim\rho_{t,h}}[\hat{b}_{t,h}(s,a)^{2}] \\ &\leq t \mathbb{E}_{(s,a) \sim\rho_{t,h}}[\hat{\alpha}_{t}^{2}\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\hat{\Sigma}_{t,h}^{-1}}^{2}] \\
        &\stackrel{(i)}{\leq} t\mathbb{E}_{(s,a) \sim\rho_{t,h}}[9\hat{\alpha}_{t}^{2}\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \hat{\phi}_{t,h}}^{-1}}^{2}] \\
        &= 9\hat{\alpha}_{t}^{2} t\textnormal{Tr}\left(\mathbb{E}_{(s,a) \sim\rho_{t,h}}[\hat{\phi}_{t,h}(s,a)\hat{\phi}_{t,h}(s,a)^{T}](t\mathbb{E}_{(s,a) \sim\rho_{t,h}}[\hat{\phi}_{t,h}(s,a)\hat{\phi}_{t,h}(s,a)^{T}] + \lambda_{t}I)^{-1}\right) \\
        &\stackrel{(ii)}{\leq} 9\hat{\alpha}_{t}^{2}d,
    \end{align*}
    where \((i)\) follows from the event \(\mathcal{E}\) and \((ii)\) follows from Lemma \ref{lemma:bounded_squared_uncertainty}. Therefore,
    \begin{align*}
        &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[\hat{b}_{t,h}(s,a)] \\ &\leq \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h-1}^{\pi}}[\Vert\phi^{\star}_{h-1}(s,a)\Vert_{\Sigma_{\gamma_{t, h-1}, \phi^{\star}_{h-1}}^{-1}}]\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}9\hat{\alpha}_{t}^{2}d + \lambda_{t}d}.
    \end{align*}
    Finally, for \(h=1\),
    \begin{align*}
        \mathbb{E}_{s \sim d_{1}, a \sim \pi_{1}(\cdot|s)}[\hat{b}_{t,1}(s,a)] &\stackrel{(i)}{\leq} 3\hat{\alpha}_{t}\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\mathbb{E}_{(s,a)\sim\rho_{t,1}}[\Vert\hat{\phi}_{t,1}(s,a)\Vert_{\Sigma_{\rho_{t, 1}, \hat{\phi}_{t,1}}^{-1}}^{2}]} \\
        &\stackrel{(ii)}{\leq} 15\alpha_{t}\sqrt{\frac{d|\mathcal{A}|}{t\xi_{t}}},
    \end{align*}
    where \((i)\) follows from the event \(\mathcal{E}\), importance sampling and Jensen's inequality and \((ii)\) follows from Lemma \ref{lemma:bounded_squared_uncertainty}.
\end{proof}

The next result shows that the optimal value w.r.t. the bonus-augmented reward function in the estimated environment provides an almost optimistic estimate of the true value achieved by any optimal policy.

\begin{lemma}\label{lemma:almost_optimism_at_init_dist}(Almost Optimism at the Initial Distribution)
    Given that the event \(\mathcal{E}\) occurs, for all \(t\in\mathbb{N}\),
    \[
         V_{\mathcal{P}^{\star}, r^{\star},1}^{\pi^{\star}, d_{1}} - V_{\hat{\mathcal{P}}_{t}, r^{\star}+\hat{b}_{t},1}^{\pi^{\star}, d_{1}} \leq \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}}.
    \]
\end{lemma}

\begin{proof}
Let \(t\in\mathbb{N}\) be arbitrary.
\begin{align*}
    &V_{\mathcal{P}^{\star}, r^{\star},1}^{\pi^{\star}, d_{1}} - V_{\hat{\mathcal{P}}_{t}, r^{\star}+\hat{b}_{t},1}^{\pi^{\star}, d_{1}} \\
    &\stackrel{(i)}{=} \sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi^{\star}}}[(\mathcal{P}_{h}^{\star} - \hat{\mathcal{P}}_{t,h})V_{\mathcal{P}^{\star},r^{\star}, h+1}^{\pi^{\star}}(s,a) - \hat{b}_{t,h}(s,a)] \\
    &\stackrel{(ii)}{\leq} \sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi^{\star}}}[f_{t,h}(s,a) - \min\{1, \frac{\hat{\alpha}_{t}}{5}\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \hat{\phi}_{t,h}}^{-1}}\}] \\
    &\stackrel{(iii)}{\leq} \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + \sum_{h=1}^{H-1}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi^{\star}}}[\min\{1,\alpha_{t}\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \hat{\phi}_{t,h}}^{-1}}\}]\\ &\qquad -\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t}, h}^{\pi^{\star}}}[\min\{1,\alpha_{t}\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \hat{\phi}_{t,h}}^{-1}}\}] \\
    &\leq \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}},
\end{align*}    
    where \((i)\) follows from Lemma \ref{lemma:simulation}, \((ii)\) follows from the event \(\mathcal{E}\) and \(\Vert V_{\mathcal{P}^{\star},r^{\star}}^{\pi}\Vert_{\infty}\leq 1\) and
    \((iii)\) follows from Lemma \ref{lemma:bonus_relations} and \(\Vert f_{t,h}\Vert_{\infty}\leq 1\).
\end{proof}

We are now ready to show that algorithm \ref{alg:UniSREP} achieves sub-linear pseudo-regret; that is, the regret of the behavior policies is sub-linear. However, the actual regret of algorithm \ref{alg:UniSREP} might not be, as we explore uniformly at random in each episode with positive probability.

\begin{lemma}\label{lemma:sublinear_pseudo_regret_without_unisoft}(Sub-linear pseudo-regret without UniSOFT representations)
    Given that the event \(\mathcal{E}\) occurs, for all \(T\in\mathbb{N}\),
    \[
        \mathcal{R}(T) \lesssim H^{2}d^{2}|\mathcal{A}|\frac{\sqrt{T}\log^{2}(4TH|\Phi||\Psi|/\delta)}{\xi_{T}} = \tilde{O}\left(\frac{\sqrt{T}}{\xi_{T}}\right).
    \]
\end{lemma}
\begin{proof}
    Let $T\in\mathbb{N}$ be arbitrary. Then, for all episodes \(t\leq T\) we have,
    \begin{align*}
        &V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}} \\ &= V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t} + r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}} + V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t} + r^{\star}, 1}^{\pi^{\star}, d_{1}} \\
        &\stackrel{(i)}{\leq} V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t} + r^{\star}, 1}^{\pi_{t}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}} + \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} \\
        &\stackrel{(ii)}{=} \sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\hat{b}_{t,h}(s,a) + (\hat{\mathcal{P}}_{t,h} - \mathcal{P}_{h}^{\star})V_{\hat{\mathcal{P}}_{t}, r^{\star} + \hat{b}_{t}, h+1}^{\pi_{t}}(s,a)] + \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} \\
        &\stackrel{(iii)}{\leq} 2H\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\hat{b}_{t,h}(s,a) + f_{t,h}(s,a)] + \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}},
    \end{align*}
    where \((i)\) is by Lemma \ref{lemma:almost_optimism_at_init_dist}, \((ii)\) follows from Lemma \ref{lemma:simulation} and \((iii)\) follows from \(\Vert V_{\mathcal{P},r^{\star}+\hat{b}}^{\pi}\Vert_{\infty}\leq 2H\). Then, by Lemma \ref{lemma:bonus_relations},
    \begin{align*}
        &\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\hat{b}_{t,h}(s,a) + f_{t,h}(s,a)] \\
        &\leq \sqrt{\frac{\zeta_{t}|\mathcal{A}|}{\xi_{t}}} + 15\alpha_{t}\sqrt{\frac{d|\mathcal{A}|}{t\xi_{t}}} + \alpha_{t}\sum_{h=1}^{H-1}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Vert\phi_{h}^{\star}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \phi_{h}^{\star}}^{-1}}] \\
        &\qquad+ \beta_{t}\sum_{h=1}^{H-1}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Vert\phi_{h}^{\star}(s,a)\Vert_{\Sigma_{\gamma_{t,h}, \phi_{h}^{\star}}^{-1}}].
    \end{align*}
    Further, for all \(h\in[H]\),
    \begin{align*}
        \sum_{t=1}^{T}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Vert\phi_{h}^{\star}(s,a)\Vert_{\Sigma_{\gamma_{t,h}, \phi_{h}^{\star}}^{-1}}]
        &\stackrel{(i)}{\leq} \sqrt{T\sum_{t=1}^{T}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Vert\phi_{h}^{\star}(s,a)\Vert_{\Sigma_{\gamma_{t,h}, \phi_{h}^{\star}}^{-1}}^{2}]} \\
        &= \sqrt{T\sum_{t=1}^{T}\textnormal{tr}(\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\phi_{h}^{\star}(s,a)\phi_{h}^{\star}(s,a)^{T}]\Sigma_{\gamma_{t,h}, \phi_{h}^{\star}}^{-1})} \\
        &\stackrel{(ii)}{\leq} \sqrt{Td\log(1+\frac{T}{d\lambda_{1}})}
    \end{align*}
    where \((i)\) follows from the Cauchy-Schwarz inequality and Jensen's inequality and \((ii)\) follows from Lemma \ref{lemma:elliptical_potential} by noting that \(\Sigma_{\gamma_{t,h}, \phi} - \lambda_{t}I = t\mathbb{E}_{\gamma_{t,h}}[\phi\phi^{T}] = \sum_{i=1}^{t}\mathbb{E}_{d_{\mathcal{P}^{\star},h}^{\pi_{i}}}[\phi\phi^{T}]\) and that $\lambda_{t}$ is increasing. Similarly, for all $h\in[H]$,
    \begin{align*}
        \sum_{t=1}^{T}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Vert\phi_{h}^{\star}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \phi_{h}^{\star}}^{-1}}]
        &\stackrel{(i)}{\leq} \sqrt{T\frac{|\mathcal{A}|}{\xi_{T}}\sum_{t=1}^{T}\mathbb{E}_{s\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}, a\sim\mathcal{U}(\mathcal{A})}[\Vert\phi_{h}^{\star}(s,a)\Vert_{\Sigma_{\rho_{t,h}, \phi_{h}^{\star}}^{-1}}^{2}]} \\
        &\stackrel{(ii)}{\leq} \sqrt{T\frac{|\mathcal{A}|}{\xi_{T}}d\log(1+\frac{T}{d\lambda_{1}})},
    \end{align*}
    where \((i)\) follows from the Cauchy-Schwarz inequality, Jensen's inequality, importance sampling and $\xi_{t}$ being decreasing and \((ii)\) follows from Lemma \ref{lemma:elliptical_potential}. Finally,
    \begin{align*}
        &\sum_{t=1}^{T}\left(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}}\right) \\
        &\leq 8H\sqrt{\frac{\zeta_{T}T^{2}|\mathcal{A}|}{\xi_{T}}} + 30H\alpha_{T}\sqrt{\frac{Td|\mathcal{A}|}{\xi_{T}}} + 2H^{2}\alpha_{T} \sqrt{T\frac{|\mathcal{A}|}{\xi_{T}}d\log(1+\frac{T}{d\lambda_{1}})} \\
        &\qquad+  2H^{2}\beta_{T} \sqrt{Td\log(1+\frac{T}{d\lambda_{1}})} \\
        &\lesssim H^{2}d^{2}|\mathcal{A}|\frac{\sqrt{T}\log^{2}(4HT|\Phi||\Psi|/\delta)}{\xi_{T}}
    \end{align*}
\end{proof}

We proceed by providing an expected regret bound. Let \(\mathbb{E}_{\xi}\) and \(\mathbb{E}_{\delta}\) denote expectations w.r.t. the exploration probabilities and some good event \(\mathcal{E}(\delta)\), respectively. Additionally, note that we sample from \(d_{\mathcal{P}^{\star}, h}^{\pi_{t}}\) for each time step and hence produce $H$ trajectories per episode. Then, the expected regret of algorithm \ref{alg:UniSREP} can be upper bounded as follows:

\regretwithoutunisoftreps*

\begin{proof}
Let $T$ be given and fixed. Choose \(\delta = T^{-1}\). Recall that Algorithm \ref{alg:UniSREP} explores for $H$ time steps, for each $h\in[H]$ and episode $t$, by rolling into time step $h-1$ with policy $\pi_{t-1}$, taking actions according to $\tilde{\pi}_{t,h-1}$ and $\tilde{\pi}_{t,h}$ and finally, rolling out to time step $H$ with policy $\pi_{t-1}$. Let us denote \(\tilde{V}^{d_{1}}_{t,h}\)
%\begin{align*}
%    \tilde{V}_{t, h} = \mathbb{E}[\sum_{i=1}^{h-2}r^{\star}_{i}(s_{i}, a_{i})|s_{1}\sim d_{1}, \pi_{t-1}, \mathcal{P}^{\star}] + \mathbb{E}[\sum_{i=h-1}^{h+1}r^{\star}_{i}(s_{i}, a_{i})|s_{h-1}\sim d_{\mathcal{P}^{\star},h-1}^{\pi_{t-1}}, \tilde{\pi}_{t}, \mathcal{P}^{\star}]
%\end{align*}
as the cumulative expected reward obtained by Algorithm \ref{alg:UniSREP} in episode $t$ and time step $h$. Then,
\begin{align*}
     &\mathbb{E}_{\delta, \xi}[\tilde{\mathcal{R}}(T)] \\&= \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h}^{d_{1}})] \\
    &\leq \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=1\}\mathbbm{1}\{\mathcal{E}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h}^{d_{1}})] + \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=0\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h}^{d_{1}})] \\
    &\qquad+  \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{\mathcal{E}^{c}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h}^{d_{1}})] \\
    &\stackrel{(i)}{\leq}  \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=1\}\mathbbm{1}\{\mathcal{E}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h}^{d_{1}})] + \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=0\} + \mathbbm{1}\{\mathcal{E}^{c}(\delta)\}] \\
    &\stackrel{(ii)}{\leq} \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=1\}\mathbbm{1}\{\mathcal{E}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t-1}, d_{1}})] + H(T\delta + \sum_{t=1}^{T}\xi_{t}) \\
    &\leq H\mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t-1}, d_{1}})] + H(1 + \sum_{t=1}^{T}t^{-1/4}) \\
    &\stackrel{(iii)}{\leq}  H\underbrace{\mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}})]}_{(A)} + \frac{4}{3}HT^{3/4} + 2H,
\end{align*}
where \((i)\) follows from the optimality of $\pi^{\star}$ and $\Vert V_{\mathcal{P}, r^{\star}}^{\pi}\Vert_{\infty}\leq1$, \((ii)\) follows from \(\tilde{\pi}_{t}\) and \(\pi_{t-1}\) agreeing on the event \(e_{t}=1\) and Lemma \ref{lemma:event_occurs} and $(iii)$ follows from an index shift and $\Vert V_{\mathcal{P}, r^{\star}}^{\pi}\Vert_{\infty}\leq1$. Finally, we can leverage the pseudo-regret result of Lemma \ref{lemma:sublinear_pseudo_regret_without_unisoft} to bound term $(A)$, 
\begin{align*}
    (A) &= \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}})] \\
    &\lesssim H^{2}d^{2}|\mathcal{A}|\frac{\sqrt{T}\log^{2}(4TH|\Phi||\Psi|/\delta)}{\xi_{T}} \\
    &\lesssim H^{2}d^{2}|\mathcal{A}|T^{3/4}\log^{2}(4TH|\Phi||\Psi|),
\end{align*}
and hence, conclude the proof.
\end{proof}