In this section, we provide a constant pseudo-regret result that translates the uniform convergence of the confidence intervals to the expected sub-optimality gaps. We start by providing a sufficient condition that makes a deterministic policy optimal.

\begin{lemma}\label{lemma:identify_optimal_policy}
    Let \(\pi\) be any deterministic policy. Whenever,
    \[
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi}}[\Delta_{h}(s,a)] < d_{\textnormal{min}}^{\star}\Delta_{\textnormal{min}}
    \]
    holds for all \(h\in[H]\) simultaneously, there exists an optimal policy \(\tilde{\pi}^{\star}\in\Pi^{\star}\), such that, for all \(h\in[H]\),
    \[d_{\mathcal{P}^{\star},h}^{\tilde{\pi}^{\star}} \equiv d_{\mathcal{P}^{\star},h}^{\pi}.\]
    
\end{lemma}
\begin{proof}
We give a proof by induction. For \(h=1\) we have,
    \begin{align*}
         \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, 1}^{\pi}}[\Delta_{1}(s,a)] &= \mathbb{E}_{s \sim d_{1}}[\Delta_{1}(s,\pi_{1}(s))] \\
        &= \sum_{s\in\mathcal{S}}d_{1}(s)\Delta_{1}(s, \pi_{1}(s)) \\
        &\geq d_{\textnormal{min}}^{\star}\sum_{s:d_{1}(s)>0}\Delta_{1}(s, \pi_{1}(s))
    \end{align*}
    Hence, for all \(s\in\mathcal{S}\) such that \(d_{1}(s)>0\),
    \begin{align*}
        \Delta_{1}(s, \pi_{1}(s)) < \Delta_{\textnormal{min}},
    \end{align*}
    and therefore, \(\pi_1(s)\in\Pi_{1}^{\star}(s)\) for all \(s\in\mathcal{S}\) such that \(d_{1}(s)>0\).
    Equivalently, there exists a policy \(\tilde{\pi}^{\star}\in\Pi^{\star}\) such that,
    \[
        d_{\mathcal{P}^{\star}, 1}^{\tilde{\pi}^{\star}} \equiv d_{\mathcal{P}^{\star}, 1}^{\pi}.
    \]

    Suppose, as the induction hypothesis, that for some time step \(h\in[H-1]\) there exists an optimal policy \(\tilde{\pi}^{\star}\in\Pi^{\star}\) such that
    \(
        d_{\mathcal{P}^{\star}, h}^{\tilde{\pi}^{\star}} \equiv d_{\mathcal{P}^{\star}, h}^{\pi}
    \)
    holds. Then, at time step \(h+1\),
    \begin{align*}
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h+1}^{\pi}}[\Delta_{h+1}(s,a)]
        &\stackrel{(i)}{=} \mathbb{E}_{s \sim d^{\tilde{\pi}^{\star}}_{\mathcal{P}^{\star}, h+1}}[\Delta_{h+1}(s,\pi_{h+1}(s))] \\
        &= \sum_{s\in\mathcal{S}}d_{\mathcal{P}^{\star},h+1}^{\tilde{\pi}^{\star}}(s)\Delta_{h+1}(s,\pi_{h+1}(s))\\
        &\geq d_{\textnormal{min}}^{\star}\sum_{s:d^{\tilde{\pi}^{\star}}_{\mathcal{P}^{\star}, h+1}(s)>0}\Delta_{h+1}(s, \pi_{h+1}(s)),
    \end{align*}    
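    where \((i)\) follows from the induction hypothesis. Combined with the assumed bound \(\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h+1}^{\pi}}[\Delta_{h+1}(s,a)] < d_{\textnormal{min}}^{\star}\Delta_{\textnormal{min}}\), this yields \(\Delta_{h+1}(s, \pi_{h+1}(s)) < \Delta_{\textnormal{min}}\) and therefore, for all \(s\in\mathcal{S}\) such that \(d^{\tilde{\pi}^{\star}}_{\mathcal{P}^{\star}, h+1}(s) > 0\), we have \(
        \pi_{h+1}(s) \in \Pi_{h+1}^{\star}(s).
    \)
    Hence, \(\tilde{\pi}^{\star}\) can be chosen to coincide with \(\pi\) at step \(h+1\) on every reachable state, so that \(d_{\mathcal{P}^{\star}, h+1}^{\tilde{\pi}^{\star}} \equiv d_{\mathcal{P}^{\star}, h+1}^{\pi}\), which completes the induction.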
\end{proof}



\begin{lemma}[Constant pseudo-regret with UniSOFT representations]\label{lemma:constant_pseudo_regret_with_UniSOFT}
    Let \(\alpha\in(0,1]\), \(\gamma\in(2,\infty)\) and \(\xi_{t}=t^{-1/\gamma}\). Suppose assumptions \ref{ass:realizability} (realizability), \ref{ass:unique_optimal_policy} (unique optimal policy), \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap), \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) and \ref{ass:expressivness} ($\alpha^{\star}$-expressive function space) hold. Then, given that events \(\mathcal{E}(\delta)\) and \(\mathcal{F}(\delta)\) occur, there exists a constant \(\tau^{\star}\), after which the behavior policies \(\{\pi_{t}\}_{t\geq1}\) learned by algorithm \ref{alg:UniSREP}, incur no additional regret and hence, for all \(T\in\mathbb{N}\): 
\[
    \mathcal{R}(T) \lesssim \mathcal{R}(\tau^{\star}) = O(1)
\]
    
\end{lemma}

\begin{proof}
   Let $t$ be arbitrary and large enough. Then, since the event \(\mathcal{E}\) occurs by assumption, by Lemma \ref{lemma:expected_suboptimalitygap_to_bonus},
    we can bound the expected sub-optimality gaps for all \(h\in[H]\),

\[
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s, a)] \leq 10H^{2}(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}}) := (A).
\]

Further, according to Lemma \ref{lemma:sublinear_pseudo_regret_without_unisoft}, in the event \(\mathcal{E}\), \(\mathcal{R}(t) \leq g(t) = \tilde{O}(\sqrt{t}\xi_{t}^{-1}) = \tilde{O}(t^{\frac{2+\gamma}{2\gamma}})\) with \(\hat{\alpha}_{t}=\tilde{O}(\xi_{t}^{-1/2}) = \tilde{O}(t^{\frac{1}{2\gamma}})\). By Lemma \ref{lemma:UniSOFT_selection_full_rank} and the events \(\mathcal{F}\) and \(\mathcal{E}\), for all \(h\in[H]\), the learned feature maps \(\hat{\phi}_{t,h}\) are non-redundant and UniSOFT. Then, by Lemma \ref{lemma:bounded_uncertainty}, \(\gamma>2\) and the event \(\mathcal{F}\),
\begin{align*}
    & V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}} \leq \hat{\alpha}_{t}\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t},h}^{\pi_{t}^{b}}}[\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\hat{\Sigma}_{t,h}^{-1}}] \\
    &\leq \frac{\hat{\alpha}_{t}H}{(\lambda_{\textnormal{max}}^{\star} t + \lambda_{t} - 2\sum_{i=1}^{t}\xi_{i-1} - g(t)\Delta_{\textnormal{min}}^{-1} - 18\sqrt{t\log(6tdH|\Phi|/\delta)})^{1/2}}\\
    &\leq \tilde{O}(\frac{t^{\frac{1}{2\gamma}}}{t^{1/2}}) = \tilde{O}(t^{-\frac{1}{2}(1-\frac{1}{\gamma})})  \xrightarrow[t \to \infty]{} 0.
\end{align*}
Additionally, we have
\[
    \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} = \tilde{O}(t^{-\frac{1}{2}(1-\frac{1}{\gamma})})  \xrightarrow[t \to \infty]{} 0
\]
Hence, there must exist an episode \(\tau^{\star}\) such that
\[
    \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s, a)] < \Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}
\]
for all $t\geq \tau^{\star}$. Then by Lemma \ref{lemma:identify_optimal_policy}, we get:
\begin{align*}
    \mathcal{R}(T) &\leq \sum_{t=1}^{\infty}\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Delta_{h}(s,a)] \\
    &\leq \sum_{t=1}^{\tau^{\star}}\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Delta_{h}(s,a)] = \mathcal{R}(\tau^{\star})=O(1).
\end{align*}
\end{proof}

\instancedependentregretwithunisoftandconstantpseudoregret*
\begin{proof}
Let $T$ be given and fixed. Choose $\delta=\frac{1}{T}$. Then
\begin{align*}
     &\mathbb{E}_{\delta, \xi}[\tilde{\mathcal{R}}(T)] \\
    &\stackrel{(i)}{\leq} H\mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t-1}, d_{1}})] + H(2T\delta + \sum_{t=1}^{T}\xi_{t}) \\
    &\stackrel{(ii)}{\lesssim} H(\tau^{\star})^{1/2 + 1/\gamma} + H\sum_{t=1}^{T}t^{-1/\gamma} + 2H \\
    &\lesssim H(\tau^{\star})^{1/2 + 1/\gamma} + HT^{\frac{\gamma-1}{\gamma}} + 4H + 2H
\end{align*}
where the details of \((i)\) can be found in the proof of Theorem \ref{thm:instance_dependent_regret_bound_with_unisoft}, \((ii)\) follows from the constant pseudo-regret result of Lemma \ref{lemma:constant_pseudo_regret_with_UniSOFT}. We conclude the proof by substituting \(\tau^{\star}\) with the sufficient condition provided in Lemma \ref{lemma:critical_episodes} and using \(T\gtrsim 
 a\log^{n}(ab)\) as a sufficient condition for \(T\geq a\log^{n}(bT)\).
\end{proof}


\begin{lemma}[Critical episodes]\label{lemma:critical_episodes}
      Let \(\alpha\in(0,1]\), \(\gamma\in(2,4]\) and \(\xi_{t}=t^{-1/\gamma}\). Suppose assumptions \ref{ass:realizability} (realizability), \ref{ass:unique_optimal_policy} (unique optimal policy), \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap), \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) and \ref{ass:expressivness} ($\alpha^{\star}$-expressive function space) hold. Suppose that we run algorithm \ref{alg:UniSREP}. Then, given that events \(\mathcal{E}(\delta)\) and \(\mathcal{F}(\delta)\) occur:

      $(1)$ all non-$\alpha^{\star}$-approximate representations are eliminated after at most
      \[
      \tau_{\alpha} \lesssim \kappa_{1}^{m}\cdot \log^{2m}(\kappa_{1} \cdot \kappa_{2})
      \]

      $(2)$ all redundant and non-UniSOFT representations are eliminated after at most
      \[
        \tau_{\textnormal{good}} \lesssim \{ \kappa_{3}^{m} \cdot \log^{2m}(\kappa_{3} \cdot \kappa_{2}) \vee  \tau_{\alpha}\}
      \]
      
      $(3)$ the behavior policy \(\pi_{t}\) is optimal after at most
    \[
        \tau^{\star} \lesssim \{\kappa_{4}^{m'}\cdot \log^{m'}(\kappa_{4} \cdot \kappa_{2}) \vee \tau_{\textnormal{good}}\}
    \]
    episodes, where
    \(\kappa_{1}=\frac{H^{2}d^{2}|\mathcal{A}|}{\alpha\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}}\), \(\kappa_{2}=H|\Phi||\Psi|/\delta\), \(\kappa_{3}=\frac{H^{2}d^{2}|\mathcal{A}|}{\lambda_{\textnormal{max}}^{\star}\Delta_{\textnormal{min}}}\), \(\kappa_{4} = \frac{H^{6}d^{2}|\mathcal{A}|}{(\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star})^{2}\lambda_{\textnormal{max}}^{\star}}\), \(m=\frac{2\gamma}{\gamma - 2}\) and \(m'=\frac{\gamma}{\gamma - 1}\).
\end{lemma}
\begin{proof}
    By Lemma \ref{lemma:sublinear_pseudo_regret_without_unisoft}, for all \(t\in\mathbb{N}\), 
    \begin{align*}
        &\mathcal{R}(t)\leq c_{3}H^{2}d^{2}|\mathcal{A}|\frac{\sqrt{t}\log^{2}(4tH|\Phi||\Psi|/\delta)}{\xi_{t}}, \\
        &\hat{\alpha}_{t} = \sqrt{4t\zeta_{t}\frac{|\mathcal{A}|}{\xi_{t}} + \lambda_{t}d},
    \end{align*}
    where \(c_{3}\) is some universal constant. In the following, we will use \(t\geq 3a\log(ab)\) as a sufficient condition for \(t\geq a\log(bt)\) with reasonable values for \(a\) and \(b\) and \(t>0\). See Lemma 20 in \cite{papini2021reinforcement} for details. In particular, by substituting $t$ with \(u=a^{\frac{1}{n}}t^{\frac{1}{mn}}\), we get that for any \(n\geq1\) and \(m\geq1\):
    \begin{align}\label{eq:sufficient_time_logn}
        t>(mn)^{n}a^{m}(3\log(ab))^{mn}\Rightarrow  t^{\frac{1}{m}}>a\log^{n}(bt).
    \end{align}
    We divide the analysis into four parts, where in each part we derive a sufficient condition for \(\tau^{\star}\).\\
    \textbf{Part 1.} $\tau^{\star}$ must satisfy the \(\alpha^{\star}\)-selection criteria in Lemma \ref{lemma:alpha_star_selection}. 
    \begin{align*}
        t&>\frac{1}{\alpha}(\frac{\mathcal{R}(t)}{\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}} + \frac{|\mathcal{A}|}{\xi_{t}}\sqrt{2t\log(4tH|\Phi||\Psi|/\delta)}) \\
        t&>\frac{1}{\alpha}(\frac{c_{3}H^{2}d^{2}|\mathcal{A}|t^{(1/2 + 1/\gamma)}\log^{2}(4tH|\Phi||\Psi|/\delta)}{\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}} + |\mathcal{A}|t^{(1/2 + 1/\gamma)}\sqrt{2\log(4t|\Phi||\Psi|H/\delta)}) \\
        t&>t^{\frac{\gamma + 2}{2\gamma}}\cdot c_{3}2\underbrace{\frac{H^{2}d^{2}|\mathcal{A}|}{\alpha\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}}}_{\kappa_{1}}\cdot\log^{2}(t\cdot 4\underbrace{H|\Phi||\Psi|/\delta}_{\kappa_{2}}) \\     
        t&\stackrel{(i)}{>}(2m)^{2}(2c_{3}\kappa_{1})^{m}3^{2m}\log^{2m}(\kappa_{1} \cdot 4\kappa_{2}) := \bar{\kappa}_{1},
    \end{align*}
    where \((i)\) follows from the condition \eqref{eq:sufficient_time_logn} with \(m=\frac{2\gamma}{\gamma - 2}\). We obtain statement $(1)$ by taking $\tau_{\alpha}=\bar{\kappa}_{1}$.
    
    \textbf{Part 2.} $\tau^{\star}$ must satisfy the UniSOFT-selection criteria in Lemma \ref{lemma:UniSOFT_selection_full_rank}. 
    \begin{align*}
        t&>\frac{2}{\lambda_{\textnormal{max}}^{\star}}(\Delta_{\textnormal{min}}^{-1}\mathcal{R}(t) + 2\sum_{i=1}^{t}\xi_{i-1} + 18\sqrt{t\log(6tdH|\Phi|/\delta)}) \\
        t&>\frac{2}{\lambda_{\textnormal{max}}^{\star}}(\frac{c_{3}H^{2}d^{2}|\mathcal{A}|t^{1/2 + 1/\gamma}\log^{2}(4tH|\Phi||\Psi|/\delta)}{\Delta_{\textnormal{min}}} + 2\frac{\gamma}{\gamma - 1}t^{1-1/\gamma} + 18\sqrt{t\log(6tdH|\Phi|/\delta)}) \\
        t&\stackrel{(i)}{>}t^{\frac{2+\gamma}{2\gamma}}\cdot c_{3}22\underbrace{\frac{H^{2}d^{2}|\mathcal{A}|}{\lambda_{\textnormal{max}}^{\star}\Delta_{\textnormal{min}}}}_{\kappa_{3}}\cdot\log^{2}(t\cdot 6\underbrace{dH|\Phi||\Psi|/\delta}_{\kappa_{2}}) \\
        t&\stackrel{(ii)}{>}(2m)^{2}(22c_{3}\kappa_{3})^{m}3^{2m}\log^{2m}(\kappa_{3}\cdot 6\kappa_{2}) := \bar{\kappa}_{2},
    \end{align*}
    where \((i)\) follows from \(\gamma \leq 4\) and \((ii)\) follows from the condition \eqref{eq:sufficient_time_logn} with \(m=\frac{2\gamma}{\gamma - 2}\). \\

    \textbf{Part 3.}  \(\tau^{\star}\) must satisfy the invertibility condition from Lemma \ref{lemma:bounded_uncertainty}. 

    \begin{align*}
        t&>\frac{\mathcal{R}(t)\Delta_{\textnormal{min}}^{-1} + 2\sum_{i=1}^{t}\xi_{i-1} + 18\sqrt{t\log(6tdH|\Phi|/\delta)}}{\lambda_{\textnormal{max}}^{\star}},
    \end{align*}
    Note that the condition is fulfilled if \(t\geq\bar{\kappa}_{2}\). By taking, \(\tau_{\textnormal{good}}:=\max\{\bar{\kappa}_{1}, \bar{\kappa}_{2}\}\), we gain statement $(2)$. \\
    
    \textbf{Part 4.} First note we can upper bound,
    \begin{align*}
        \hat{\alpha}_{t} &= 5\sqrt{4t\zeta_{t}\frac{|\mathcal{A}|}{\xi_{t}} + \lambda_{t}d} \\
        &= 5\sqrt{8\log(4|\Phi||\Psi|Ht/\delta)\frac{|\mathcal{A}|}{\xi_{t}} + c_{1}d^{2}\log(4tH|\Phi|/\delta)} \\
        &\leq 5\sqrt{8c_{1}d^{2}|\mathcal{A}|t^{\frac{1}{\gamma}}\log(4|\Phi||\Psi|Ht/\delta)} \\
        &\leq 5dt^{\frac{1}{2\gamma}}\sqrt{8|\mathcal{A}|c_{1}\log(4|\Phi||\Psi|Ht/\delta)}.
    \end{align*}
    For now we assume that \(t\geq\bar{\kappa}_{2}\). Then,

    \begin{align*}
        \Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star} &> 20H^{2}( \frac{\hat{\alpha}_{t}H}{(\lambda_{\textnormal{max}}^{\star} t + \lambda_{t} - 2\sum_{i=1}^{t}\xi_{i-1} - \mathcal{R}(t)\Delta_{\textnormal{min}}^{-1} - 18\sqrt{t\log(6tdH|\Phi|/\delta)})^{1/2}} + \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}}) \\
        \Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star} &\stackrel{(i)}{>} 20H^{2}(\frac{\hat{\alpha}_{t}H}{(\frac{3}{2}\lambda_{\textnormal{max}}^{\star}t)^{1/2}} + \sqrt{2|\mathcal{A}|t^{\frac{1}{\gamma}-1}\log(4t|\Phi||\Psi|H/\delta)})\\
        \Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star} &> t^{-\frac{1}{2}(1-\frac{1}{\gamma})}\cdot 150\sqrt{c_{1}} \frac{H^{3}d|\mathcal{A}|^{1/2}}{(\lambda_{\textnormal{max}}^{\star})^{1/2}}\cdot \sqrt{\log(t\cdot4 |\Phi||\Psi|H/\delta)},
    \end{align*}
    where \((i)\) follows from \(t\geq\bar{\kappa}_{2}\). After rearranging, we get:

    \begin{align*}
     t^{\frac{1}{2}(1-\frac{1}{\gamma})}&>  150\sqrt{c_{1}} \frac{H^{3}d|\mathcal{A}|^{1/2}}{\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}(\lambda_{\textnormal{max}}^{\star})^{1/2}}\cdot \log^{1/2}(t\cdot4 |\Phi||\Psi|H/\delta) \\
     t^{(1-\frac{1}{\gamma})}&>  150^{2}c_{1} \underbrace{\frac{H^{6}d^{2}|\mathcal{A}|}{(\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star})^{2}\lambda_{\textnormal{max}}^{\star}}}_{\kappa_{4}}\cdot \log(t\cdot4 \underbrace{|\Phi||\Psi|H/\delta}_{\kappa_{2}}) \\
     t&\stackrel{(i)}{>}m'\cdot 450^{2m'}(c_{1}\kappa_{4})^{m'}\log^{m'}(\kappa_{4}\cdot 4\kappa_{2}) := \bar{\kappa}_{3}
    \end{align*}
    where \((i)\) follows from condition \eqref{eq:sufficient_time_logn} applied with exponent \(m'=\frac{\gamma}{\gamma - 1}\) and \(n=1\). Finally, by taking 
    \[
        \tau^{\star}=\max\{\bar{\kappa}_{1}, \bar{\kappa}_{2}, \bar{\kappa}_{3}\}
    \]
    we conclude.
\end{proof}

\optimalpolicyidentification*

\begin{proof}
    We know, by the proof of Lemma \ref{lemma:constant_pseudo_regret_with_UniSOFT}, that given that the events \(\mathcal{E}\) and \(\mathcal{F}\) hold, there exists an episode \(\tau^{\star}\) such that, for all \(t\geq\tau^{\star}\) and \(h\in[H]\),
    \begin{align}
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s,a)] &\leq 10H^{2}(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}})\\ &< \Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}.
    \end{align}
    In particular, we know from Lemma \ref{lemma:identify_optimal_policy}, that any deterministic policy satisfying the chain of inequalities above is optimal. Furthermore, the event \(\mathcal{E}(\delta)\cap\mathcal{F}(\delta)\) holds with probability at least \(1-2\delta\) by Lemma \ref{lemma:event_occurs} and Lemma \ref{lemma:eigenvalue_bounds}. Hence, with probability at least \(1-2\delta\), algorithm \ref{alg:UniSREP} returns an optimal policy after at most \(\tau^{\star}\) episodes.
\end{proof}