In this section, we show how we can use good representations to improve the pseudo-regret result of Lemma~\ref{lemma:sublinear_pseudo_regret_without_unisoft} provided in Section~\ref{sec:sublinear-regret}. Subsequently, we can provide an improved expected regret result.

At a high level, we show that the bonus terms provide an almost optimistic estimate for the expected sub-optimality gaps incurred by the behavior policies of Algorithm~\ref{alg:UniSREP}. We can then exploit the UniSOFT property of the good representations that we are guaranteed to select, as shown in the previous section, to show uniformly decreasing confidence intervals. Let us start by providing two results that are adapted from~\cite{cheng2023improved}, which show that the bonus term can be used to provide a trajectory-wise uncertainty measure for the model estimation error over the occupancy distribution of the behavior policies.

\begin{lemma}\label{lemma:value_function_difference}(Value difference of transition operators)
For all \(t\in\mathbb{N}\), any policy \(\pi\), state $s\in\mathcal{S}$, time step \(h\in[H]\) and set of reward functions \(\{r_{h}\}_{h=1}^{H}\) such that \(r_{h}:\mathcal{S}\times\mathcal{A}\to[0,1]\) and \(\sum_{h=1}^{H}r_{h}\leq 1\),
\[
    |V_{\mathcal{P}^{\star}, r, h}^{\pi}(s) - V_{\hat{\mathcal{P}}_{t}, r, h}^{\pi}(s)| \leq V_{\mathcal{P}, f_{t}, h}^{\pi}(s),
\]
where \(\mathcal{P}\in\{\hat{\mathcal{P}}_{t}, \mathcal{P}^{\star}\}\).

\end{lemma}

\begin{proof}
    We give a proof by induction. For \(h=H+1\) and any \(s\in\mathcal{S}\), we have \(|V_{\mathcal{P}^{\star}, r, H+1}^{\pi}(s) - V_{\hat{\mathcal{P}}_{t}, r, H+1}^{\pi}(s)| = 0 = V_{\mathcal{P}, f_{t}, H+1}^{\pi}(s)\) for \(\mathcal{P}\in\{\hat{\mathcal{P}}_{t}, \mathcal{P}^{\star}\}\). Suppose the induction hypothesis, \(|V_{\mathcal{P}^{\star}, r, h+1}^{\pi}(s) - V_{\hat{\mathcal{P}}_{t}, r, h+1}^{\pi}(s)| \leq V_{\mathcal{P}, f_{t}, h+1}^{\pi}(s)\) for \(\mathcal{P}\in\{\hat{\mathcal{P}}_{t}, \mathcal{P}^{\star}\}\) and any \(s\in\mathcal{S}\), holds. Then, for any \(h\in[H]\) and \(s\in\mathcal{S}\),
    \begin{align*}
        &|V_{\mathcal{P}^{\star}, r, h}^{\pi}(s) - V_{\hat{\mathcal{P}}_{t}, r, h}^{\pi}(s)| \\
        &\leq  \mathbb{E}_{a\sim \pi(\cdot|s)}[|Q_{\mathcal{P}^{\star}, r, h}^{\pi}(s,a) - Q_{\hat{\mathcal{P}}_{t}, r, h}^{\pi}(s, a)|]\\
        &=  \mathbb{E}_{a\sim \pi(\cdot|s)}[|\mathcal{P}_{h}^{\star}V_{\mathcal{P}^{\star}, r, h+1}^{\pi}(s,a) - \hat{\mathcal{P}}_{t,h}V_{\hat{\mathcal{P}}_{t}, r, h+1}^{\pi}(s,a) |] =: (A).
    \end{align*}
    Then, the first claim ($\mathcal{P}=\hat{\mathcal{P}}_{t}$) follows from:
    \begin{align*}
        (A) &= \mathbb{E}_{a\sim \pi(\cdot|s)}[| \hat{\mathcal{P}}_{t,h}(V_{\mathcal{P}^{\star}, r, h+1}^{\pi} - V_{\hat{\mathcal{P}}_{t}, r, h+1}^{\pi})(s,a) + (\mathcal{P}_{h}^{\star} - \hat{\mathcal{P}}_{t,h})V_{\mathcal{P}^{\star}, r, h+1}^{\pi}(s,a)|] \\
        &\stackrel{(i)}{\leq} \mathbb{E}_{a\sim \pi(\cdot|s)}[ \hat{\mathcal{P}}_{t,h}V_{\hat{\mathcal{P}}_{t}, f_{t}, h+1}^{\pi}(s,a) + f_{t,h}(s,a)] \\
        &= V_{\hat{\mathcal{P}}_{t}, f_{t}, h}^{\pi}(s),
    \end{align*}
    where \((i)\) follows from the induction hypothesis and \(\Vert V_{\mathcal{P}, r, h}^{\pi}\Vert_{\infty} \leq 1\). The second claim ($\mathcal{P}=\mathcal{P}^{\star}$) follows from:
    \begin{align*}
        (A) &= \mathbb{E}_{a\sim \pi(\cdot|s)}[| \mathcal{P}_{h}^{\star}(V_{\mathcal{P}^{\star}, r, h+1}^{\pi} - V_{\hat{\mathcal{P}}_{t}, r, h+1}^{\pi})(s,a) + (\mathcal{P}_{h}^{\star} - \hat{\mathcal{P}}_{t,h})V_{\hat{\mathcal{P}}_{t}, r, h+1}^{\pi}(s,a)|] \\
        &\stackrel{(i)}{\leq} \mathbb{E}_{a\sim \pi(\cdot|s)}[ \mathcal{P}_{h}^{\star}V_{\mathcal{P}^{\star}, f_{t}, h+1}^{\pi}(s,a) + f_{t,h}(s,a)] \\
        &= V_{\mathcal{P}^{\star}, f_{t}, h}^{\pi}(s),
    \end{align*}
     where \((i)\) follows from the induction hypothesis and \(\Vert V_{\mathcal{P}, r, h}^{\pi}\Vert_{\infty} \leq 1\).
\end{proof}

\begin{lemma}\label{lemma:bounded_model_misspecification}(Uncertainty bounded model estimation error)
Given that the event \(\mathcal{E}\) occurs, we have for all \(t\in\mathbb{N}\) and any policy \(\pi\),
\begin{align*}
    &V_{\mathcal{P}^{\star}, f_{t}, 1}^{\pi, d_{1}} \leq 2H\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + 2HV_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi, d_{1}}, \textnormal{ and} \\
    &V_{\hat{\mathcal{P}}_{t}, f_{t}, 1}^{\pi, d_{1}} \leq \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi, d_{1}}.
\end{align*}

\end{lemma}
\begin{proof}
    For all \(h>1\),
    \begin{align*}
        \mathbb{E}_{(s, a)\sim d_{\hat{\mathcal{P}}_{t};h}^{\pi}}[f_{t,h}(s,a)]  &\stackrel{(i)}{\leq}  \mathbb{E}_{(s, a)\sim d_{\hat{\mathcal{P}}_{t};h-1}^{\pi}}[\min\{1, \alpha_{t}\Vert\hat{\phi}_{t,h-1}(s,a)\Vert_{\Sigma_{\rho_{t,h-1}, \hat{\phi}_{t,h-1}}^{-1}}\}] \\
        &\stackrel{(ii)}{\leq} \mathbb{E}_{(s, a)\sim d_{\hat{\mathcal{P}}_{t};h-1}^{\pi}}[\hat{b}_{t,h-1}(s,a)],
    \end{align*}
    where \((i)\) is by Lemma \ref{lemma:bonus_relations} and \(\Vert f_{t,h}\Vert_{\infty}\leq 1\) and \((ii)\) follows from the event \(\mathcal{E}\). Additionally, by Lemma \ref{lemma:bonus_relations}, we have,
    \[
       \mathbb{E}_{(s, a)\sim d_{\hat{\mathcal{P}}_{t};1}^{\pi}}[f_{t,1}(s,a)] \leq \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}},
    \]
    which, after summing over \(h\in[H]\), gives the second claim. Additionally,
    \begin{align*}
        V_{\mathcal{P}^{\star}, f_{t}, 1}^{\pi, d_{1}} &\leq V_{\hat{\mathcal{P}}_{t}, f_{t}, 1}^{\pi, d_{1}} + H|\frac{1}{H}V_{\mathcal{P}^{\star}, f_{t}, 1}^{\pi, d_{1}} - \frac{1}{H}V_{\hat{\mathcal{P}}_{t}, f_{t}, 1}^{\pi, d_{1}}| \\
        &\stackrel{(i)}{\leq} V_{\hat{\mathcal{P}}_{t}, f_{t}, 1}^{\pi, d_{1}} + HV_{\hat{\mathcal{P}}_{t}, f_{t}, 1}^{\pi, d_{1}} \\
        &\stackrel{(ii)}{\leq} 2H\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + 2HV_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi, d_{1}},
    \end{align*}
    where \((i)\) is by Lemma \ref{lemma:value_function_difference} and $(ii)$ follows from the second claim.
\end{proof}

Next, we introduce an optimism result similar to that of Lemma \ref{lemma:almost_optimism_at_init_dist}, which holds locally on the state-occupancy distribution of the behavior policies.


\begin{lemma}\label{lemma:local_optimism}(Almost Local Optimism)
Given that the event \(\mathcal{E}\) occurs, for all \(t\in\mathbb{N}\) and \(h\in[H]\),
\begin{align*}
    \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\mathcal{P}^{\star},r^{\star},h}^{\pi^{\star}}(s) - V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h}^{\pi^{\star}}(s)] &\leq 2H\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + 2HV_{\hat{\mathcal{P}}_{t},\hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}},
\end{align*}
where $\pi_{t}^{b}=\arg\max_{\pi\in\Pi}V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t}, 1}^{\pi, d_{1}}$.
    
\end{lemma}
\begin{proof}
    We have for all $h\in[H]$:
    \begin{align*}
        \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\mathcal{P}^{\star},r^{\star},h}^{\pi^{\star}}(s) - V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h}^{\pi^{\star}}(s)]
        &\leq \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\mathcal{P}^{\star},r^{\star},h}^{\pi^{\star}}(s) - V_{\hat{\mathcal{P}}_{t},r^{\star}, h}^{\pi^{\star}}(s)] \\
        &\leq \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[|V_{\mathcal{P}^{\star},r^{\star},h}^{\pi^{\star}}(s) - V_{\hat{\mathcal{P}}_{t},r^{\star}, h}^{\pi^{\star}}(s)|] \\
        &\stackrel{(i)}{\leq} \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\mathcal{P}^{\star},f_{t},h}^{\pi^{\star}}(s)] =: (A),
    \end{align*}
    where $(i)$ follows from Lemma \ref{lemma:value_function_difference}. Now, let $f_{t,i}^{(h:)}(s,a)=f_{t,i}(s,a)\mathbbm{1}\{i\geq h\}$ and $\pi_{t,i}^{(h:)^\star}(a|s)=\pi_{t}(a|s)\mathbbm{1}{\{i<h\}} + \pi^{\star}(a|s)\mathbbm{1}\{i\geq h\}$ for any $h\in [H]$. Then, 
    \begin{align*}
        (A) &= V_{\mathcal{P}^{\star}, f_{t}^{(h:)}, 1}^{\pi_{t}^{(h:)^{\star}}, d_{1}} \stackrel{(i)}{\leq} V_{\mathcal{P}^{\star}, f_{t}, 1}^{\pi_{t}^{(h:)^{\star}}, d_{1}} \stackrel{(ii)}{\leq} 2H\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + 2HV_{\hat{\mathcal{P}}_{t},\hat{b}_{t},1}^{\pi_{t}^{(h:)^{\star}}, d_{1}},
    \end{align*}
    where $(i)$ follows from $f_{t,h}\geq 0$ for all $h$ and $t$, and $(ii)$ follows from Lemma \ref{lemma:bounded_model_misspecification}. Now, the claim follows by the definition of $\pi_{t}^{b}$.
\end{proof}

We continue by providing a local simulation lemma.

\begin{lemma}\label{lemma:simulation_anytime}
    For all \(t\in\mathbb{N}\) and \(h\in[H]\), we have
\begin{align*}
    \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h}^{\pi_{t}}(s) - V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi_{t}}(s)]
    &\leq 2HV_{\mathcal{P}^{\star}, \hat{b}_{t} + f_{t},1}^{\pi_{t}, d_{1}}.
\end{align*}

\end{lemma}
\begin{proof}
    We have,
\begin{align*}
    &\mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h}^{\pi_{t}}(s) - V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi_{t}}(s)] \\
    &= \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[ Q_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h}^{\pi_{t}}(s, a) - Q_{\mathcal{P}^{\star},r^{\star}, h}^{\pi_{t}}(s, a)] \\
    &\leq \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[ \hat{b}_{t,h}(s, a)] + |\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[(\hat{\mathcal{P}}_{t,h} - \mathcal{P}_{h}^{\star})V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h+1}^{\pi_{t}}(s,a)]| \\
    &\qquad+ \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\mathcal{P}_{h}^{\star}(V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h+1}^{\pi_{t}} -V_{\mathcal{P}^{\star},r^{\star},h+1}^{\pi_{t}})(s,a)] \\
    &\stackrel{(i)}{\leq} \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\hat{b}_{t,h}(s, a)] + 2H\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[f_{t,h}(s,a)] \\
    &\qquad+ \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h+1}^{\pi_{t}}}[V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t}, h+1}^{\pi_{t}}(s) - V_{\mathcal{P}^{\star},r^{\star}, h+1}^{\pi_{t}}(s)],
\end{align*}
where \((i)\) follows from \(\Vert V_{\mathcal{P}, r^{\star} + \hat{b}_{t}}^{\pi}\Vert_{\infty} \leq 2H\). Unraveling the recursion gives the result.
\end{proof}

The previous four lemmata combined are enough to show that the bonus terms provide an almost optimistic estimate of the expected sub-optimality gaps incurred by the behavior policies of algorithm \ref{alg:UniSREP}.

\begin{lemma}\label{lemma:expected_suboptimalitygap_to_bonus}(Sub-optimality gap to bonus)
    Given that the event $\mathcal{E}$ occurs, we have for all \(t\in\mathbb{N}\) and \(h\in[H]\),
    \[
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s, a)] \leq 10H^{2}(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}}),
    \]
    where $\pi_{t}^{b}=\arg\max_{\pi\in\Pi}V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t}, 1}^{\pi, d_{1}}$.
\end{lemma}
\begin{proof}
We have for all $h\in[H]$ and $t\in\mathbb{N}$:
    \begin{align*}
    &\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s, a)] \\
    &\stackrel{(i)}{\leq }\mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\mathcal{P}^{\star}, r^{\star}, h}^{\pi^{\star}}(s) - V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi_{t}}(s)] \\
    &\stackrel{(ii)}{\leq } 
    \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\hat{\mathcal{P}}_{t},r^{\star} + \hat{b}_{t},h}^{\pi_{t}}(s) - V_{\mathcal{P}^{\star}, r^{\star}, h}^{\pi_{t}}(s) + 
    V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi^{\star}}(s)
    - V_{\hat{\mathcal{P}}_{t},r^{\star}+\hat{b}_{t},h}^{\pi^{\star}}(s)] \\
    & \stackrel{(iii)}{\leq} 2HV_{\mathcal{P}^{\star}, \hat{b}_{t},1}^{\pi_{t}, d_{1}} + 2HV_{\mathcal{P}^{\star}, f_{t},1}^{\pi_{t}, d_{1}} + \mathbb{E}_{s\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi^{\star}}(s)
    - V_{\hat{\mathcal{P}}_{t},r^{\star}+\hat{b}_{t},h}^{\pi^{\star}}(s)] \\
    & \stackrel{(iv)}{\leq} 2H\underbrace{V_{\mathcal{P}^{\star}, \hat{b}_{t},1}^{\pi_{t}, d_{1}}}_{=:(A)} + 2H\underbrace{V_{\mathcal{P}^{\star}, f_{t},1}^{\pi_{t}, d_{1}}}_{=:(B)} + 2H\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + 2HV_{\hat{\mathcal{P}}_{t},\hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}},
\end{align*}

where $(i)$ follows from the optimality of $\pi^{\star}$, $(ii)$ by the optimality of $\pi_{t}$, \((iii)\) follows from the local simulation  Lemma \ref{lemma:simulation_anytime} and $(iv)$ follows from the local optimism Lemma \ref{lemma:local_optimism}. Further,
\begin{align*}
    (A) &= V_{\mathcal{P}^{\star}, \hat{b}_{t},1}^{\pi_{t}, d_{1}} \\
    &\leq V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}, d_{1}} + H|\frac{1}{H}V_{\mathcal{P}^{\star}, \hat{b}_{t},1}^{\pi_{t}, d_{1}} - \frac{1}{H}V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}, d_{1}}| \\
    &\stackrel{(i)}{\leq} V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}, d_{1}} + HV_{\hat{\mathcal{P}}_{t}, f_{t},1}^{\pi_{t}, d_{1}} \\
    &\stackrel{(ii)}{\leq} V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}, d_{1}} + H(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}, d_{1}}) \\
    &\stackrel{(iii)}{\leq} 2HV_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}} + H\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}},
\end{align*}
where $(i)$ follows from Lemma \ref{lemma:value_function_difference}, $(ii)$ follows from Lemma \ref{lemma:bounded_model_misspecification} and $(iii)$ by the optimality of $\pi_{t}^{b}$. Similarly, 
\begin{align*}
    (B) &= V_{\mathcal{P}^{\star}, f_{t},1}^{\pi_{t}, d_{1}} \\
    &\stackrel{(i)}{\leq} 2H(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}, d_{1}}) \\
    &\stackrel{(ii)}{\leq} 2H(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}}),
\end{align*}
where $(i)$ follows from Lemma \ref{lemma:bounded_model_misspecification} and $(ii)$ follows from the optimality of $\pi_{t}^{b}$. Finally, we get:
\begin{align*}
    \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s, a)] &\leq 10H^{2}(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}}).
\end{align*}
\end{proof}

We can now leverage the fact that we eventually select only good representations, which leads to the following improved pseudo-regret bound.

\begin{lemma}\label{lemma:sublinear_pseudo_regret_with_UniSOFT}(Sub-linear pseudo-regret with UniSOFT representations)
    Let $\xi_{t}=t^{-1/3}$ and \(\alpha>0\). Suppose assumptions \ref{ass:realizability} (realizability), \ref{ass:unique_optimal_policy} (unique optimal policy), \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) and \ref{ass:expressivness} ($\alpha^{\star}$-expressive function space) hold. Additionally, if \(\alpha <1\), suppose that assumption \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) holds. Then, given that events \(\mathcal{E}(\delta)\) and \(\mathcal{F}(\delta)\) occur, there exists a constant \(\tau\) such that, for all \(T\geq\tau\), the behavior policies \(\{\pi_{t}\}_{t\geq1}\) learned by Algorithm \ref{alg:UniSREP} enjoy sub-linear pseudo-regret:
\[
    \mathcal{R}(T) \lesssim \frac{\sqrt{\tau}}{\xi_{\tau}} + \frac{1}{\lambda_{\textnormal{max}}^{\star}}H^{3}d|\mathcal{A}|^{1/2}T^{2/3}\log(4T|\Phi||\Psi|H/\delta) \lesssim \tilde{O}(T^{2/3})
\]
    
\end{lemma}
\begin{proof}
    Let \(\tau:=\{\tau_{\textnormal{unisoft}} \vee \tau_{\textnormal{inv}}\}\). Let $t\geq \tau$ be arbitrary. Then, since the event \(\mathcal{E}\) occurs by assumption, by Lemma \ref{lemma:expected_suboptimalitygap_to_bonus},
    we can bound the expected sub-optimality gaps for all \(h\in[H]\),

\[
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s, a)] \leq 10H^{2}(\sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}} + V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}}) =: (A).
\]

Further, according to Lemma \ref{lemma:sublinear_pseudo_regret_without_unisoft}, in the event \(\mathcal{E}\), \(\mathcal{R}(t) \leq g(t) =\tilde{O}(\sqrt{t}\xi_{t}^{-1})\) with \(\hat{\alpha}_{t}=\tilde{O}(\xi_{t}^{-1/2})\). We note that if \(\alpha=1\), then all representations are \(\alpha^{\star}\)-approximate and hence we do not require assumption \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) to guarantee their selection in Lemma \ref{lemma:alpha_star_selection}. By Lemma \ref{lemma:UniSOFT_selection_full_rank} and the events \(\mathcal{F}\) and \(\mathcal{E}\), for all \(h\in[H]\), the learned feature maps \(\hat{\phi}_{t,h}\) are non-redundant and UniSOFT. Then, by Lemma \ref{lemma:bounded_uncertainty} and the event \(\mathcal{F}\),
\begin{align*}
    & V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}} \leq \hat{\alpha}_{t}\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\hat{\mathcal{P}}_{t},h}^{\pi_{t}^{b}}}[\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\hat{\Sigma}_{t,h}^{-1}}] \\
    &\leq \frac{\hat{\alpha}_{t}H}{(\lambda_{\textnormal{max}}^{\star} t + \lambda_{t} - \sum_{i=1}^{t}\xi_{i-1} - g(t)\Delta_{\textnormal{min}}^{-1} - 18\sqrt{t\log(6tdH|\Phi|/\delta)})^{1/2}}\\
    &\leq \tilde{O}(\frac{t^{1/6}}{t^{1/2}}) = \tilde{O}(t^{-1/3}).
\end{align*}
Since $t$ was chosen arbitrarily, we get, for all \(T\geq \tau\):
\begin{align*}
    \mathcal{R}(T) &= \sum_{t=1}^{\tau}(V_{\mathcal{P}^{\star}, r^{\star},1}^{\pi^{\star},d_{1}} - V_{\mathcal{P}^{\star}, r^{\star},1}^{\pi_{t},d_{1}}) + \sum_{t=\tau+1}^{T}\sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star}, h}^{\pi_{t}}}[\Delta_{h}(s,a)] \\
    &\stackrel{(i)}{\lesssim} \tilde{O}(\frac{\sqrt{\tau}}{\xi_{\tau}}) + \frac{1}{\lambda_{\textnormal{max}}^{\star}}H^{3}d|\mathcal{A}|^{1/2}T^{2/3}\log(4T|\Phi||\Psi|H/\delta),
\end{align*}
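where \((i)\) follows by bounding the first sum with the pseudo-regret bound without UniSOFT representations given in Lemma \ref{lemma:sublinear_pseudo_regret_without_unisoft}, and the second sum with the bound \((A)\) on the expected sub-optimality gaps together with the \(\tilde{O}(t^{-1/3})\) bound on the bonus value derived above.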
\end{proof}

\begin{restatable}[Instance-dependent regret with UniSOFT representations, Theorem \ref{thm:instance_dependent_regret_bound_with_unisoft}]{theorem}{instancedependentregretwithunisoftformal}\label{thm:instance_dependent_regret_bound_with_unisoft_formal}
       Let $\xi_{t}=t^{-1/3}$ and \(\alpha\in(0, 1]\). Suppose assumptions \ref{ass:realizability} (realizability), \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap),
       \ref{ass:expressivness} ($\alpha^{\star}$-expressive function space) and \ref{ass:unique_optimal_policy} (unique optimal policy) hold. Additionally, if \(\alpha < 1\), suppose that assumption \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) holds. Then for any \(T\in\mathbb{N}\), UniSREP-UCB (Algorithm \ref{alg:UniSREP}) satisfies the following:
    \begin{align*}
    \mathbb{E}[\tilde{\mathcal{R}}(T)] &= \tilde{O}\left( H^{3}d^{2}|\mathcal{A}|(\tau_{\textnormal{good}}\wedge T)^{5/6} + \frac{1}{\lambda_{\textnormal{max}}^{\star}}H^{4}d|\mathcal{A}|^{1/2}T^{2/3}\right),
    \end{align*}
    where 
    \begin{align*}
    \tau_{\textnormal{good}}&\lesssim\{ \kappa_{3}^{6} \cdot \log^{12}(\kappa_{3} \cdot \kappa_{2}) \vee  \kappa_{1}^{6}\cdot \log^{12}(\kappa_{1} \cdot \kappa_{2}) \}\\
    &\lesssim\frac{H^{12}d^{12}|\mathcal{A}|^{6}}{(\Delta_{\textnormal{min}}\{\alpha d_{\textnormal{min}}^{\star} \wedge \lambda_{\textnormal{max}}^{\star}\})^{6}}\cdot \log^{12}(TH^{3}d^{3/2}|\mathcal{A}||\Phi||\Psi|),
    \end{align*}
    with \(\kappa_{1}=\frac{H^{2}d^{2}|\mathcal{A}|}{\alpha\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}}\), \(\kappa_{2}=TH|\Phi||\Psi|\), \(\kappa_{3}=\frac{H^{2}d^{2}|\mathcal{A}|}{\lambda_{\textnormal{max}}^{\star}\Delta_{\textnormal{min}}}\) and 
    \(\lambda_{\textnormal{max}}^{\star} = \min_{\tilde{\alpha}\leq\alpha}\max_{\phi\in\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}}\lambda^{\star}(\phi)\).
\end{restatable}

%\instancedependentregretwithunisoft*

\begin{proof}
Let \(\tau_{\textnormal{good}}:=\{\tau_{\textnormal{unisoft}} \vee \tau_{\textnormal{inv}}\}\) and \(T\geq\tau_{\textnormal{good}}\) be given and fixed. Choose \(\delta = T^{-1}\). Recall that Algorithm \ref{alg:UniSREP} explores for $H$ time steps, for each $h\in[H]$ and episode $t$, by rolling into time step $h-1$ with policy $\pi_{t-1}$, taking actions according to $\tilde{\pi}_{t,h-1}$ and $\tilde{\pi}_{t,h}$ and finally, rolling out to time step $H$ with policy $\pi_{t-1}$. Let us denote by \(\tilde{V}_{t,h}\)
the cumulative expected reward obtained by Algorithm \ref{alg:UniSREP} in episode $t$ and time step $h$. Then,
\begin{align*}
     &\mathbb{E}_{\delta, \xi}[\tilde{\mathcal{R}}(T)] \\&= \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h})] \\
    &\leq \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=1\}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h})] \\
    &\qquad+ \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=0\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h})] +  \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{\mathcal{E}^{c}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h})]\\
    &\qquad + \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{\mathcal{F}^{c}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h})]
\end{align*}
\begin{align*}
    &\stackrel{(i)}{\leq}  \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=1\}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - \tilde{V}_{t,h})]\\
    &\qquad+ \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=0\} + \mathbbm{1}\{\mathcal{E}^{c}(\delta)\} + \mathbbm{1}\{\mathcal{F}^{c}(\delta)\}] \\
    &\stackrel{(ii)}{\leq} \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\sum_{h=1}^{H}\mathbbm{1}\{e_{t}=1\}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t-1}, d_{1}})] + H(2T\delta + \sum_{t=1}^{T}\xi_{t}) \\
    &\leq H\mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t-1}, d_{1}})] + H(2 + \sum_{t=1}^{T}t^{-1/3}) \\
    &\stackrel{(iii)}{\leq}  H\underbrace{\mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}})]}_{(A)} + \frac{3}{2}HT^{2/3} + 3H,
\end{align*}
where \((i)\) follows from $\Vert V_{\mathcal{P}, r^{\star}}^{\pi}\Vert_{\infty}\leq1$, \((ii)\) follows from \(\tilde{\pi}_{t}\) and \(\pi_{t-1}\) agreeing on the event \(e_{t}=1\), Lemma \ref{lemma:event_occurs} and Lemma \ref{lemma:bounded_uncertainty} and $(iii)$ follows from an index shift and $\Vert V_{\mathcal{P}, r^{\star}}^{\pi}\Vert_{\infty}\leq1$. Now, we can leverage the pseudo-regret result of Lemma \ref{lemma:sublinear_pseudo_regret_with_UniSOFT} to bound term $(A)$,
\begin{align*}
    (A) &= \mathbb{E}_{\delta, \xi}[\sum_{t=1}^{T}\mathbbm{1}\{\mathcal{E}(\delta)\}\mathbbm{1}\{\mathcal{F}(\delta)\}(V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star}, 1}^{\pi_{t}, d_{1}})] \\
    &\lesssim \frac{\sqrt{\tau_{\textnormal{good}}}}{\xi_{\tau_{\textnormal{good}}}} + \frac{1}{\lambda_{\textnormal{max}}^{\star}}H^{3}d|\mathcal{A}|^{1/2}T^{2/3}\log(4 T|\Phi||\Psi|H/\delta) \\
    &\lesssim \tau_{\textnormal{good}}^{5/6} + \tilde{O}(T^{2/3}).
\end{align*}
Substituting \(\tau_{\textnormal{good}}\) with the sufficient condition in Lemma \ref{lemma:critical_episodes} with \(\gamma=3\) and using \(T\gtrsim 
 a\log^{n}(ab)\) as a sufficient condition for \(T\geq a\log^{n}(bT)\), concludes the proof.
\end{proof}

