Here we provide the omitted proofs of the main paper. In particular, Section \ref{sec:sublinear-regret} provides the proof for the baseline result in Theorem \ref{lemma:sublinear_expected_regret_without_unisoft}, in Section \ref{sec:unisoft_selection} we show how we guarantee the selection of good representations, and in Section \ref{sec:improved_pseudo_regret} and Section \ref{sec:constant_pseudo_regret} we show how good representations can be leveraged to obtain an improved regret bound (Theorem \ref{thm:instance_dependent_regret_bound_with_unisoft}) and constant regret (Theorem \ref{thm:optimal_policy_identification}), respectively. Finally, in Section \ref{sec:existance_unisoft} we discuss the existence of good representations, in Section \ref{sec:mult_pol} we show how our results can be extended to multiple optimal policies, and Section \ref{sec:auxiliary_lemmas} provides auxiliary results.

We begin by introducing notation and good events. Let us denote
\[
    \tilde{\pi}_{t,h}(a|s) = \xi_{t-1}\cdot\frac{1}{|\mathcal{A}|} + (1-\xi_{t-1})\cdot\pi_{t-1,h}(a|s)
\]
as the roll-out policy in episode $t$, which, with probability \(\xi_{t-1}\), explores by taking an action uniformly at random and otherwise selects an action according to the behavior policy \(\pi_{t-1,h}\) from the previous episode. Importantly, we assume that the sequence \((\xi_{t})_{t=1}^{T}\) is decreasing. Note that policy \(\tilde{\pi}_{t,h}\) collects the transitions stored in the datasets of Algorithm~\ref{alg:UniSREP} and only interacts with the environment after sampling a state from \(d_{\mathcal{P}^{\star},h-1}^{\pi_{t-1}}\). Further, we denote the average roll-out policy as
\[
    \bar{\pi}_{t,h}(a|s)=\frac{1}{t}\sum_{i=0}^{t-1}\left(\xi_{i}\cdot\frac{1}{|\mathcal{A}|} + (1-\xi_{i})\cdot\pi_{i,h}(a|s)\right).
\] 
We define the mixture occupancy distributions 
\begin{align*}
    &\rho_{t,h}(s) = \frac{1}{t}\sum_{i=0}^{t-1}d_{\mathcal{P}^{\star},h}^{{\pi}_{i}}(s), \\ &\gamma_{t,h}(s,a) = \frac{1}{t}\sum_{i=0}^{t-1}d_{\mathcal{P}^{\star},h}^{{\pi}_{i}}(s,a),\\ 
    &\rho_{t,h}(s,a) = \rho_{t,h}(s)\bar{\pi}_{t,h}(a|s),
\end{align*}
the next-state marginal distribution and next-state mixture occupancy distribution
\[
    \rho_{t,h}'(s') = \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}\rho_{t,h-1}(s,a)\mathcal{P}_{h}^{\star}(s'|s,a), \textnormal{ and}
\]
\[
    \rho_{t, h}'(s, a) = \rho_{t,h}'(s)\bar{\pi}_{t,h}(a|s),
\]
respectively.
Denote the total variation distance between the estimated model and the true model as
\[
    f_{t,h}(s,a) := \Vert\hat{\mathcal{P}}_{h, t}(\cdot|s, a) - \mathcal{P}_{h}^{\star}(\cdot|s,a)\Vert_{\textnormal{TV}}.
\]
Additionally, let
\begin{align*}
    \Sigma_{\rho_{t}, \phi} &= t\mathbb{E}_{(s,a)\sim\rho_{t}}[\phi(s,a)\phi(s,a)^{T}] + \lambda_{t}I,
%    \\
%    \hat{\Sigma}_{t,h, \phi} &= t\mathbb{E}_{(s,a)\in\mathcal{D}_{t,h}}[\phi(s,a)\phi(s,a)^{T}] + \lambda_{t}I,
\end{align*}
where \(\lambda_{t} = c_{1}d\log(4tH|\Phi|/\delta)\), \(c_{1}\) is a constant and \(\rho_{t}\in\Delta(\mathcal{S}\times\mathcal{A})\) is an episode-dependent distribution over the state-action space. Further, we define the following two good events:
\begin{align*}
    &\mathcal{E}_{1}(\delta) = \{\forall t\in\mathbb{N}, h\in[H]:
 \mathbb{E}_{(s,a)\sim \rho_{t,h}'}[f_{t,h}(s,a)^{2}] \leq \zeta_{t} \}, \\
    &\mathcal{E}_{2}(\delta) = \{\forall t\in\mathbb{N}, h\in[H], s\in\mathcal{S}, a\in\mathcal{A}: \\
    &\qquad \frac{1}{5}\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\Sigma_{\rho_{t,h},\hat{\phi}_{t,h}}^{-1}} \leq \Vert\hat{\phi}_{t,h}(s,a)\Vert_{\hat{\Sigma}_{t,h}^{-1}} \leq 3\Vert\hat{\phi}_{t,h}(s,a)\Vert_{\Sigma_{\rho_{t,h},\hat{\phi}_{t,h}}^{-1}}
    \},
\end{align*}
where \(\zeta_{t}=\frac{2\log(4t|\Phi||\Psi|H/\delta)}{t}\). Finally, let \(\mathcal{E}(\delta):=\mathcal{E}_{1}(\delta/2)\cap\mathcal{E}_{2}(\delta/2)\). The good event $\mathcal{E}$ guarantees the convergence of the MLE oracle \citep{uehara2021representation} and the concentration of the bonus term.

\begin{lemma}\label{lemma:event_occurs}
    Fix \(\delta\in(0,1)\). Suppose Assumption~\ref{ass:realizability} (realizability) holds and we run Algorithm~\ref{alg:UniSREP}. Then, with probability at least \(1-\delta\), the event \(\mathcal{E}(\delta)\) occurs.
\end{lemma}
\begin{proof}
    By Lemma \ref{lemma:MLE}, with probability at least \(1 - \delta/2\), event \(\mathcal{E}_{1}(\delta/2)\) occurs.
    Furthermore, by Lemma~11 in \cite{uehara2021representation}, with probability at least \(1-\delta/2\), event \(\mathcal{E}_{2}(\delta/2)\) occurs. Taking a union bound concludes the proof.
\end{proof}

\section{Sub-Linear Pseudo-Regret without Good Representations}\label{sec:sublinear-regret}
\input{uai2025-template/Appendix Sections/Sublinear REP-UCB}

\section{Selecting Good Representations}\label{sec:unisoft_selection}
\input{uai2025-template/Appendix Sections/UniSOFT Selection}

\section{Improved Pseudo-Regret with Good Representations}\label{sec:improved_pseudo_regret}
\input{uai2025-template/Appendix Sections/Improved REP-UCB}

\section{Constant Pseudo-Regret with Good Representations}\label{sec:constant_pseudo_regret}
\input{uai2025-template/Appendix Sections/Constant Regret}

\section{Existence of Good Representations}\label{sec:existance_unisoft}
\input{uai2025-template/Appendix Sections/UniSOFT Existence}

\section{Multiple Optimal Policies}\label{sec:mult_pol}
\input{uai2025-template/Appendix Sections/Multiple Optimal Policies}

\section{Auxiliary Results}\label{sec:auxiliary_lemmas}
\input{uai2025-template/Appendix Sections/Auxilliary Results}