In this section, we show how the expressiveness assumption \ref{ass:expressivness} and the constrained optimization objective (Algorithm \ref{alg:UniSREP}, Line \ref{alg:oracle}) play together to guarantee the selection of good representations. The analysis builds on the sub-linear regret result for the behavior policies (Lemma \ref{lemma:sublinear_pseudo_regret_without_unisoft}) provided in the previous section.

\paragraph{Selecting \(\alpha^{\star}\)-approximate representations}

We start by introducing an important result provided by \cite{huang2022tiered}, which states that the average occupancy distribution induced by any sequence of deterministic policies that achieve low regret eventually provides a good approximation of the occupancy distribution of the optimal policy (assuming the optimal policy is unique).

Let us denote \(\Pi^{\star}\) as the set of all optimal (deterministic) policies and \(\Pi_{h}^{\star}(s)\) as the set of all optimal actions in state \(s\in\mathcal{S}\) and time step \(h\in[H]\). Then, we construct \(\tilde{\pi}_{t}^{\star}:=\{\tilde{\pi}_{t,h}^{\star}\}_{h\in[H]}\), where for each \(h\in[H]\),
\begin{equation*}
    \tilde{\pi}_{t,h}^{\star}(s) =
    \begin{cases}
        \pi_{t,h}(s) & \textnormal{if } \pi_{t,h}(s)\in\Pi_{h}^{\star}(s)  \\
        Select(\Pi_{h}^{\star}(s)) & \textnormal{otherwise}
    \end{cases},
\end{equation*}
where \(Select\) is a function which returns a fixed element of some set and \(\pi_{t}\) is the behavior policy of algorithm \ref{alg:UniSREP} at episode \(t\in\mathbb{N}\). We define the mixture occupancy distribution of our constructed optimal policies \(\tilde{\pi}_{t}^{\star}\) as 
\[
    \tilde{\gamma}_{t,h}^{\star}(s,a) = \frac{1}{t}\sum_{i=0}^{t-1}d_{\mathcal{P}^{\star},h}^{\tilde{\pi}_{i}^{\star}}(s,a).
\] 
Note that \(\tilde{\gamma}_{t,h}^{\star} \equiv d_{\mathcal{P}^{\star},h}^{\pi^{\star}}\) whenever there exists a unique optimal policy (Assumption \ref{ass:unique_optimal_policy}). 

\begin{theorem}\label{thm:occupancy_bound_sequence_opt_policies_mult_pol}(\citep{huang2022tiered}, Theorem 4.7)
    Suppose that we run algorithm \ref{alg:UniSREP}. Then, for all \(h\in[H]\) and \((s,a)\in\mathcal{S}\times\mathcal{A}\),

    \[
        \sum_{i=1}^{t}d_{\mathcal{P}^{\star},h}^{\pi_{i}}(s,a) \geq \sum_{i=1}^{t}d_{\mathcal{P}^{\star},h}^{\tilde{\pi}_{i}^{\star}}(s,a) - \frac{1}{\Delta_{\textnormal{min}}}\sum_{i=1}^{t}\left(V_{\mathcal{P}^{\star}, r^{\star},1}^{\tilde{\pi}^{\star}_{i}, d_{1}} - V_{\mathcal{P}^{\star}, r^{\star},1}^{\pi_{i}, d_{1}}\right).
    \]
    
\end{theorem}

\begin{corollary}\label{corr:occupancy_bound}
    Suppose that we run algorithm \ref{alg:UniSREP} and that assumption \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) holds. Then, Theorem \ref{thm:occupancy_bound_sequence_opt_policies_mult_pol} implies, for all \(h\in[H]\), \(t\in\mathbb{N}\) and \((s,a)\in\mathcal{S}\times\mathcal{A}\),
    \[
        \tilde{\gamma}_{t,h}^{\star}(s,a) \leq \gamma_{t,h}(s,a) + \frac{\mathcal{R}(t)}{t\Delta_{\textnormal{min}}}.
    \]
\end{corollary}

We can leverage the above corollary to show that, whenever there exists a unique optimal policy, the MLE oracle converges uniformly on the optimal occupancy distribution, provided that the distribution is well defined for all states. Subsequently, for any given $\alpha$, there must exist an episode after which algorithm \ref{alg:UniSREP} will only select representations that are \(\alpha^{\star}\)-approximate.

\begin{lemma}\label{lemma:alpha_star_selection}(Selecting \(\alpha^{\star}\)-representations)
    Fix any \(\alpha>0\). Assume there exists an increasing sub-linear function $g$ such that \(\mathcal{R}(t) \leq g(t)\) for all \(t\in\mathbb{N}\). Suppose we run algorithm \ref{alg:UniSREP} and assumptions \ref{ass:unique_optimal_policy} (unique optimal policy), \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) and \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) hold. Then, given that the event \(\mathcal{E}\) occurs, there exists an episode \(\tau_{\alpha}\), such that for all \(t\geq \tau_{\alpha}\) and \(h\in[H]\), the learned feature maps \(\hat{\phi}_{t,h}\) are \(\alpha^{\star}\)-approximate, where
    \[
        \tau_{\alpha} := \min\{t|t>\frac{1}{\alpha}\left(\frac{g(t)}{\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}} + \frac{|\mathcal{A}|}{\xi_{t}}\sqrt{2t\log(4t|\Phi||\Psi|H/\delta)}\right)\}.
    \]
\end{lemma}
\begin{proof}
    Let $t\in\mathbb{N}$ be arbitrary. Then, for all \(h\in[H]\),
    \begin{align*}
        \mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi^{\star}}}[f_{t,h}(s,a)] &= \sum_{(s,a)\in\mathcal{S}\times\mathcal{A}}d_{\mathcal{P}^{\star},h}^{\pi^{\star}}(s,a)f_{t,h}(s,a) \\
        &\stackrel{(i)}{\leq}\sum_{(s,a):d_{\mathcal{P}^{\star},h}^{\pi^{\star}}(s,a)>0}(\gamma_{t,h}(s,a) + \frac{\mathcal{R}(t)}{t\Delta_{\textnormal{min}}})f_{t,h}(s,a) \\
        &\stackrel{(ii)}{\leq}\mathbb{E}_{(s,a)\sim \gamma_{t,h}}[f_{t,h}(s,a)] + \frac{g(t)}{t\Delta_{\textnormal{min}}} \sum_{(s,a):d_{\mathcal{P}^{\star},h}^{\pi^{\star}}(s,a)>0}\frac{d_{\mathcal{P}^{\star},h}^{\pi^{\star}}(s,a)}{d_{\textnormal{min}}^{\star}} \\
        &\stackrel{(iii)}{\leq}\sqrt{\frac{|\mathcal{A}|^{2}}{\xi_{t}^{2}}\mathbb{E}_{(s,a)\sim \rho_{t,h}'}[f_{t,h}(s,a)^{2}]} + \frac{g(t)}{t\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}} \\
        &\stackrel{(iv)}{\leq} \frac{|\mathcal{A}|}{\xi_{t}}\sqrt{\zeta_{t}} + \frac{g(t)}{t\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}},
    \end{align*}
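    where \((i)\) is by Corollary \ref{corr:occupancy_bound}, \((ii)\) follows from \(\Vert f_{t,h}\Vert_{\infty}\leq 1\), \(\mathcal{R}(t)\leq g(t)\) and \(d_{\mathcal{P}^{\star},h}^{\pi^{\star}}(s,a)\geq d_{\textnormal{min}}^{\star}\) for every \((s,a)\) in the support of \(d_{\mathcal{P}^{\star},h}^{\pi^{\star}}\) (Assumption \ref{ass:min_optimal_occupancy_exists}), \((iii)\) is by importance sampling and Jensen's inequality and \((iv)\) follows from the event \(\mathcal{E}\). Since \(g\) is sub-linear, the above quantity decreases with \(t\). Solving for \(t\) yields the result. 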
\end{proof}

\paragraph{Selecting non-redundant UniSOFT representations}

Although we can now be sure to select $\alpha^{\star}$-approximate representations, we still need to ensure that the UniSOFT loss in equation \ref{eq:unisoft_loss} will lead to Algorithm \ref{alg:UniSREP} actually selecting UniSOFT representations. Hence, we want to relate the eigenvalues of the expected covariance matrix of the optimal policy, which tells us if a feature map is UniSOFT, to the eigenvalues of the sample covariance matrix, which are captured by the UniSOFT loss in equation \ref{eq:unisoft_loss}. We define the following good events:
\begin{align*}
    \mathcal{F}_{1}(\delta) &:= \{\forall t\in\mathbb{N}, h\in[H], \phi\in\Phi:\\
    &\qquad \Sigma_{t,h} \succcurlyeq t\Sigma_{t,h}^{\star} + \lambda_{t} I - 2I\sum_{i=1}^{t}\xi_{i-1} - \Delta_{\textnormal{min}}^{-1}g(t)I - 18I\sqrt{t\log(6tdH|\Phi|/\delta)}\} \\
    \mathcal{F}_{2}(\delta) &:= \{\forall t\in\mathbb{N}, h\in[H], \phi\in\Phi:\\
    &\qquad \Sigma_{t,h} \preccurlyeq t\Sigma_{t,h}^{\star} + \lambda_{t} I + 2I\sum_{i=1}^{t}\xi_{i-1} + \Delta_{\textnormal{min}}^{-1}g(t)I + 18I\sqrt{t\log(6tdH|\Phi|/\delta)}\},
\end{align*}
where \(\Sigma_{t,h}^{\star}=\mathbb{E}_{(s,a)\sim \tilde{\gamma}_{t,h}^{\star}}[\phi(s,a)\phi(s,a)^{T}]\), \(\Sigma_{t,h} = \sum_{(s,a)\in\mathcal{D}_{t,h}}\phi_{h}(s,a)\phi_{h}(s,a)^{T}\) and $g$ is any increasing function such that \(\mathcal{R}(t)\leq g(t)\) for all \(t\in\mathbb{N}\). In addition, define \(\mathcal{F}(\delta):=\mathcal{F}_{1}(\delta/2)\cap\mathcal{F}_{2}(\delta/2)\).

\begin{lemma}\label{lemma:eigenvalue_bounds}(Eigenvalue bounds)
   Assume that there exists an increasing sub-linear function $g$ such that \(\mathcal{R}(t) \leq g(t)\) for all \(t\in\mathbb{N}\). Assume that we run Algorithm \ref{alg:UniSREP} and that assumption \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) holds. Then, with probability at least \(1-\delta\), the event \(\mathcal{F}(\delta)\) occurs.
\end{lemma}
\begin{proof}  
    Recall that algorithm \ref{alg:UniSREP} produces for each time step $h\in[H]$, one trajectory \(\tau_{h}\), in any episode $t$. Furthermore, for each trajectory \(\tau_{h}\), we only use the transition at the time step $h$ to construct the empirical covariance matrix \(\hat{\Sigma}_{t,h}\).
    
    \textbf{Upper bound:}  Let \(\tau^{(t,h)}\) denote the trajectory produced by rolling in with the behavior policy \(\pi_{t-1}\) and then taking action according to \(\tilde{\pi}_{t,h}\) in episode \(t\in\mathbb{N}\) for time step \(h\in[H]\). Additionally, \((s_{h}^{\tau}, a_{h}^{\tau})\) denotes a state-action pair at time step \(h\in[H]\) of trajectory \(\tau\). We define the set of trajectories of length \(h\in[H]\) under which the (deterministic) behavior policy in some episode \(t\in\mathbb{N}\) is optimal:
    \begin{equation*}
        \Gamma_{h,t}^{\star} = \{\tau\in\Gamma_{h}:\pi_{t-1,i}(s_{i}^{\tau})=\tilde{\pi}_{t-1,i}^{\star}(s_{i}^{\tau}) \text{ for } i = 1, ..., h\},
    \end{equation*}
    where \(\Gamma_{h}\) denotes the set of trajectories of length \(h\in[H]\). The distribution over trajectories induced by any (deterministic) policy \(\pi\) is given by
    \begin{align*}
        \rho_{h}^{\pi} = d_{1}(s_{1})\mathbbm{1}[a_{1}=\pi_{1}(s_{1})]\mathcal{P}_{1}^{\star}(s_{2}|a_{1}, s_{1})...\mathcal{P}_{h-1}^{\star}(s_{h}|a_{h-1}, s_{h-1})\mathbbm{1}[a_{h}=\pi_{h}(s_{h})].
    \end{align*}
    Additionally, for any (deterministic) policy \(\pi\), we denote
    \begin{align*}
        \rho_{h}^{\pi, \xi} = d_{1}(s_{1})\mathbbm{1}[a_{1}=\pi_{1}(s_{1})]\mathcal{P}_{1}^{\star}(s_{2}|a_{1}, s_{1})...\mathcal{P}_{h-1}^{\star}(s_{h}|a_{h-1}, s_{h-1})\tilde{\pi}_{h, \xi}(a_{h}|s_{h}),
    \end{align*}
    where \(\tilde{\pi}_{h, \xi}(a_{h}|s_{h}) = \frac{\mathbbm{1}[e = 0]}{|\mathcal{A}|} + \mathbbm{1}[e = 1]\mathbbm{1}[a_{h}=\pi_{h}(s_{h})]\) and \(e\sim \textnormal{Ber}(1-\xi)\),
    as the trajectory distribution induced by algorithm \ref{alg:UniSREP}.
    Finally, we denote \(\tau_{1:h}^{(t, h)}\) as the trajectory \(\tau^{(t, h)}\) cut off at time step \(h\in[H]\). Then,
\begin{align*}
    \Sigma_{t,h} - \lambda_{t} I &= \sum_{i=1}^{t}\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T} \\
    &\preccurlyeq \underbrace{\sum_{i=1}^{t}\mathbbm{1}[e_{i}=1]\mathbbm{1}[\tau_{1:h}^{(i, h+1)}\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T}}_{(A)} \\
    &\qquad+\underbrace{\sum_{i=1}^{t}\mathbbm{1}[\tau_{1:h}^{(i, h+1)}\notin\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T}}_{(B)} \\
    &\qquad+\underbrace{\sum_{i=1}^{t}\mathbbm{1}[e_{i} = 0]\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T}}_{(C)}
\end{align*}
Then, with probability of at least \(1-\delta/6\), for all \(t\in\mathbb{N}\) and all \(h\in[H]\) and \(\phi\in\Phi\),
\begin{align*}
    (A) &= \sum_{i=1}^{t}\mathbbm{1}[e_{i}=1]\mathbbm{1}[\tau_{1:h}^{(i, h+1)}\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T} \\ &= \sum_{i=1}^{t}\mathbbm{1}[e_{i}=1]\mathbbm{1}[\tau_{1:h}^{(i, h+1)}\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau^{(i, h+1)}},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau^{(i, h+1)}}))\phi(s_{h}^{\tau^{(i, h+1)}},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau^{(i, h+1)}}))^{T} \\
    &= \sum_{i=1}^{t}\mathbb{E}_{\tau\sim\rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[e=1]\mathbbm{1}[\tau\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))^{T}]\\ 
    &\qquad+ \sum_{i=1}^{t}\mathbbm{1}[e_{i}=1]\mathbbm{1}[\tau_{1:h}^{(i, h+1)}\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau^{(i, h+1)}},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau^{(i, h+1)}}))\phi(s_{h}^{\tau^{(i, h+1)}},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau^{(i, h+1)}}))^{T} \\
    &\qquad- \sum_{i=1}^{t}\mathbb{E}_{\tau\sim\rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[e=1]\mathbbm{1}[\tau\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))^{T}]\\
    &\stackrel{(i)}{\preccurlyeq}\underbrace{ \sum_{i=1}^{t}\mathbb{E}_{\tau\sim\rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[e=1]\mathbbm{1}[\tau\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))^{T}]}_{(A1)} + 8I\sqrt{t\log(6tdH|\Phi|/\delta)},
\end{align*}
where \((i)\) follows from \(\Vert\phi_{h}\Vert_{2}\leq 1\) and Proposition \ref{prop:matrix_azuma} in combination with a union bound over all episodes \(t\in\mathbb{N}\), time steps \(h\in[H]\) and feature maps \(\phi\in\Phi\). Further,
\begin{align*}
    (A1) &= \sum_{i=1}^{t}\mathbb{E}_{\tau\sim\rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[e=1]\mathbbm{1}[\tau\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))^{T}] \\
    &\stackrel{(i)}{=} \sum_{i=1}^{t}\mathbb{E}_{\tau\sim\rho_{h}^{\pi_{i-1}}}[\mathbbm{1}[\tau\in\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))\phi(s_{h}^{\tau},\tilde{\pi}_{i-1,h}^{\star}(s_{h}^{\tau}))^{T}] \\
    &\stackrel{(ii)}{\preccurlyeq} \sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\tilde{\pi}_{i-1}^{\star}}}[\phi(s_{h}^{\tau},a_{h}^{\tau})\phi(s_{h}^{\tau},a_{h}^{\tau})^{T}] \\
    &\stackrel{(iii)}{=} t\mathbb{E}_{(s,a)\sim \tilde{\gamma}_{t,h}^{\star}}[\phi(s,a)\phi(s,a)^{T}],
\end{align*}
\noindent where \((i)\) follows from \(\rho_{h}^{\pi, \xi}\) and \(\rho_{h}^{\pi}\) agreeing on the event \(e=1\) and \((ii)\) follows from the occupancy distributions \(d_{\mathcal{P}^{\star},h}^{\tilde{\pi}_{t}^{\star}}\) and  \(d_{\mathcal{P}^{\star},h}^{\pi_{t}}\) agreeing on \(\Gamma_{h,t}^{\star}\) and for $(iii)$ recall that \(\tilde{\gamma}_{t,h}^{\star}(s,a) = \frac{1}{t}\sum_{i=0}^{t-1}d^{\tilde{\pi}_{i}^{\star}}_{\mathcal{P}^{\star}, h}(s,a)\). Similarly, with probability of at least \(1-\delta/6\), for all \(t\in\mathbb{N}\) and all \(h\in[H],\phi\in\Phi\),
\begin{align*}
     (B) &= \sum_{i=1}^{t}\mathbbm{1}[\tau_{1:h}^{(i, h+1)}\notin\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T} \\ 
    &\stackrel{(i)}{\preccurlyeq} \sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[\tau\notin\Gamma_{h,i}^{\star}]\phi(s_{h}^{\tau},a_{h}^{\tau})\phi(s_{h}^{\tau},a_{h}^{\tau})^{T}] + 8I\sqrt{t\log(6tdH|\Phi|/\delta)} \\
    &\stackrel{(ii)}{\preccurlyeq} \underbrace{I\sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[\tau\notin\Gamma_{h,i}^{\star}]]}_{(B1)} + 8I\sqrt{t\log(6tdH|\Phi|/\delta)},
\end{align*}
where \((i)\) follows, similarly to before, from Proposition \ref{prop:matrix_azuma} in combination with a union bound and \((ii)\) is by \(\Vert\phi_{h}\Vert_{2}\leq 1\). Further,
\begin{align*}
    (B1) &= I\sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[\tau\notin\Gamma_{h,i}^{\star}]] \\
    &= I\sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[e=1]\mathbbm{1}[\tau\notin\Gamma_{h,i}^{\star}]] + I\sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\pi_{i-1}, \xi_{i-1}}}[\mathbbm{1}[e=0]\mathbbm{1}[\tau\notin\Gamma_{h,i}^{\star}]] \\
    &\stackrel{(i)}{\preccurlyeq} I\sum_{i=1}^{t}\mathbb{E}_{\tau\sim \rho_{h}^{\pi_{i-1}}}[\mathbbm{1}[\tau\notin\Gamma_{h,i}^{\star}]] + I\sum_{i=1}^{t}\mathbb{E}_{e\sim\textnormal{Ber}(1-\xi_{i-1})}[\mathbbm{1}[e=0]]\\
     &\stackrel{(ii)}{\preccurlyeq} I\sum_{i=1}^{t}\sum_{h'=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h'}^{\pi_{i-1}}}[\mathbbm{1}[a \notin \Pi_{h'}^{\star}(s)]] + I\sum_{i=1}^{t}\xi_{i-1}\\
    &\preccurlyeq I\sum_{i=1}^{t}\sum_{h'=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h'}^{\pi_{i-1}}}[\mathbbm{1}[\Delta_{h'}(s,a) \geq \Delta_{\textnormal{min}}]] + I\sum_{i=1}^{t}\xi_{i-1} \\
    &\preccurlyeq 
    I\frac{1}{\Delta_{\textnormal{min}}}\sum_{i=1}^{t}\sum_{h'=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h'}^{\pi_{i-1}}}[\Delta_{h'}(s,a)] + I\sum_{i=1}^{t}\xi_{i-1}\\
    &\stackrel{(iii)}{=} \frac{\mathcal{R}(t)}{\Delta_{\textnormal{min}}}I + I\sum_{i=1}^{t}\xi_{i-1},
\end{align*}
where \((i)\) follows from \(\rho_{h}^{\pi, \xi}\) and \(\rho_{h}^{\pi}\) agreeing on the event \(e=1\), \((ii)\) follows from the definition of \(\tilde{\pi}_{t,h}^{\star}\) and \((iii)\) follows from Lemma \ref{lemma:regret_gap_equivalence}. Finally, with probability at least \(1-\delta/6\), for all \(t\in\mathbb{N}\) and \(h\in[H]\),
\begin{align*}
    (C) &= \sum_{i=1}^{t}\mathbbm{1}[e_{i} = 0]\phi_{h}(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})\phi_{h}(s_{h}^{\tau^{(i, h+1)}},a_{h}^{\tau^{(i, h+1)}})^{T}\\ &\stackrel{(i)}{\preccurlyeq} I\sum_{i=1}^{t}\mathbbm{1}[e_{i} = 0] \\
    &\stackrel{(ii)}{\preccurlyeq} I(\sum_{i=1}^{t}\mathbb{E}_{e \sim \textnormal{Ber}(1-\xi_{i-1})}[\mathbbm{1}[e = 0]] + \sqrt{t\log(6tH/\delta)}) \\
    &\preccurlyeq I\sum_{i=1}^{t}\xi_{i-1} + I\sqrt{t\log(6tH/\delta)},
\end{align*}
where \((i)\) follows from \(\Vert\phi_{h}\Vert_{2}\leq 1\) and \((ii)\) is by Hoeffding's inequality with a union bound over episodes and time steps.

\textbf{Lower bound:} The lower bound is easily derived by similar arguments. With probability at least \(1-\delta/2\), for all \(t\in\mathbb{N}\), and all \(\phi\in\Phi\), \(h\in[H]\):
\begin{align*}
    \Sigma_{t,h} - \lambda_{t} I &\succcurlyeq (A) \\
    &\succcurlyeq (A1) - 8I\sqrt{t\log(6tdH|\Phi|/\delta)}\\
    &\succcurlyeq t\mathbb{E}_{(s,a)\sim \tilde{\gamma}_{t,h}^{\star}}[\phi(s,a)\phi(s,a)^{T}] - (B) - (C) - 8I\sqrt{t\log(6tdH|\Phi|/\delta)}.
\end{align*}
We conclude the proof by performing a union bound over the results for the lower and upper bound.
\end{proof}

By the lower bound of the previous result, we immediately obtain the following:

\begin{lemma}\label{lemma:bounded_uncertainty}
Consider a feature map \(\phi\in\Phi\) that is non-redundant.  Assume there exists an increasing sub-linear function $g$ such that \(\mathcal{R}(t) \leq g(t)\) for all \(t\in\mathbb{N}\). Suppose assumptions \ref{ass:unique_optimal_policy} (unique optimal policy) and \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) hold. Then, given that the event $\mathcal{F}$ occurs, there exists a constant \(\tau_{\textnormal{inv}}\) such that, for all \(t\geq \tau_{\textnormal{inv}}\), \(h\in[H]\) and \((s,a)\in\mathcal{S}\times\mathcal{A}\),
\[
    \Vert\phi_{h}(s,a)\Vert_{\Sigma_{t,h}^{-1}} \leq (t\lambda_{\textnormal{min}}(\Sigma_{t,h}^{\star}) + \lambda_{t} - 2\sum_{i=1}^{t}\xi_{i-1} - \Delta_{\textnormal{min}}^{-1}g(t) - 18\sqrt{t\log(6tdH|\Phi|/\delta)})^{-1/2}.
\]
\end{lemma}

\begin{proof}
Let $\tau_{\textnormal{inv}}$ be large enough so that, for all \(t\geq\tau_{\textnormal{inv}}\) and \(h\in[H]\),
\[
    t\lambda_{\textnormal{min}}(\Sigma_{t,h}^{\star}) + \lambda_{t} > 2\sum_{i=1}^{t}\xi_{i-1} + \Delta_{\textnormal{min}}^{-1}g(t) + 18\sqrt{t\log(6tdH|\Phi|/\delta)}
\]
holds. Then, for all $t\geq\tau_{\textnormal{inv}}$, \(h\in[H]\) and \((s,a)\in\mathcal{S}\times\mathcal{A}\),
    \begin{align*}
        \Vert\phi_{h}(s,a)\Vert_{\Sigma_{t,h}^{-1}} &= (\phi_{h}(s,a)^{T}\Sigma_{t,h}^{-1}\phi_{h}(s,a))^{1/2} \\
        &\stackrel{(i)}{\leq} (\lambda_{\textnormal{max}}(\Sigma_{t,h}^{-1})\phi_{h}(s,a)^{T}\phi_{h}(s,a))^{1/2} \\
        &\stackrel{(ii)}{\leq} \lambda_{\textnormal{min}}(\Sigma_{t,h})^{-1/2},
    \end{align*}
    where \((i)\) follows from the symmetry of the covariance matrix and \((ii)\) follows from \(\lambda_{\textnormal{max}}(\Sigma_{t,h}^{-1}) = \lambda_{\textnormal{min}}(\Sigma_{t,h})^{-1}\) together with \(\Vert\phi_{h}\Vert_{2}\leq 1\). We conclude the proof by substituting \(\lambda_{\textnormal{min}}(\Sigma_{t,h})\) with the lower bound provided by the event \(\mathcal{F}_{1}\).
\end{proof}

Note that \(\lambda_{\textnormal{min}}(\Sigma_{t,h}^{\star}) > 0\) holds whenever there exists a unique optimal policy and the feature map is UniSOFT. The final lemma of this section shows that algorithm \ref{alg:UniSREP} is guaranteed to eventually select only good representations.

\begin{lemma}\label{lemma:UniSOFT_selection_full_rank}(Selecting non-redundant UniSOFT representation)
    Fix any \(\alpha>0\).  Assume that there exists an increasing sublinear function $g$ such that \(\mathcal{R}(t) \leq g(t)\) for all \(t\in\mathbb{N}\). Suppose we run algorithm \ref{alg:UniSREP} and assumptions \ref{ass:unique_optimal_policy} (unique optimal policy), \ref{ass:expressivness} (expressiveness) and \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) hold. Additionally, if \(\alpha <1\), suppose that assumption \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) holds. Then, given that events \(\mathcal{E}(\delta)\) and \(\mathcal{F}(\delta)\) occur, there exists an episode \(\tau_{\textnormal{unisoft}}\geq\tau_{\alpha}\) such that for all subsequent episodes \(t\geq\tau_{\textnormal{unisoft}}\) and time steps \(h\in[H]\), the learned feature maps \(\hat{\phi}_{t,h}\) are UniSOFT and non-redundant, where
\[
    \tau_{\textnormal{unisoft}} := \min\{t|t>\left(\frac{2}{\lambda_{\alpha}^{\star}}(\Delta_{\textnormal{min}}^{-1}g(t) + 2\sum_{i=1}^{t}\xi_{i-1} + 18\sqrt{t\log(6tdH|\Phi|/\delta)}) \vee \tau_{\alpha}\right)\}.
\]
\end{lemma}
\begin{proof}
    Note that, by Lemma \ref{lemma:alpha_star_selection}, given that \(\mathcal{E}\) occurs, there exists an episode \(\tau_{\alpha}\) such that for all $t\geq\tau_{\alpha}$ and $h\in[H]$, the learned features \(\hat{\phi}_{t,h}\) are \(\alpha^{\star}\)-approximate.
    
    Let \(\Phi^{\textnormal{unisoft}}\subseteq\Phi\) denote the set that contains only non-redundant UniSOFT feature mappings. 
    By Lemma \ref{lemma:eigenvalue_bounds}, given that the event \(\mathcal{F}\) occurs, for all \(t\in\mathbb{N}\), \(h\in[H]\), \(\phi\in\Phi\setminus\Phi^{\textnormal{unisoft}}\) and \(\phi^{\textnormal{unisoft}}\in\Phi^{\textnormal{unisoft}}\),
\begin{align*}
    &\lambda_{\textnormal{min}}(\Sigma_{t,h}(\phi^{\textnormal{unisoft}}) - \lambda_{t} I) \geq t\lambda^{\star}(\phi^{\textnormal{unisoft}}) - 2\sum_{i=1}^{t}\xi_{i-1} - \Delta_{\textnormal{min}}^{-1}g(t) - 18\sqrt{t\log(6tdH|\Phi|/\delta)}, \\
    &\lambda_{\textnormal{min}}(\Sigma_{t,h}(\phi) - \lambda_{t} I) \leq  2\sum_{i=1}^{t}\xi_{i-1} + \Delta_{\textnormal{min}}^{-1}g(t) + 18\sqrt{t\log(6tdH|\Phi|/\delta)},
\end{align*}
where \(\Sigma_{t,h}(\phi) = \sum_{(s,a)\in\mathcal{D}_{t,h}}\phi_{h}(s,a)\phi_{h}(s,a)^{T}\). Let us denote \(\Phi_{\alpha}\times\Psi_{\alpha}\subseteq\Phi\times\Psi\) as the set of \(\alpha^{\star}\)-approximate representations. Additionally, denote
\[\Phi_{\alpha}^{\textnormal{unisoft}}\times\Psi_{\alpha}^{\textnormal{unisoft}} = \left(\Phi_{\alpha}\times\Psi_{\alpha}\right) \cap \left(\Phi^{\textnormal{unisoft}}\times\Psi\right),\]
as the set containing all \(\alpha^{\star}\)-approximate representations such that the feature map is non-redundant and UniSOFT, which is non-empty by assumption \ref{ass:expressivness}. A non-redundant UniSOFT representation \(\phi^{\textnormal{unisoft}}\) is selected in episode \(t\geq\tau_{\alpha}\) if for all \(\tilde{\alpha}\leq\alpha\), 
\[
    \max_{\phi^{\textnormal{unisoft}}\in\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}} \lambda_{\textnormal{min}}(\Sigma_{t,h}(\phi^{\textnormal{unisoft}}) - \lambda_{t}I) > \max_{\phi\in\Phi_{\tilde{\alpha}}\setminus\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}}\lambda_{\textnormal{min}}(\Sigma_{t,h}(\phi) - \lambda_{t}I),
\]
which, by the two eigenvalue bounds above, is guaranteed whenever
\[
    t\lambda_{\alpha}^{\star} > 2(2\sum_{i=1}^{t}\xi_{i-1} + \Delta_{\textnormal{min}}^{-1}g(t) + 18\sqrt{t\log(6tdH|\Phi|/\delta)}),
\]
where \(\lambda_{\alpha}^{\star} := \min_{\tilde{\alpha}\leq\alpha}\max_{\phi^{\textnormal{unisoft}}\in\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}}\lambda^{\star}(\phi^{\textnormal{unisoft}})\). Rearranging for \(t\) and additionally requiring \(t\geq\tau_{\alpha}\) yields the threshold \(\tau_{\textnormal{unisoft}}\).
\end{proof}