As noted in the preliminary section, we can extend our results to environments with multiple optimal policies as well. Recall that \(\Pi^{\star}\) denotes the set of all optimal (deterministic) policies. We say that a feature map \(\phi\) is UniSOFT w.r.t.\ some policy \(\pi\) if \(\pi\in\Pi^{\star}\) and \(\phi\) fulfills the UniSOFT property of Definition~\ref{def:unisoft} with \(\pi\) in place of \(\pi^{\star}\). In particular, a UniSOFT representation is non-redundant if $\lambda^{\star}(\phi)>0$, where $\lambda^{\star}(\phi):=\min_{h\in[H],\pi^{\star}\in\Pi^{\star}} \lambda_{\textnormal{min}}(\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi^{\star}}}[\phi_{h}(s,a)\phi_{h}(s,a)^{T}])$. We adjust the notion of \(\alpha^{\star}\)-approximate representations accordingly.

\begin{restatable}[$(\mathcal{\sigma}^{\star},\alpha^{\star})$-Approximate Representation]{definition}{alphaapproximatemultpol}\label{def:alpha_approximate_mult_pol}
       Let \(\sigma^{\star} = (\pi_{1}^{\star}, \pi_{2}^{\star}, \dots, \pi_{t}^{\star})\) be a finite sequence of optimal policies. A representation \((\phi, \mu)\in\Phi\times\Psi\), with induced model \(\mathcal{P}\), is $(\sigma^{\star}, \alpha)$-approximate if, for all \(h\in[H]\),
\[
    \mathbb{E}_{(s,a)\sim \gamma_{t,h}^{\star}}[\Vert\mathcal{P}_{h}(\cdot|s,a) - \mathcal{P}_{h}^{\star}(\cdot|s,a)\Vert_{\textnormal{TV}}] \leq \alpha,
\]
where \(\gamma_{t,h}^{\star}(s,a) = \frac{1}{t}\sum_{i=1}^{t} d_{\mathcal{P}^{\star},h}^{\pi_{i}^{\star}}(s,a)\).
\end{restatable}

\begin{restatable}[$\alpha^{\star}$-Expressive Function Space]{assumption}{expressivnessmultpol}\label{ass:expressivness_mult_pol}
    Let \(\sigma^{\star}\) be an arbitrary sequence of optimal policies of finite length. For all $(\sigma^{\star}, \alpha^{\star})$-approximate representations \((\phi, \mu)\in\Phi\times\Psi\), there exists a non-redundant representation \((\tilde{\phi}, \tilde{\mu})\in\Phi\times\Psi\) that is UniSOFT w.r.t.\ all \(\pi^{\star}\in\sigma^{\star}\), such that the induced models \(\mathcal{P}\) and \(\tilde{\mathcal{P}}\) agree on all reachable pairs \((s,a)\in\mathcal{S}\times\mathcal{A}\), i.e., on all pairs for which there exist a policy \(\pi\in\Pi\) and a time step \(h\in[H]\) with \(d_{\mathcal{P}^{\star},h}^{\pi}(s,a) > 0\).
\end{restatable}

Furthermore, recall that \(\tilde{\pi}_{t}^{\star}:=\{\tilde{\pi}_{t,h}^{\star}\}_{h\in[H]}\), where for each \(h\in[H]\),
\begin{equation*}
    \tilde{\pi}_{t,h}^{\star}(s) =
    \begin{cases}
        \pi_{t,h}(s) & \textnormal{if } \pi_{t,h}(s)\in\Pi_{h}^{\star}(s)  \\
        \textnormal{Select}(\Pi_{h}^{\star}(s)) & \textnormal{otherwise}
    \end{cases}.
\end{equation*} 
We define $\tilde{\sigma}_{t}^{\star}:=(\tilde{\pi}_{1}^{\star}, \tilde{\pi}_{2}^{\star}, \dots, \tilde{\pi}_{t}^{\star})$.

Compared to the unique optimal policy case, we must ensure the existence of feature maps that are UniSOFT w.r.t.\ all optimal policies, as we do not know in advance which distribution of optimal policies the algorithm converges to. In exchange for replacing the expressiveness Assumption~\ref{ass:expressivness} by the more restrictive Assumption~\ref{ass:expressivness_mult_pol}, we can drop the unique optimal policy assumption. We note that allowing multiple optimal policies only worsens the sample complexity in the instance-dependent variables, which now depend on the `worst' deterministic optimal policy.
 
The following two results ensure the selection of a good representation. The remaining analysis can be performed analogously to the previous sections.

\begin{lemma}[Selecting \((\tilde{\sigma}_{t}^{\star}, \alpha)\)-representations]\label{lemma:alpha_star_selection_mult_pol}
    Fix any \(\alpha>0\). Assume there exists an increasing sub-linear function $g$ such that \(\mathcal{R}(t)\leq g(t)\) for all \(t\in\mathbb{N}\). Suppose we run algorithm \ref{alg:UniSREP} and assumptions \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) and \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) hold. Then, given that the event \(\mathcal{E}\) occurs, there exists an episode \(\tau_{\alpha}\) such that for all episodes \(t\geq \tau_{\alpha}\) and time steps \(h\in[H]\), the learned feature maps \(\hat{\phi}_{t,h}\) are \((\tilde{\mathcal{\sigma}}_{t}^{\star},\alpha)\)-approximate, where
    \[
        \tau_{\alpha} := \min\left\{t \,\middle|\, t>\frac{1}{\alpha}\left(\frac{\mathcal{R}(t)}{\Delta_{\textnormal{min}}d_{\textnormal{min}}^{\star}} + \frac{|\mathcal{A}|}{\xi_{t}}\sqrt{2t\log(4t|\Phi||\Psi|H/\delta)}\right)\right\}.
    \]
\end{lemma}
\begin{proof}
    The claim follows directly from Corollary~\ref{corr:occupancy_bound} and the proof of Lemma~\ref{lemma:alpha_star_selection}.
\end{proof}

\begin{lemma}[Selecting non-redundant UniSOFT representation]\label{lemma:UniSOFT_selection_full_rank_mult_pol}
    Fix any \(\alpha>0\). Assume there exists an increasing sub-linear function $g$ such that \(\mathcal{R}(t)\leq g(t)\) for all \(t\in\mathbb{N}\). Suppose we run algorithm \ref{alg:UniSREP} and assumptions \ref{ass:expressivness_mult_pol} (expressiveness) and \ref{ass:sub_optimality_gap_exists} (minimal sub-optimality gap) hold. Additionally, if \(\alpha<1\), suppose assumption \ref{ass:min_optimal_occupancy_exists} (minimal optimal occupancy) holds. Then, given that the events \(\mathcal{E}(\delta)\) and \(\mathcal{F}(\delta)\) occur, there exists an episode \(\tau_{\textnormal{unisoft}}\geq\tau_{\alpha}\) such that for all subsequent episodes \(t\geq\tau_{\textnormal{unisoft}}\) and time steps \(h\in[H]\) the learned feature maps \(\hat{\phi}_{t,h}\) are UniSOFT w.r.t. any optimal policy \(\pi^{\star}\in\tilde{\sigma}_{t}^{\star}\), where
\[
    \tau_{\textnormal{unisoft}} := \min\left\{t \,\middle|\, t>\left(\frac{2}{\lambda_{\alpha}^{\star}}\left(\Delta_{\textnormal{min}}^{-1}\mathcal{R}(t) + 2\sum_{i=1}^{t}\xi_{i-1} + 18\sqrt{t\log(6dtH|\Phi|/\delta)}\right) \vee \tau_{\alpha}\right)\right\}.
\]
\end{lemma}
\begin{proof}
    Let \(\Phi_{\tilde{\sigma}_{t}^{\star}}^{\textnormal{unisoft}}\subseteq\Phi\) denote the set containing only non-redundant feature mappings that are UniSOFT w.r.t. at least one \(\tilde{\pi}^{\star}\in\tilde{\sigma}_{t}^{\star}\). 
    By Lemma \ref{lemma:eigenvalue_bounds}, with probability at least \(1-\delta\), for all \(t\in\mathbb{N}\), \(h\in[H]\), \(\phi\in\Phi\setminus\Phi_{\tilde{\sigma}_{t}^{\star}}^{\textnormal{unisoft}}\) and \(\phi^{\textnormal{unisoft}}\in\Phi_{\tilde{\sigma}_{t}^{\star}}^{\textnormal{unisoft}}\),
\begin{align*}
    &\lambda_{\textnormal{min}}(\Sigma_{t+1,h}(\phi^{\textnormal{unisoft}}) - \lambda_{t} I) \geq t\lambda^{\star}(\phi^{\textnormal{unisoft}}) - 2\sum_{i=1}^{t}\xi_{i-1} - \Delta_{\textnormal{min}}^{-1}\mathcal{R}(t) - 18\sqrt{t\log(6dtH|\Phi|/\delta)}, \\
    &\lambda_{\textnormal{min}}(\Sigma_{t+1,h}(\phi) - \lambda_{t} I) \leq 2\sum_{i=1}^{t}\xi_{i-1} + \Delta_{\textnormal{min}}^{-1}\mathcal{R}(t) + 18\sqrt{t\log(6dtH|\Phi|/\delta)},
\end{align*}
where \(\Sigma_{t+1,h}(\phi) = \sum_{(s,a)\in\mathcal{D}_{t,h}}\phi_{h}(s,a)\phi_{h}(s,a)^{T}\) and 
\begin{align*}
    \lambda^{\star}(\phi)&:= \min_{h\in[H], \pi^{\star}\in\Pi^{\star}}\lambda_{\textnormal{min}}(\mathbb{E}_{(s,a)\sim d^{\pi^{\star}}_{\mathcal{P}^{\star},h}}[\phi_{h}(s,a)\phi_{h}(s,a)^{T}]) \\
    &\leq \min_{h\in[H]}\lambda_{\textnormal{min}}(\mathbb{E}_{(s,a)\sim \tilde{\gamma}_{t,h}^{\star}}[\phi_{h}(s,a)\phi_{h}(s,a)^{T}]).
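\end{align*}
Here \(\tilde{\gamma}_{t,h}^{\star}(s,a) = \frac{1}{t}\sum_{i=1}^{t} d_{\mathcal{P}^{\star},h}^{\tilde{\pi}_{i}^{\star}}(s,a)\) denotes the mixture of Definition~\ref{def:alpha_approximate_mult_pol} for the sequence \(\tilde{\sigma}_{t}^{\star}\). The inequality holds because every \(\tilde{\pi}_{i}^{\star}\) is an optimal policy and \(\lambda_{\textnormal{min}}\) is concave, so the smallest eigenvalue of the averaged second-moment matrix is at least the minimum of the smallest eigenvalues of the individual ones.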

Let us denote \(\Phi_{\alpha}\times\Psi_{\alpha}\subseteq\Phi\times\Psi\) as the set of \((\tilde{\sigma}_{t}^{\star},\alpha)\)-approximate representations. Additionally, denote
\[\Phi_{\alpha}^{\textnormal{unisoft}}\times\Psi_{\alpha}^{\textnormal{unisoft}} = \left(\Phi_{\alpha}\times\Psi_{\alpha}\right) \cap \left(\Phi_{\tilde{\sigma}_{t}^{\star}}^{\textnormal{unisoft}}\times\Psi\right),\]
as the set containing all \((\tilde{\sigma}_{t}^{\star},\alpha)\)-approximate representations such that the feature map is non-redundant and UniSOFT w.r.t. at least one \(\pi\in\tilde{\sigma}_{t}^{\star}\), which is non-empty by Assumption \ref{ass:expressivness_mult_pol}. A desired feature map is selected at episode \(t\geq\tau_{\alpha}\) if for all \(\tilde{\alpha}\leq\alpha\),
\[
    \max_{\phi^{\textnormal{unisoft}}\in\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}} \lambda_{\textnormal{min}}(\Sigma_{t+1,h}(\phi^{\textnormal{unisoft}}) - \lambda_{t}I) > \max_{\phi\in\Phi_{\tilde{\alpha}}\setminus\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}}\lambda_{\textnormal{min}}(\Sigma_{t+1,h}(\phi) - \lambda_{t}I),
\]
which, by the two eigenvalue bounds above, is guaranteed whenever
\[
    t\lambda_{\alpha}^{\star} > 2\left(\Delta_{\textnormal{min}}^{-1}\mathcal{R}(t) + 2\sum_{i=1}^{t}\xi_{i-1} + 18\sqrt{t\log(6dtH|\Phi|/\delta)}\right),
\]
where \(\lambda_{\alpha}^{\star} := \min_{\tilde{\alpha}\leq\alpha}\max_{\phi^{\textnormal{unisoft}}\in\Phi_{\tilde{\alpha}}^{\textnormal{unisoft}}}\lambda^{\star}(\phi^{\textnormal{unisoft}})\). Solving this condition for \(t\) and taking the maximum with \(\tau_{\alpha}\) yields the stated \(\tau_{\textnormal{unisoft}}\).
\end{proof}