\begin{algorithm}[H]
\caption{\alg\textcolor{blue}{(+)}}\label{alg:UniSREP}


\textbf{Input:} Function spaces $\{\Phi_{h}\}_{h=1}^{H}$, $\{\Psi_{h}\}_{h=1}^{H}$; parameters $\lambda_{t}, \hat{\alpha}_{t}, \xi_{t}$ decreasing, $T$
 

\textbf{Output:} $\pi_{t}$

\begin{algorithmic}[1]
\State Initialize: $\mathcal{D}_{0,h}\!=\!\emptyset$, $\mathcal{D}_{0,h}'\!=\!\emptyset$, $\pi_{0, h}\equiv\mathcal{U}(\mathcal{A})$, $\forall h\in[H]$

\For{$t=1,\dots,T$}

    % \State{\emph{// Interact with the MDP and collect transition data}}
    \LeftComment{Interact with the MDP and collect transition data}
    \State $e_{t}\sim\textnormal{Ber}(1-\xi_{t-1})$ \label{alg:exploration_start}

    \For{$h=1,\dots,H$}

        \State $s_{h-1}\sim d_{P^{\star};h-1}^{{\pi}_{t-1}}$

        \If{$e_{t}=1$}

            \State $a_{h-1} = \pi_{t-1,h-1}(s_{h-1})$, $s_{h}\sim P_{h-1}^{\star}$
            \State $a_{h} = \pi_{t-1,h}(s_{h})$, $s_{h+1}\sim P_{h}^{\star}$

        \Else

            \State $a_{h-1} \sim \mathcal{U}(\mathcal{A})$, $s_{h}\sim P_{h-1}^{\star}$
            \State $a_{h} \sim \mathcal{U}(\mathcal{A})$, $s_{h+1}\sim P_{h}^{\star}$

        \EndIf
        
        \State $\mathcal{D}_{t,h-1} = \mathcal{D}_{t-1,h-1} \cup \{(s_{h-1}, a_{h-1})\}$
        \State $\mathcal{D}_{t,h}' = \mathcal{D}_{t-1,h}' \cup \{(s_{h}, a_{h}, s_{h+1})\}$
        
    \EndFor \label{alg:exploration_end}

    \LeftComment{Learn representations \& set bonus}
    
    \For{$h=1,\dots,H$} \label{alg:replearn_start}
    
        \State $\hat{\phi}_{t,h} = \arg\min_{\phi\in\Phi_{h}^{\textnormal{MLE}}(\mathcal{D}_{t,h}')}\mathcal{L}^{\textnormal{unisoft}}(\phi, \mathcal{D}_{t,h})$ \label{alg:oracle}
        
        \State $\hat{\Sigma}_{t,h} =\sum_{(s,a)\in\mathcal{D}_{t,h}}\hat{\phi}_{t,h}(s,a)\hat{\phi}_{t,h}(s,a)^{\top}$
        \State $\qquad+\lambda_{t}I$
        
        \State $\hat{b}_{t,h}(s,a) =$
        \State $\qquad\min\{\hat{\alpha}_{t}\sqrt{\hat{\phi}_{t,h}(s,a)^{\top}\hat{\Sigma}_{t,h}^{-1}\hat{\phi}_{t,h}(s,a)}, 1\}$

        \State $\hat{\mathcal{P}}_{t,h}(s'|s,a) = \langle\hat{\phi}_{t,h}(s,a),\hat{\mu}_{t,h}(s')\rangle$ 
    \EndFor{} \label{alg:replearn_end}
    \LeftComment{Update (deterministic) policy}

    \State $\pi_{t} = \arg\max_{\pi\in\Pi}V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t} + r^{\star},1}^{\pi, d_{1}}$ \label{alg:planning}
    
    \LeftCommentt{Check for optimality}

    \color{blue}
    \State $\pi_{t}^{b} = \arg\max_{\pi\in\Pi}V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi, d_{1}}$ \label{alg:eval_start}

    \State $c_{t} = 10H^{2}\bigl(V_{\hat{\mathcal{P}}_{t}, \hat{b}_{t},1}^{\pi_{t}^{b}, d_{1}} + \sqrt{\frac{|\mathcal{A}|}{\xi_{t}}\zeta_{t}}\bigr)$
    
    \If{$c_{t} < \Delta_{\textnormal{min}} d_{\textnormal{min}}^{\star}$} \label{alg:optimality_condition}

        \State \Return{$\pi_{t}$}

    \EndIf \label{alg:eval_end}

    \color{black}
    
\EndFor{}

\State \Return{$\pi_{T}$}

\end{algorithmic}
\end{algorithm}