% Samples-per-deployment bound used in the RR [fin] row of the overview table,
% expressed through \psi (see tablenote [a] of that table).
\newcommand{\rrEmpNumSamplesPsi}{%
  \ensuremath{%
    \tilde{\calO}\left(%
      \frac{\sizeA\psi}{\lambda^2 (1-\epsilon)^4}%
      \ln\left(\frac{i}{p}\right)%
    \right)%
  }%
}
% Samples-per-deployment bound used in the DRR [fin] row; same shape as the RR
% bound but without the (1-\epsilon)^4 factor in the denominator.
\newcommand{\drrEmpNumSamplesPsi}{%
  \ensuremath{%
    \tilde{\calO}\left(%
      \frac{\sizeA\psi}{\lambda^2}%
      \ln\left(\frac{i}{p}\right)%
    \right)%
  }%
}
% RR sample bound with \psi written out explicitly, i.e. the numerator/denominator
% of tablenote [a] substituted into the \psi form.
% NOTE(review): the logarithm uses t/p here, whereas the \psi variant uses i/p -- confirm
% which index is intended.
\newcommand{\rrEmpNumSamples}{%
  \ensuremath{%
    \tilde{\calO}\left(%
      \frac{\sizeA\sizeS^3\left(B+\sqrt{\sizeA}\right)^2}%
           {\delta^4(1-\gamma)^6\lambda^2 (1-\epsilon)^4}%
      \ln\left(\frac{t}{p}\right)%
    \right)%
  }%
}
% DRR sample bound with \psi written out explicitly.
% NOTE(review): the logarithm uses (i+1)/p here, whereas the \psi variant uses i/p -- confirm.
\newcommand{\drrEmpNumSamples}{%
  \ensuremath{%
    \tilde{\calO}\left(%
      \frac{\sizeA\sizeS^3\left(B+\sqrt{\sizeA}\right)^2}%
           {\delta^4(1-\gamma)^6\lambda^2}%
      \ln\left(\frac{i+1}{p}\right)%
    \right)%
  }%
}
% Samples-per-deployment bound used in the MDRR [fin] row of the overview table:
% the DRR \psi-bound scaled by (v-1)v^{k-1}/(v^k-1).
% The scaling fraction is kept INSIDE \ensuremath so that, like the sibling
% macros, this one also works when expanded in text mode; in math mode the
% output is unchanged.
\newcommand{\mdrrEmpNumSamplesPsi}{%
  \ensuremath{%
    \frac{(v-1)v^{k-1}}{v^{k}-1}%
    \tilde{\calO}\left(%
      \frac{\sizeA\psi}{\lambda^2}%
      \ln\left(\frac{i}{p}\right)%
    \right)%
  }%
}
% Short text label that typesets as "#rs"; it is not referenced by the overview table.
% NOTE(review): the meaning of the abbreviation is not visible here -- confirm before reuse.
\newcommand\abbrrt{\#rs}
% Overview table: one row per algorithm/setting, listing the bound on the
% regularization factor \lambda, the number of retrainings, and the number of
% samples per deployment. The row tags [exact]/[fin] and the marks [a]/[b] are
% explained in the tablenotes at the bottom of the table.
\begin{table*}[!ht]
    \centering
    \caption{\textbf{Overview of Our Results.}\ \ \  
    Convergence criteria for computing a $\delta$-approximate stable policy.
    $\lambda$ is a factor of the regularization, 
    $\epsilon<1$ indicates the dependence of the current environment on the previous environment,
    $\iota<1$ indicates the dependence of the current environment on the deployed policy,
    $i$ denotes the current retraining iteration, $1-p$ is the probability of achieving said approximate stable policy in the finite sample setting, $k$ denotes the number of repeated deployments of the same policy in MDRR, $\sizeS$ is the number of states, $\sizeA$ the number of actions, and $\gamma$ is the discount factor of the MDP.
        }
    \label{table:overview}
    \begin{threeparttable}
    % NOTE(review): five columns are declared but every row fills only four; the
    % trailing column just adds padding on the right -- confirm this is intended.
    \begin{tabular}{lcccc}
    \toprule[1.0pt]
        % Header row; \genfrac with a 0pt rule stacks the two-line column title.
        \textbf{Algorithm} & $\boldsymbol{\lambda}$ & \textbf{\#retrainings} 
        & $\genfrac{}{}{0pt}{0}{\textbf{\#samples}}{\textbf{per deployment}}$ \\
        \midrule[1.0pt]
        % Exact setting: no sampling is involved, hence "N/A" in the last column.
        RR\tnote{[exact]}& $\calO\left(\frac{\sizeS^{5/2}}{(1-\epsilon)(1-\gamma)^4}\right)$ &
        $\frac{\ln\left(\left(
        \frac{2}{1-\gamma} + \left(1+\sqrt{2}\right)\sqrt{\sizeS\sizeA}
        \right)/\delta\right)}{\ln\left(2/(1+\epsilon)\right)}$
        & N/A\\ \midrule[0.2pt]
        DRR\tnote{[exact]} & $\calO\left(\frac{\iota\cdot \sizeS^{5/2}}{(1-\epsilon)(1-\gamma)^4}\right)$
        & 
        $\ln\left(\left(\frac{2}{1-\gamma}\right)/\delta\right)$
        & N/A
        \\ \midrule[0.2pt]
        % Finite-sample setting: sample counts come from the *EmpNumSamplesPsi macros.
        RR\tnote{[fin]} & $\calO\left(\frac{\epsilon(\sizeS+\gamma \sizeS^{5/2})}{(1-\epsilon)(1-\gamma)^4} 
        \right)$ & $\frac{\ln\left(\frac{\frac{2}{1-\gamma}+\left(1+\sqrt{2}\right)\sqrt{\sizeS\sizeA}}{\delta}\right)}
        {\ln\left(4/\left(3+\epsilon\right)\right)}$ 
        & $\rrEmpNumSamplesPsi$\tnote{[a]}
        \\ \midrule[0.2pt]
        DRR\tnote{[fin]} & $\calO\left(\frac{\iota(\sizeS+\gamma \sizeS^{5/2})}{
        (1-\epsilon)(1-\gamma)^4}\right)$ & $\frac{\ln\left(\frac{2}{1-\gamma}/\delta\right)}{
        \ln\left( 4/ \left(3+\epsilon\right)\right)}$ 
        &
        $\drrEmpNumSamplesPsi$\tnote{[a]}
        \\\midrule[0.2pt]
        MDRR\tnote{[fin]} & as for DRR & as for DRR 
        & $\mdrrEmpNumSamplesPsi $\tnote{[a,b]}\ \ \ \  \\
    \bottomrule[1.0pt]
    \end{tabular}
        % The "[x{]}" form hides the closing bracket from \item's optional-argument
        % parser so that the label prints as "[x]", matching the \tnote marks above.
        \begin{tablenotes}\footnotesize
            \item[[a{]}] Here $\psi = \calO\left(\frac{\sizeS^{3}\left(B+\sqrt{\sizeA}\right)^2}{\delta^4(1-\gamma)^6}\right)$ and we ignore all terms which are logarithmic in $\sizeS$, $\sizeA$ and $1/\delta$.
            \item[[b{]}] $v> \frac{1}{\epsilon}$ is a hyperparameter of MDRR.
            \item[[exact{]}] Results when the learner knows the current environment $(P_t, r_t)$.
            \item[[fin{]}] Results when the learner gets a finite set of samples from the current environment $(P_t, r_t)$.
        \end{tablenotes}
    \end{threeparttable}
\end{table*}
