% !TEX root =  main.tex

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\todo{silence TODOs}

\section{Additional Related Work}\label{appdx.related-work}
In this section we present some more related work on Adversarial MDPs and Reinforcement Learning (RL).

{\bf Adversarial MDPs.} More broadly, our framework is related to the literature on adversarial and non-stationary MDPs, which extensively studied online learning under adversarial and non-stationary rewards and transitions~\citep{even2004experts,even2009online,abbasi2013online,yu2009online,rosenberg2019online,cheung2020reinforcement,wei2021non}. The positive results therein, in particular, no-regret guarantees when both rewards and transitions evolve over time, often assume budget constraints on how many times and by how much the underlying MDP model can change~\citep{abbasi2013online,cheung2020reinforcement,wei2021non}. We instead rely on sensitivity assumptions (Assumption~\ref{assumption_sensitivity}), introduced in Section~\ref{sec.formal-setting}.
%

{\bf Reinforcement Learning.} We also mention the recent work on RL in Newcomb-like environments~\citep{bell2021reinforcement}, whose framework is similar to the original performative RL framework of \cite{MTR23}. There, the focus is on the convergence of value-based RL algorithms; we focus on repeated retraining and allow the environment response model to gradually change over time.
%
From a practical point of view, repeated retraining is similar to alternating optimization for game-theoretic bi-level optimization problems in RL (e.g., \citep{rajeswaran2020game,mohammadi2023implicit}). The latter can be thought of as a training framework for finding optimal commitment policies in Markov games, whereas the former repeatedly deploys a policy, collects data, and trains a new policy using offline RL. In that regard, we also relate this paper to the vast literature on offline RL~\citep{levine2020offline}. From a technical point of view, the most relevant aspects are coverage assumptions and data generation process: we consider the ones from \citep{MTR23}, which are based on \citep{zhan2022offline, munos2008finite}.
\section{Additional Experimental Details}\label{appendix-experiments}
\begin{figure}[b]
\centering
\includegraphics[width=0.15\textwidth]{figures/gridworld4x4}
\caption{\label{fig:gridworld}The grid-world.}
\end{figure}
This section discusses more details on the experiments. Subsection~\ref{appdx.explanation-env} explains the environment further, subsections~\ref{appdx.sample-lists-mdrr} and~\ref{appdx.ftrl} discuss further algorithmic details, subsection~\ref{appdx.exp.compute} discusses the type and amount of compute used, and in subsection~\ref{subsec.sanity-check} we sanity-check if the comparison presented in the main paper is fair.

\subsection{Explanation of the Environment}
\label{appdx.explanation-env}
The experimental setting is an adapted version of the one from~\cite{MTR23}.
We consider the grid world environment depicted in figure~\ref{fig:gridworld}.
There is one actor in this grid-world environment, which is controlled by two agents, 
agent $A_1$ and agent $A_2$.
The actor starts randomly in one of the $S$ states, with uniform probability.
$A_1$ can decide where the actor goes by choosing one of the directions left, right, up or down.
$A_2$ can decide to intervene on the direction which $A_1$ chose.
The actions of $A_2$ are not-intervene, left, right, up or down.
In case $A_2$ chooses not-intervene, the direction chosen by $A_1$ is used.
Otherwise, the direction chosen by $A_2$ gets used.

Both agents are reinforcement learners with different goals.
% The agents control the same actor, but they have different goals.
$A_1$ optimizes according to the grid-world in figure~\ref{fig:gridworld}. 
$A_2$
optimizes according to a perturbed grid-world, where each blank, $F$ or $H$ cell is the same as for $A_1$ with probability $0.7$. With probability $0.3$,
it gets changed to either blank, $F$ or $H$ (chosen uniformly at random).

$A_1$ and $A_2$ get a negative reward of $-0.01$ if the actor visits a blank or an $S$ cell,
a slightly increased negative reward of
$-0.02$ if visiting an $F$ cell and a
large negative reward of $-0.5$ for $H$ cells.
Additionally, when $A_2$ decides to intervene, an additional cost of $-0.05$ is inflicted on it.

$A_1$ is the main learner which performs RR, DRR or MDRR.
$A_2$ models the response of the environment.

$A_2$ starts by playing the policy which never intervenes.
In each iteration, first $A_1$ optimizes its policy, and then $A_2$ responds to the policy
played by $A_1$. 
$A_2$ slowly adapts to the policy currently played by $A_1$ in each round, by using a mixture
between the last played policy of $A_2$ and the softmax over the current optimal $Q$-values, as described in equation~\eqref{eq.agent2} in the main paper.

Furthermore, we use $\gamma=0.9$ for both $A_1$ and $A_2$ and a 
maximum trajectory length of $50$, i.e. after $50$ steps,
the trajectory is cut off.
Instead of using the exact occupancy measures $\bar{d}_i$ in the optimization, we approximate them using the trajectories.

\subsection{Computing Sample Lists for MDRR}\label{appdx.sample-lists-mdrr}
In this subsection we describe a practical way to compute samples from MDRR.

Recall that MDRR uses $m_{ik+t} = w_t \numsam_i$ samples in iteration $i$ from round $t$, where $w_t = \frac{(v-1)v^{t-1}}{v^k-1}$ and $\numsam_i$ is the total number of samples used for the $i$-th retraining.
In practice, we assume that the learner is given some samples for each round.

In practice, we use a slightly different algorithm to compute the number of samples MDRR uses, because of two reasons.
The first reason is that $w_t \numsam_i$ could be non-integral. 
The second reason is that even though MDRR needs $m_{t}$ samples in round $t$, samples from rounds after $t$ could also count towards this, if the same policy was applied in the rounds between. This is 
because those samples are collected after more repeated applications of the same policy. Therefore the environment at this point is closer to the limiting environment than in round $t$ and using additional samples from higher rounds would increase performance more than using samples from round $t$.

To calculate the number of samples MDRR uses from each round, we propose Algorithm~\ref{algo:practical-sampling-mdrr}, which we explain in the following.
In the following we use the terms \emph{list} and \emph{sequence} somewhat loosely to refer to linked lists of samples and linked lists of linked lists of samples respectively. 

Algorithm~\ref{algo:practical-sampling-mdrr} takes as input a sequence of lists of samples
$S_1, \dots, S_{k}$ and weights $w_1, \dots, w_{k}\in \mathbb{R}^+$.
We can think of $S_1$ as the list of samples collected in step $ik+1$ for some $i$,
$S_2$ as the list of samples collected in step $ik+2$, etc.
Algorithm~\ref{algo:practical-sampling-mdrr} fulfills the following property.
\begin{theorem}\label{thm.practical-sampling-mdrr}
    Algorithm~\ref{algo:practical-sampling-mdrr} outputs a sequence $\calF =[ F_1, \dots, F_{k}]$,
    which contains the maximal number of samples $|\calF|$ such that 
    \begin{enumerate}
        \item $F_t\subseteq S_t$ for all $t \in \{1, \dots, k\}$ and \label{item:F_t_subset}
        \item $\abs{[F_t, \dots, F_{k}]} \geq \sum_{t'=t}^{k} w_{t'} |\calF|$ for all $t \in \{1, \dots, k\}$\ .\label{item:F_t_proper_weights}
    \end{enumerate}
    where we denote by $\abs{[F_t, \dots, F_k]}$ the number of samples in total in $F_t, \dots, F_k$.
    Similarly $\abs{\calF}$ is the number of samples in $\calF$.
\end{theorem}
Item~\ref{item:F_t_subset} guarantees that $F_t$ only contains samples from $S_t$.
Item~\ref{item:F_t_proper_weights} guarantees
that for each round $t$, there is a sufficient number of samples assigned to this step either by samples
from rounds greater than $t$, which are not yet assigned to any round or directly from round $t$. 
To see this, notice that the total number of samples in this iteration is $\numsam_i=\abs{\calF}$.
Therefore, for round $t$, we need at least $w_t|\calF|$ samples. Those samples have to be from 
$F_t, F_{t+1}, \dots, F_k$ and must not be assigned to another round $t'\neq t$.
Assume that this already holds for all $t''>t$. Then we only need to ensure that the amount of samples 
which are not yet assigned to any round plus
the samples from round $t$ is at least $w_t|\calF|$.
This amount of not yet assigned samples plus the samples from round $t$ is equal to 
$\abs{[F_{t+1}, \dots, F_{k}]} - \sum_{t'=t+1}^k w_{t'} |\calF| + |F_t|$.
Item~\ref{item:F_t_proper_weights} follows from requiring that this is at least $w_t|\calF|$.

\begin{algorithm}
    \caption{Practical algorithm to compute the samples used by MDRR}
    \label{algo:practical-sampling-mdrr}
\begin{algorithmic}[1]
    \STATE {\bfseries Input:} A sequence $S_1, \dots, S_{k}$ of lists of samples and corresponding weights
    $w_1, \dots, w_{k}\in \mathbb{R}^+$ such that $\sum_{t=1}^kw_t=1$
    
    \STATE {\bfseries Output:} A sequence $\calF = [F_1, \dots, F_{k}]$ of lists of samples such that
        $F_t\subseteq S_t$,
        $\abs{[F_t, \dots, F_{k}]} \geq \sum_{t'=t}^{k} w_{t'} |\calF|$
        and $\abs{\calF}$ is maximal.\\\ 

    
    \STATE $M'\leftarrow +\infty$
    
    \STATE Let $\calF$ be a sequence of $k$ empty lists
    
    \FOR{$t=k, \dots, 1$}
        %\tcp{Denote by $\abs{F}$ the number of samples in $F$.}
        \IF{$M'-\abs{\calF} \leq  \abs{S_t}$}
        \label{line-algo-practical-if}
            \STATE Append $M' - \abs{\calF}$ samples from $S_t$ to $F_t$
            
            \STATE \textbf{Return} $\calF$
        \ENDIF
            
        \STATE $F_t \leftarrow S_t$
            
        \STATE $W\leftarrow \sum_{t'=t}^k w_{t'}$

        \STATE $M'\leftarrow \left\lfloor\min\left(\frac{\abs{\calF}}{W}, M'\right)\right\rfloor$
         \label{line.setMprime}
    \ENDFOR
\end{algorithmic}
\end{algorithm}

We now prove Theorem~\ref{thm.practical-sampling-mdrr} via a loop-invariant argument.
\begin{proof}[Proof of Theorem~\ref{thm.practical-sampling-mdrr}]
Item~\ref{item:F_t_subset} trivially holds, since only samples from $S_t$ are added to $F_t$.

We now show that item~\ref{item:F_t_proper_weights} also holds and that $\abs{\calF}$ is maximal.
We define the following proposition $B_t$ for every $t\in\{1, \dots, k\}$.
$B_t$ holds iff for every $j\geq t$, it holds that 
\begin{align}
F_{j}\subseteq S_{j} \text{ and } \abs{[F_{j}, \dots, F_{k}]}\geq \sum_{t'=j}^{k}w_{t'}\abs{\calF}
\label{eq:invariant_tprime}
\end{align}

We define the following loop invariant $C_t$.
$C_t$ holds iff after iteration $t$ of the loop, $M'$ is the maximum integer such that $B_{t}$ holds and  
using $M'-\abs{\calF}$ samples for $F_1, \dots, F_{t}$ does not lead to a violation of $B_{t}$.

If $C_t$ holds for every $t\in\{1,\dots, k\}$, the theorem is shown.

We prove that $C_t$ holds via induction.
$C_{k+1}$ holds before the loop starts, since
we can think of $t$ to be equal to $k+1$ at this time, $M'$ is infinity and $\calF$ empty.

The induction step goes from $t+1$ to $t$.
Assume $C_{t+1}$ holds.
The if-statement in line~\ref{line-algo-practical-if} then ensures that if there are more samples in $S_t$ than are still possible, $F_t$ is set equal to this number of samples and the algorithm returns. We know this is correct, since $M'$ is maximal.
Otherwise $F_t$ is set to $S_t$, because this capacity is still there for samples from $S_t$.

Then in line~\ref{line.setMprime}, the $\min\left(\frac{|\calF|}{W}, M'\right)$ defines the number of samples which can maximally be taken in total. 
The first argument of the minimum ensures that 
\eqref{eq:invariant_tprime} holds for $j=t$.
The second argument of the minimum, $M'$, ensures that $B_{t+1}$ holds via the induction hypothesis.
Then $B_t$ holds and the induction step is shown.
\end{proof}

\subsection{Solving the Min-Max Optimization Problem}\label{appdx.ftrl}
In this subsection we describe how the learner solves the min-max problem~\eqref{eq:repeated-optim-finite} and the min-max problem in line~\ref{eq:minmaxMdrrAlgo} of Algorithm~\ref{algo:mdrr} in the experiments.

To solve the min-max problem of the empirical Lagrangians in equation~\eqref{eq:repeated-optim-finite}
we use Algorithm~1 from \cite{MTR23}. 

To solve the min-max problem for MDRR (line~\ref{eq:minmaxMdrrAlgo} of Algorithm~\ref{algo:mdrr}), we use Algorithm~\ref{algo.ftrl}.
It works the same as Algorithm~1 of \cite{MTR23}, the only difference
is in the conditions on $d$ in line~\ref{line.compute.d}, where 
we condition $d(s,a)/\bar{d}_t(s,a)\leq B$ for all steps since the 
last update of the policy.
We use the parameters $N=10$ and $\beta=\frac{\lambda}{2} = 0.05$.
\begin{algorithm}
    \caption{FTRL algorithm to calculate an approximation for the finite sample optimization problem (\eqref{eq:repeated-optim-finite} and \eqref{eq:mixed-response-lagrangian-optimization})}
    \label{algo.ftrl}
    \begin{algorithmic}[1]
    \STATE {\bfseries Input: }regularizing factor $\beta$, occupancy measures since the last update of the policy $\bar{d}_t$ for $t\in \{1, \dots, k\}$

    \STATE $d_0 \leftarrow \boldsymbol{0}$

    \FOR{$j=0, 1, \dots, N-1$}
        \IF{$j = 0$}
            \STATE $h_{j} \leftarrow \argmin_h \hat\calL^M(d_{0}, h) + \beta\norm{h}_2^2$ s.t. $\|h\|_2 \leq \frac{3\sizeS}{(1-\gamma)^2}$
        \ELSE
        \STATE $h_{j} \leftarrow \argmin_h \sum_{j'=1}^{j} \hat\calL^M(d_{j'}, h) + \beta\norm{h}_2^2$ s.t. $\|h\|_2 \leq \frac{3\sizeS}{(1-\gamma)^2}$
        \ENDIF
        
        \STATE $d_{j+1} \leftarrow \argmax_d \hat\calL^M(d, h_{j})$ s.t. $\max_{s,a} d(s,a)/\bar{d}_t(s,a) \leq B$ for all $t\in \{1,\dots, k\}$ \label{line.compute.d}
    \ENDFOR
    
    
    \STATE \textbf{Return} $\sum_{j=1}^N d_j / N$
    \end{algorithmic}
\end{algorithm}

\input{8.1_compute_table}

\subsection{Total Amount of Compute and Type of Resources}
\label{appdx.exp.compute}
The experiments of the main part (Figure~\ref{fig:main_plots}) were run on a compute cluster with each machine having 4 
Intel Xeon E7-8857 v2 CPUs (4 times 12 cores) and 1.5 TB of RAM.

In Table~\ref{tab.times}, we detail how long each experiment took to complete on these machines.

For the experiments with $w=0.85$ and $w=0.95$, we used machines with two AMD EPYC 7702 64-Core Processors and 2TB of RAM.
% \begin{table}[h]
% \caption{Compute Times of the Experiments}
%     \label{tab.times}
%     \begin{center}
%             \begin{tabular}{lccccc}
%                 \toprule[1.0pt]
%                 {\bf Algorithm} & $\boldsymbol{k}$ & $\boldsymbol{w}$ & $\boldsymbol{v}$ & {\bf time (rounded)}\\
%                 \hline
%                 RR & N\textbackslash{}A & $0.85$  &N\textbackslash{}A &  $\sim 94$ hrs 
%                 \\
%                 \hline
%                 DRR & $3$ & $0.85$ &N\textbackslash{}A &  $\sim $ hrs 
%                 \\
%                 \hline
%                 MDRR & $3$ & $0.85$ & $1.1$ &  $\sim $ hrs%$322490$ sec
%                 \\
%                 \hline
%                 RR & N\textbackslash{}A & $0.95$ &N\textbackslash{}A &  $\sim $ hrs% $376712$ sec
%                 \\
%                 \hline
%                 DRR & $3$ & $0.95$ &N\textbackslash{}A &  $\sim $ hrs %318247$ sec
%                 \\
%                 \hline
%                 MDRR & $3$ & $0.95$ & $1.1$ &  $\sim $ hrs %$361753$ sec
%                 \\
%                 \bottomrule[1.0pt]
%             \end{tabular}
%     \end{center}
% \end{table}
\subsection{Sanity-Check the Fairness of the Comparison}\label{subsec.sanity-check}
By only presenting Figures~\ref{fig:samples1000-to-last-iteration} and~\ref{fig:samples1000-to-last-iteration-w15} in the main paper, we cannot rule out that some of the algorithms converge to very suboptimal solutions. In this case the comparison would be unfair.

Therefore, in order to sanity-check the fairness of the comparison, 
we also investigate the expected value, $V^{d_t}_t$.
This is not directly associated to finding a stable occupancy measure, but should rather be seen as a check to see if the algorithms we propose reach similar solutions.
We compute $V^{d_t}_t$ using the rewards derived from the training sample trajectories. 
In other words, when $\on{Tr}_t$ is the set of trajectories sampled in round $t$, 
and for each trajectory $\tau$,
the reward in step $k$ is $r_t(\tau_k)$, then
$V^{d_t}_t = \sum_{\tau\in \on{Tr}_t}\sum_{k=0}^{l(\tau)} \gamma^k \cdot r_t(\tau_k)$.
Here $l(\tau)$ is the length of trajectory $\tau$.

We see the expected values of the algorithms in Figure~\ref{fig:values}.
As we see, after they settled down, the three algorithms have rather close expected values.
We believe that the differences stem from the initialization of the environment of the second agent rather than from some inherent differences in the algorithms.
\begin{figure*}[]
    \centering
    \setcounter{subfigure}{0}
    \begin{subfigure}[c]{\innerwidth\textwidth}
        \vspace{\innerspaceabove}
        \includegraphics[width=\textwidth]{figures/reward_step}
        \caption{Here $w=0.5$}
        \label{fig:reward-step-iteration}
    \end{subfigure}
    \begin{subfigure}[c]{\innerwidth\textwidth}
        \vspace{\innerspaceabove}
        \includegraphics[width=\textwidth]{figures/reward_step_w.15}
        \caption{Here $w=0.15$}
        \label{fig:reward-step-w}
    \end{subfigure}
    %\includegraphics[width=\textwidth]{}
    \caption{A sanity check if the algorithms reach valid solutions. Since the values of the three algorithms are close to one another, we assert that none of them reaches a much less optimal solution than another one, thereby validating all three approaches.}
    \label{fig:values}
\end{figure*}

\subsection{Additional results for large values of $w$}\label{appdx.large-w}
We additionally ran experiments for larger values of $w$, in particular $w=0.85$ and $w=0.95$. 
The results are depicted in Figure~\ref{fig:larger_w}. Surprisingly, we see that even with these large values of $w$, MDRR outperforms RR and DRR.
This is somewhat counterintuitive, since at such large values of $w$ the environment is almost non-stateful, and we would expect RR to have an advantage here. We believe that this phenomenon is due to the fact that MDRR uses more samples than RR and DRR and therefore has a lower variance, even at the cost of a large bias. This seems to lead to a much better convergence in the settings we studied.
\begin{figure*}[ht]
    \centering
    \begin{subfigure}[c]{\outerwidth\textwidth}
        \includegraphics[width=\textwidth]{figures/w85_occ.pdf}
        \caption{Here $w=0.85$}
    \end{subfigure}
    \begin{subfigure}[c]{\outerwidth\textwidth}
        \includegraphics[width=\textwidth]{figures/w95_occ.pdf}
        \caption{Here $w=0.95$}
    \end{subfigure}
    \caption{
    Convergence plots for less stationary environments, i.e. larger values of $w$.
    Data generated as in Figure~\ref{fig:main_plots}.
    Also here MDRR outperforms the other algorithms.
    }
    \label{fig:larger_w}
\end{figure*}


\section{Additional Theoretical Results}\label{appendix-additional-theory}
\subsection{Example for Assumption~\ref{assumption_sensitivity}}
\label{appdx.example_sensitivity}
We now give an example to illustrate Assumption~\ref{assumption_sensitivity}.
For simplicity we assume that only the probability transition function $P$ changes and not the reward $r$. 
We consider a response model $\mathcal{P}$ which is defined in the following way: 
\begin{align*}
\mathcal{P}(d,P,r)=wP+(1-w)P^*(\pi_d) 
\end{align*}
for some decay rate $w\in(0,1)$ and some response function $P^*(\pi_d)$.
We can think of $\mathcal{P}$ being determined by a population and in each time-step a $(1-w)$ fraction of the population responds to the newly deployed policy $\pi_d$.
Similar settings have been studied in performative prediction~\citep{RRL+22}.

We also assume that the change in $P^*$ is bounded, i.e. $|P^*(\pi_d)(s'|s,a) - P^*(\pi_{d'})(s'|s,a)|\leq c||d-d'||_2$ for all $s,s'\in S$, $a\in A$ and some constant $c>0$. We can then derive the following Proposition.
\begin{proposition}
With the conditions set in this subsection, it holds that
\begin{align*}
||\mathcal{P}(d,P,r)-\mathcal{P}(d',P',r')||_2\leq w||P-P'||_2 + (1-w)c|S|\sqrt{|A|}\cdot||d-d'||_2.
\end{align*}
\end{proposition}
\begin{proof}
\begin{align*}
&||\mathcal{P}(d,P,r)-\mathcal{P}(d',P',r')||_2^2
\\&
=\sum_{s,a,s'}(w\cdot(P(s'|s,a)-P'(s'|s,a))
+(1-w)\cdot(P^*(\pi_d)(s'|s,a)-
P^*(\pi_{d'})(s'|s,a)))^2
\\&
\leq
\sum_{s,a,s'}
(w\cdot(P(s'|s,a)-P'(s'|s,a))
+(1-w)\cdot c\cdot||d-d'||_2)^2
\\&
\leq
\sum_{s,a,s'}
(w\cdot(P(s'|s,a)-P'(s'|s,a)))^2
+\sum_{s,a,s'}((1-w)\cdot c\cdot||d-d'||_2)^2
\\&
+\sum_{s,a,s'}2((1-w)\cdot c\cdot||d-d'||_2)
(w\cdot(P(s'|s,a)-P'(s'|s,a)))
\\&
\leq w^2||P-P'||_2^2+|S|^2|A|\cdot((1-w)c)^2\cdot||d-d'||_2^2
+2\cdot(1-w)\cdot c\cdot||d-d'||_2\cdot w\cdot||P-P'||_1
\\&
\leq w^2||P-P'||_2^2+|S|^2|A|\cdot((1-w)c)^2\cdot||d-d'||_2^2
+2|S|\sqrt{|A|}\cdot(1-w)\cdot c\cdot||d-d'||_2\cdot w\cdot||P-P'||_2
\end{align*}
Taking the square root on both sides gives the desired result.
\end{proof}

Given this proposition, we choose $\epsilon_{p,p}=w$ and $\iota_p=(1-w)c|S|\sqrt{|A|}$ (all other $\iota$ and $\epsilon$ parameters are $0$). Now if $\iota_p=(1-w)c|S|\sqrt{|A|}<1$, Assumption \ref{assumption_sensitivity} holds. That $(1-w)c|S|\sqrt{|A|}$ is smaller than $1$ is likely in many cases where $w$ is large and / or $c$ is small. The value of $w$ being large means that in each time-step, only a small fraction of the population responds to the new policy. This could likely be the case if each time-step encompasses a small amount of time. Additionally the total difference $||P^*(\pi_d)-P^*(\pi_{d'})||_1$ can be in the order of $c\cdot|S|^2|A|\cdot||d-d'||_2$, so the value of $c$ might likely be small.

\subsection{Existence of Stable Points}
Using arguments similar to~\cite{MTR23}, we show that there exists a stable point.
\begin{proposition}
\label{prop.fixedpt}
    If Assumption~\ref{assumption_sensitivity} holds, optimization problem~\eqref{eq:perf-stable-policy} has a fixed point.
\end{proposition}
\begin{proof}
This proposition is very similar to Proposition~1 from~\cite{MTR23}. The proof follows theirs, and we don't repeat the arguments made in their proof. However in order to make use of their arguments, we need to show that  $P_d$ and $r_d$ are continuous in $d$, which is not immediately clear. Recall that $P_d$ and $r_d$ map from occupancy measure $d$ to the environment the process converges to, if the learner always deploys $\pi_d$. We now prove that $P_d$ and $r_d$ are continuous in $d$.

We define $\epsilon \defeq \max(\epsilon_p, \epsilon_r)$.
Then we see that
\begin{align}
\begin{split}
    &\|P_d - P_{d'}\|_2 + \|r_d - r_{d'}\|_2 \leq \iota\norm{d-d'}_2 + \epsilon \left(\norm{P_d - P_{d'}}_2 +\norm{r_d - r_{d'}}_2\right)
   \\ 
    &\leq \dots \leq \sum_{i=0}^{\infty}\iota \epsilon^i\norm{d-d'}_2
    =\frac{\iota}{1-\epsilon}\norm{d-d'}_2
    \label{eq.bound_Pr_by_d}
\end{split}
\end{align}
The inequalities follow from Assumption~\ref{assumption_sensitivity}.
Thus $P_d$ and $r_d$ are continuous in $d$ and the rest of the proof follows
from the same arguments as the proof of Proposition~1 from \cite{MTR23}.
\end{proof}

\subsection{Approximating the Unregularized Objective}\label{appdx.apprx-unregularized}
Using arguments similar to~\cite{MTR23}, we can show the following approximation guarantee for the regularized objective.
\begin{theorem}
For each setting RR, DRR and MDRR, when they approximate a stable policy $d_S$ with 
respect to the regularized objective~\eqref{eq:perf-stable-policy-regularized}, the following guarantee holds:
\begin{align*}
\sum_{s,a} r_{d_S}(s,a)\cdot  d_S(s,a) \ge  \max_{d \in \mathcal{C}(d_S)} \sum_{s,a} r_{d_S}(s,a)\cdot d(s,a) - \calO\left(\frac{\lambda}{(1-\gamma)^2}\right)
\end{align*}
Here $\mathcal{C}(d_S)$ denotes the set of occupancy measures which are feasible with respect to~$P_{d_S}$.
\end{theorem}
\begin{proof}
Since $d_S$ is a stable point with respect to objective~\eqref{eq:perf-stable-policy-regularized}, it holds that
\begin{equation*}
\sum_{s,a} r_{d_S} (s,a)\cdot  d_S(s,a) - \frac{\lambda}{2} \norm{d_S}_2^2 \ge \max_{d \in \mathcal{C}(d_S) } \sum_{s,a} r_{d_S} (s,a)\cdot  d(s,a) - \frac{\lambda}{2} \norm{d}_2^2 
\end{equation*}
Therefore,
\begin{align*}
    \sum_{s,a} r_{d_S} (s,a)\cdot  d_S(s,a) &\ge \max_{d \in \mathcal{C}(d_S) } \sum_{s,a} r_{d_S} (s,a)\cdot  d(s,a) - \frac{\lambda}{2} \norm{d}_2^2 \\
    &\ge \max_{d \in \mathcal{C}(d_S) } \sum_{s,a} r_{d_S} (s,a)\cdot  d(s,a) - \frac{\lambda}{2(1-\gamma)^2} 
\end{align*}
The last inequality uses $\norm{d}_2^2 = \sum_{s,a} d(s,a)^2 = (1-\gamma)^{-2} \sum_{s,a} \left((1-\gamma) d(s,a)\right)^2 \le (1-\gamma)^{-2} \sum_{s,a} (1-\gamma) d(s,a) = (1-\gamma)^{-2}$.
\end{proof}

\subsection{Contraction}\label{appdx.sec.contraction}
In contrast to the main paper, in the appendix $\epsilon$ refers to $\epsilon \defeq \max (\epsilon_p, \epsilon_r)$, which signifies the dependency of the environment on the previous environment.

We define the following distances.
\begin{definition}
    For any occupancy measures $d, d'$, probability transition functions $P, P'$
    and reward functions $r,r'$, we define the distance between $(d,P,r)$ and $(d',P',r')$ to be equal to
    \begin{equation*}
    \dist((d,P,r),(d',P',r')) \defeq \norm{d-d'}_2 + \norm{P-P'}_2 + \norm{r-r'}_2\ .
    \end{equation*}
    We overload notation to also define
    \begin{equation*}
    \dist((P,r),(P',r')) \defeq \norm{P-P'}_2 + \norm{r-r'}_2\ .
    \end{equation*}
\end{definition}

As described in Section~\ref{sec.formal-setting}, we show that the mapping from $(P,r)$ to the successor environment $(\Pc(d,P,r), \Rc(d,P,r))$ is a contraction.
\begin{proposition}\label{prop.contraction}
Let $d$ be some occupancy measure.
When Assumption~\ref{assumption_sensitivity} holds, in particular
$\epsilon_p, \epsilon_r <1$, the mapping $g_d(P, r)\defeq (\Pc(d,P,r), \Rc(d,P,r))$ is a contraction with Lipschitz coefficient $\epsilon$. 
\end{proposition}
\begin{proof}
% For any $(P, r)$ and $(P', r')$ being arbitrary pairs of transition probability and reward functions, we define the distance between them as:
% \begin{equation*}
% \on{dist}((P, r), (P', r')) \defeq \norm{P-P'}_2 + \norm{r-r'}_2
% \end{equation*}

Let $P, P'$ be arbitrary probability transition functions and $r,r'$ arbitrary reward functions.

Then
\begin{align*}
    &\on{dist}(g_d(P,r), g_d(P', r'))= 
    \norm{\Pc(d, P, r) - \Pc(d, P', r')}_2+
    \norm{\Rc(d, P, r) - \Rc(d, P', r')}_2\\
    &\leq 
    \epsilon_{p,p}\norm{P-P'}_2 + \epsilon_{p,r}\norm{r-r'}_2+
    \epsilon_{r,p}\norm{P-P'}_2+\epsilon_{r,r}\norm{r-r'}_2\\
    &\leq 
    \epsilon\norm{P-P'}_2 + \epsilon\norm{r-r'}_2 = \epsilon\cdot\on{dist}((P, r), (P', r'))\ .
\end{align*}
Here the first inequality follows from Assumption~\ref{assumption_sensitivity} and the 
second one follows from the definition of $\epsilon_p$, $\epsilon_r$ and $\epsilon$.
From this the proposition follows.
\end{proof}

\section{Proofs for Repeated Retraining (RR) (Section~\ref{sec.rr})}\label{appdx.rr}
\subsection{Definitions}\label{appdx.rr-def}
We define the following numbers
\begin{definition}\label{def.alphaBeta}
We define    
\begin{align*}
    \alpha &:= \sqrt{3} +
    \frac{\sqrt{7}\sizeS\sqrt{\sizeS}}{(1-\gamma)^2}\text{ and}\\
    \beta &:=
    \frac{(4\sqrt{7}\gamma+3\sqrt{6})\sizeS}{(1-\gamma)^2}+
    \frac{18\sqrt{7}\gamma \sizeS^2\sqrt{\sizeS}}{(1-\gamma)^4} \ .
\end{align*}
\end{definition}
\begin{definition}\label{def.GD}
    Let $\GD(P,r)$ be the solution to the regularized optimization problem, with probability transition function $P$ and reward function $r$, i.e.
    \begin{align*}
    \GD(P,r) \defeq 
           \argmax_{d\ge 0}&\   \sum_{s,a} d(s,a) r(s,a) - \frac{\lambda}{2}\norm{d}_2^2\\
       \textrm{s.t. } & \sum_a d(s,a) = \rho(s) + \gamma \cdot \sum_{s',a} d(s',a) P(s',a,s)\ \forall s\ .
    \end{align*}
\end{definition}

\subsection{RR in the Exact Setting (Theorem~\ref{thm:standard-rr-simple})}
\label{appdx.rr-exact}
We show the following more general version of Theorem~\ref{thm:standard-rr-simple}.
\begin{theorem}\label{thm:standard-rr}
    Assume that Assumption~\ref{assumption_sensitivity} holds
    and
        \begin{equation*}\lambda> \max\left\{(1-\epsilon_{p})^{-1}
            \beta,  (1-\epsilon_{r})^{-1} \alpha \right\}\end{equation*}
    Then for any $\delta>0$, we have
    \begin{equation*}
    \norm{d_t - d_S}_2 \leq \delta 
    \text{\quad for all } t \geq
    \frac{\ln\left(
    \frac{\norm{d_0 - d_S}_2 + \norm{P_0- P_S}_2 + \norm{r_0-r_S}_2}{\delta}
    \right)}{\ln\left(\left(\max\left\{
    \iota, \epsilon_p + \frac{\beta}{\lambda}, \epsilon_r + \frac{\alpha}{\lambda}
    \right\}\right)^{-1}\right)} + 1,
    \end{equation*}
    %application of $f$ converges to the unique fixed point of $f$.
    with $\alpha$ and $\beta$ defined in Definition~\ref{def.alphaBeta}.
\end{theorem}
We first discuss how to obtain Theorem~\ref{thm:standard-rr-simple} from Theorem~\ref{thm:standard-rr}.
Assumption~\ref{assumption-simplicity} ensures that $\beta \geq \alpha$, $\epsilon_p = \epsilon_r = \epsilon$ and $\iota \leq \epsilon$.
We further bound $\norm{d_0 -d_S}_2 \leq \frac{2}{1-\gamma}$, 
$\norm{P_0-P_S}_2\leq \sqrt{2\sizeS\sizeA}$ and $\norm{r_0-r_S}_2\leq \sqrt{\sizeS\sizeA}$.
Choosing $\lambda = 2\beta (1-\epsilon)^{-1}$ then provides the desired bounds.

The proof of Theorem~\ref{thm:standard-rr} has a similar structure to the proof of Theorem~4 in~\cite{BHK20}.
\begin{proof}[Proof of Theorem~\ref{thm:standard-rr}]
    We define by $f$ the mapping from $(d_{t-1}, P_{t-1}, r_{t-1})$ to $(d_t, P_t, r_t)$, i.e.
    \begin{equation*}f(d, P, r) := (\GD(P,r), \Pc(d, P, r), \Rc(d, P, r))\ .\end{equation*}
    
    We analyze $ \on{dist}(f(d, P, r), f(d', P', r')) $.
    \begin{equation}
        \label{dist_ff}
        \begin{split}
            \on{dist}(f(d, P, r), f(d', P', r'))
                =& \|\GD(P, r) - \GD(P', r') \|_2 \\
                &+ \|\Pc(d, P, r) - \Pc(d', P', r')\|_2 \\
                &+ \|\Rc(d, P, r) - \Rc(d', P', r')\|_2
        \end{split}
    \end{equation}
    The last two terms of this sum can be bounded by using 
    Assumption~\ref{assumption_sensitivity}~:
    \begin{equation} \label{diff_tr}
    \begin{split}
        &\|\Pc(d, P, r) - \Pc(d', P', r')\|_2 + \|\Rc(d, P, r) - \Rc(d', P', r')\|_2\\
        \leq&\  (\iota_p+\iota_r)\|d-d'\|_2 + (\epsilon_{p,p}+\epsilon_{r,p})\|P-P'\|_2
        + (\epsilon_{p,r}+\epsilon_{r,r})\|r-r'\|_2
    \end{split}
    \end{equation}

    We now bound the first term of \eqref{dist_ff}, i.e. 
    $\|\GD(P, r) - \GD(P', r') \|_2$.

    From Lemma~\ref{lem:GG_alphabeta}, we get
    \begin{align}\label{diff_GG}
    \|\GD(P, r) - \GD(P', r')\|_2\leq 
    \frac{\alpha}{\lambda} \|r-r'\|_2 + \frac{\beta}{\lambda}\|P-P'\|_2
    \end{align}

    Combining \eqref{dist_ff}, \eqref{diff_tr} and \eqref{diff_GG} we get

    \begin{align}
    \begin{split}
        &\on{dist}(f(d,P,r), f(d', P', r')) 
        \leq \iota_d\|d-d'\|_2\\
        % P
        &+\left(\epsilon_{p}+
        \frac{\beta}{\lambda}
             \right)\|P-P'\|_2 
        % r
        + \left(
        \epsilon_{r}+
        \frac{\alpha}{\lambda}
            \right)\|r-r'\|_2
            \label{eq:contraction-standard-case}
    \end{split}
    \end{align}
    We define $q:= \max\left(\iota_d, \epsilon_{p}+\frac{\beta}{\lambda},
    \epsilon_{r}+ \frac{\alpha}{\lambda} 
            \right)$.
    From \eqref{eq:contraction-standard-case} and the definition of $q$, 
    it follows that
    \begin{align*}
        &\on{dist}((d_{t},P_{t},r_{t}), (d_S, P_S, r_S))=\on{dist}(f(d_{t-1},P_{t-1},r_{t-1}), f(d_S, P_S, r_S)) \\
        \leq& q\on{dist}((d_{t-1},P_{t-1},r_{t-1}), (d_S, P_S, r_S))
        %(\norm{d_{t-1} - d_S}_2 + \norm{P_{t-1}-P_S}_2 + \norm{r_{t-1} - r_S}_2)
        \leq q^t\left(\norm{d_0 - d_S}_2 + \norm{P_0-P_S}_2 + \norm{r_0 - r_S}_2\right),
    \end{align*}
    where the first equality follows from the fact that $(d_S, P_S, r_S)$ is a fixed point
    of $f$.
    
    Note that by the conditions on $\lambda, \iota, \epsilon_p$ and  $\epsilon_r$, 
    it holds that $q< 1$.
   
   Therefore, if we set
   $t\geq \ln(\dist((d_1, P_0, r_0), (d_S, P_S, r_S))/\delta)/\ln(1/q) + 1$,
   then we get that
   \begin{equation*}\dist((d_{t},P_{t-1},r_{t-1}), (d_S, P_S, r_S))\leq \delta.\end{equation*}
   Then also $\norm{d_t - d_S}_2 \leq \delta$.
\end{proof}

\begin{lemma}[similar to lemma~2 of~\cite{BHK20}]\label{lem:GG_alphabeta}
    Let $P, \hat{P}$ be two probability transition functions and 
    $r, \hat{r}$ be two reward functions. Then
    \begin{equation*}
    \|\GD(P, r) - \GD(\hat{P}, \hat{r})\|_2\leq 
    \frac{\alpha}{\lambda} \|r-\hat{r}\|_2 + \frac{\beta}{\lambda}\|P-\hat{P}\|_2
    \end{equation*}
    with $\alpha$ and $\beta$ from Definition~\ref{def.alphaBeta}.
\end{lemma}
\begin{proof}
    Let $M$ and $\hat{M}$ be two MDPs and 
    $r$ and $\hat{r}$ be the corresponding reward functions and 
    $P$ and $\hat{P}$ be the corresponding transition probability functions.

    In the following we use some arguments from~\cite{MTR23}.
    Those arguments apply here as well, since we use the same optimization problem as they do.

    Let $h$ and $\hat{h}$ be the optimal solutions of the dual objective (12) in \cite{MTR23} for
    $M$ and $\hat{M}$, respectively.

    From~\cite{MTR23} we get that (page 16, after ``We now substitute the above bound in equation 15.'')

    \begin{align}
        -\frac{\sizeA(1-\gamma)^2}{\lambda}\left\|h-\hat{h}\right\|_2^2 \geq 
        -\left\|h-\hat{h}\right\|_2 \left\|\nabla \mathcal{L}(\hat{h};M) 
        - \nabla\mathcal{L}(\hat{h}, \hat{M})\right\|_2
        \label{hh2}
    \end{align}

    and also from~\cite{MTR23} 
    \begin{align}
        \left\|\nabla \mathcal{L}(\hat{h};M) 
        - \nabla\mathcal{L}(\hat{h}, \hat{M})\right\|_2
        &\leq \frac{4\sizeS\sqrt{\sizeA}}{\lambda}\left\|r-\hat{r}\right\|_2 
        + \left(\frac{4\gamma\sqrt{\sizeS\sizeA}}{\lambda}+\frac{6\gamma\sqrt{\sizeA}\sizeS}{\lambda}
        \left\|\hat{h}\right\|_2\right)
        \left\|P-\hat{P}\right\|_2\nonumber\\
        &\leq \frac{4\sizeS\sqrt{\sizeA}}{\lambda}\left\|r-\hat{r}\right\|_2 
        + \left(\frac{4\gamma\sqrt{\sizeS\sizeA}}{\lambda}+\frac{6\gamma\sqrt{\sizeA}\sizeS}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\right)
        \left\|P-\hat{P}\right\|_2\label{lemma_3_4}
    \end{align}
    The first inequality is due to lemma~3 of~\cite{MTR23} and the 
    second inequality is due to lemma~4 in~\cite{MTR23}.

    Combining \eqref{hh2} and \eqref{lemma_3_4} we get:
    \begin{align}
        \left\|h-\hat{h}\right\|_2 \leq&
        \frac{\lambda}{\sizeA(1-\gamma)^2}
        \left\|\nabla \mathcal{L}(\hat{h};M) 
        - \nabla\mathcal{L}(\hat{h}, \hat{M})\right\|_2\nonumber\\
        \leq&
        \frac{\lambda}{\sizeA(1-\gamma)^2}\Biggl(
        \frac{4\sizeS\sqrt{\sizeA}}{\lambda}\left\|r-\hat{r}\right\|_2 
        + \left(\frac{4\gamma\sqrt{\sizeS\sizeA}}{\lambda}+\frac{6\gamma\sqrt{\sizeA}\sizeS}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\right)
        \left\|P-\hat{P}\right\|_2\Biggr)\label{bound_hh}
    \end{align}

    Another result from~\cite{MTR23}, which is found in the proof of lemma~1 is:
    \begin{align}
        \left\|\GD(P, r)-\GD(\hat{P}, \hat{r})\right\|_2^2\leq \frac{3}{\lambda^2}\|r-\hat{r}\|_2^2 
        + \frac{7\sizeA\sizeS}{\lambda^2}\left\|h-\hat{h}\right\|_2^2 + \frac{6}{\lambda^2}
        \left\|\hat{h}\right\|_2^2\left\|P-\hat{P}\right\|_2^2
        \label{bound_gg22}
    \end{align}

    Combining \eqref{bound_hh} and \eqref{bound_gg22} it follows that
    \begin{align}
        \left\|\GD(P, r)-\GD(\hat{P}, \hat{r})\right\|_2
        \leq& \frac{\sqrt{3}}{\lambda}\|r-\hat{r}\|_2
        + \frac{\sqrt{7\sizeA\sizeS}}{\lambda}\left\|h-\hat{h}\right\|_2 + \frac{\sqrt{6}}{\lambda}
        \left\|\hat{h}\right\|_2\left\|P-\hat{P}\right\|_2\nonumber\\
        \leq& \frac{\sqrt{3}}{\lambda}\|r-\hat{r}\|_2
        + \frac{\sqrt{7\sizeA\sizeS}}{\lambda}\left\|h-\hat{h}\right\|_2
        + \frac{\sqrt{6}}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\left\|P-\hat{P}\right\|_2\label{bound_gg}
    \end{align}
    where the last inequality follows from lemma~4 of~\cite{MTR23}.

    Combining \eqref{bound_hh} and \eqref{bound_gg} we get:
    \begin{align*}
        &\left\|\GD(P, r)-\GD(\hat{P}, \hat{r})\right\|_2
        \leq \frac{\sqrt{3}}{\lambda}\|r-\hat{r}\|_2
        + \frac{\sqrt{7\sizeA\sizeS}}{\lambda}
        \frac{\lambda}{\sizeA(1-\gamma)^2}\Biggl(
        \frac{4\sizeS\sqrt{\sizeA}}{\lambda}\left\|r-\hat{r}\right\|_2 \nonumber\\&
        + \left(\frac{4\gamma\sqrt{\sizeS\sizeA}}{\lambda}+\frac{6\gamma\sqrt{\sizeA}\sizeS}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\right)
        \left\|P-\hat{P}\right\|_2\Biggr) %\nonumber\\&
        + \frac{\sqrt{6}}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\left\|P-\hat{P}\right\|_2\nonumber\\
        =&
        \Biggl(\frac{\sqrt{3}}{\lambda} + 
        \frac{\sqrt{7\sizeA\sizeS}}{\lambda}
        \frac{\lambda}{\sizeA(1-\gamma)^2}
        \frac{4\sizeS\sqrt{\sizeA}}{\lambda}\Biggr)\|r-\hat{r}\|_2 \nonumber\\&
        + \Biggl(\frac{\sqrt{7\sizeA\sizeS}}{\lambda}
        \frac{\lambda}{\sizeA(1-\gamma)^2}
        \Biggl(\frac{4\gamma\sqrt{\sizeS\sizeA}}{\lambda}+\frac{6\gamma\sqrt{\sizeA}\sizeS}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\Biggr)
         + \frac{\sqrt{6}}{\lambda}
        \frac{3\sizeS}{(1-\gamma)^2}\Biggr)
        \left\|P-\hat{P}\right\|_2\nonumber\\
        =&
        % r
            \Biggl(
            \frac{\sqrt{3}}{\lambda} +
            \frac{4\sqrt{7}\sizeS\sqrt{\sizeS}}{(1-\gamma)^2\lambda}
            \Biggr)\|r-\hat{r}\|_2 %\\&
        % P
            + \Biggl(
            \frac{(4\sqrt{7}\gamma+3\sqrt{6})\sizeS}{(1-\gamma)^2\lambda}+
            \frac{18\sqrt{7}\gamma \sizeS^2\sqrt{\sizeS}}{(1-\gamma)^4\lambda}
             \Biggr)
            \left\|P-\hat{P}\right\|_2
    \end{align*}
\end{proof}

\subsection{RR with Finite Samples (Theorem~\ref{thm:finite-samples-RR-simple})}
In general we note that using our sample generation model, it is easy to get an estimate of the current occupancy measure $\bar{d}$, by comparing how many samples were drawn for each pair $(s,a)$ and how many samples were drawn overall. It is also straightforward to bound those estimates using standard methods such as Hoeffding's inequality. 
For simplicity, we implicitly assume that those occupancy measures are provided. More concretely, in Lagrangians~\eqref{eq:empirical-Lagrangian} and \eqref{eq:lagrangian-finite-mdrr} we assume that $\bar{d}_j$ is given.

\label{appdx.rr-finite}
\begin{definition}
We denote by $\hGD(d_t, F)$ the solution to the optimization problem corresponding to $\hat\cL$, i.e.,
\begin{align*}
\hGD(d_t, F) \defeq
    \argmax_d \min_h \underbrace{\left(- \frac{\lambda}{2} \norm{d}_2^2 + \sum_s h(s) \rho(s) 
    +
    \sum_{(s,a,r,s')\in F} \frac{d(s,a)}{\bar{d}_t(s,a)} \cdot \frac{r - h(s) + \gamma h(s')  }{\abs{F}(1-\gamma)}\right)}_{=\hat{\calL}}
\end{align*}
\end{definition}

We use the following result from \cite{MTR23}.
\begin{lemma}
    Given an arbitrary occupancy measure $d$, probability transition function $P$ and reward function
    $r$, suppose that $\GD(P, r)(s,a)/\bar{d}(s,a) \leq B$ 
    for all $(s,a)\in \setS\times \setA$, where $\bar{d}$ is the occupancy measure of $\pi_d$ in an environment
    with transition probabilities $P$. Furthermore, let $F$ be a set of samples drawn according to the occupancy
    measure $\bar{d}$ with $r$ being the reward function. We assume
     \begin{align*}
        \abs{F} \ge \frac{1}{\mu^2} \left( \sizeA\ln\left(\frac{2}{\delta_1}\right) +  \ln\left(\frac{12\sizeS}{
            \mu(1-\gamma)^2}\right) 
        + 2 \sizeA \ln\left(\frac{\ln\left(\frac{3\sizeS^2\sizeA B}{\mu(1-\gamma)^2}\right)}{\mu}\right)\right),
    \end{align*}
    for arbitrary $\mu, \delta_1>0$.
    Then the following bound holds with probability at least $1-\delta_1$.
    \begin{align*}
        \|\GD(P, r) - \hGD(d, F)\|_2 &\le 
        \frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} } 
        \frac{1}{\sqrt{\lambda}}
    \end{align*}
    \label{lem:bound-GD-hGD}
\end{lemma}
This lemma follows from the equation which comes second after equation (23) in the work from \cite{MTR23} on page 30, after the text ``Rearranging and using lemma 12 we get the following bound''.
The conditions follow from the conditions under which this equation holds in the work from \cite{MTR23}. 
Note that we write $\mu$ instead of $\epsilon$, which is the variable name used in~\cite{MTR23}.
The same arguments as in \cite{MTR23} hold, since they also look at the one step updates optimizing $\cL$ and $\hat\cL$, which are the same in this work.

We can then show a more general version of Theorem~\ref{thm:finite-samples-RR-simple}.
\begin{theorem}\label{thm:finite-samples-RR}
    Suppose that overlap Assumption~\ref{assumption-offline-rl} holds for $k=1$ and parameter $B$
    and 
    Assumption~\ref{assumption_sensitivity} holds.
    Let  $(x_p, x_r)\in\{(\iota_p, \iota_r), 
    (\epsilon_{p,p}, \epsilon_{r,p}),(\epsilon_{p,r}, \epsilon_{r,r})\}$ be the pair
    maximizing $\left(\frac{\alpha}{\lambda}+1\right)x_r  + \left(\frac{\beta}{\lambda}+1\right)x_p$.
    We then assume that
    \begin{equation*}\lambda> \max\left\{(1-\epsilon_{p})^{-1}
        \beta, 
        (1-\epsilon_{r})^{-1}
        \alpha,\frac{\alpha x_r+\beta x_p}{1
        -\zeta-x_r-x_p} \right\}.\end{equation*}
    %and $1>x_r+x_p$. 
    Furthermore assume that 
    \begin{align*}
        m_t \ge &
        \left(\frac{\xi}{\lambda \zeta^2}\right)^2
        \left( \sizeA\ln\left(\frac{4t^2}{p}\right) +  \ln\left(\frac{12\sizeS \xi}{\lambda 
            \zeta^2(1-\gamma)^2}\right)
        + 2 \sizeA \ln\left(\frac{\xi\ln\left(\frac{3\sizeS^2\sizeA B\xi}{\lambda 
            \zeta^2(1-\gamma)^2}\right)}{\lambda \zeta^2}\right)\right)\ ,
    \end{align*}
    with $\xi =\frac{36\sizeS^{1.5}(B+\sqrt{\sizeA})}{\delta^2(1-\gamma)^3}$.
    Then for any $\delta>0$, we have
    \begin{equation*}
    \norm{d_t - d_S}_2 \leq \delta
    \text{\quad for all } t\geq 
    \frac{\ln\left(\frac{\norm{d_1 - d_S}_2 + \norm{P_0-P_S}_2 + \norm{r_0 - r_S}_2}{\delta}\right)}
    {\ln\left(1/\left(\zeta+\Big(\frac{\alpha}{\lambda}+1\Big)x_{r}+
    \Big(\frac{\beta}{\lambda} +1\Big)x_{p}\right)\right)} + 1.
    \end{equation*}
    Here $\zeta$ can be chosen to be an arbitrary value between $0$ and $1-x_r-x_p$. 
    It defines a trade-off between
    the conditions on the regularization parameter $\lambda$ and on the number of 
    samples $m_t$.
\end{theorem}
Theorem~\ref{thm:finite-samples-RR-simple} follows from Theorem~\ref{thm:finite-samples-RR} in the following way.
We set $\zeta = (1-\epsilon)/2$, and $\lambda = \frac{2\epsilon(\alpha+\beta)}{1-\epsilon}$.

Then for the denominator of the number of retrainings, we derive
\begin{align*}
    \zeta+\Big(\frac{\alpha}{\lambda}+1\Big)\frac{\epsilon}{2}+
    \Big(\frac{\beta}{\lambda} +1\Big)\frac{\epsilon}{2}
    =
    \frac{1}{2} + \frac{\epsilon}{2} +\frac{\epsilon(\alpha+\beta)}{2\lambda} 
    =
    \frac{1}{2} + \frac{\epsilon}{2} + \frac{1-\epsilon}{4}
    =\frac{3}{4}+\frac{\epsilon}{4}
\end{align*}
We can bound $\norm{d_1 - d_S}_2$, $\norm{P_0 - P_S}_2$ and $\norm{r_0 - r_S}_2$ like above, to obtain
\begin{align*}
t\geq 
\frac{\ln\left(\frac{\frac{2}{1-\gamma}+\left(1+\sqrt{2}\right)\sqrt{\sizeS\sizeA}}{\delta}\right)}
        {\ln\left(1/\left(\frac{3}{4}+\frac{\epsilon}{4}\right)\right)}
    + 1
\end{align*}
\begin{proof}[Proof of Theorem~\ref{thm:finite-samples-RR}]
    From lemma~\ref{lem:bound-GD-hGD}, we get that with probability $1-\delta_1$
    \begin{align}
        \|\GD(P_t, r_t) - \hGD(d_t, F_t)\|_2 &\le 
        \frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} } 
        \frac{1}{\sqrt{\lambda}},
        \label{eq:RR-finite-samples-GD-hGD}
    \end{align}
    as long as 
    \begin{align*}
        m_t \ge \frac{1}{\mu^2} \left( \sizeA\ln\left(\frac{2}{\delta_1}\right) +  \ln\left(\frac{12\sizeS}{
            \mu(1-\gamma)^2}\right) 
        + 2 \sizeA \ln\left(\frac{\ln\left(\frac{3\sizeS^2\sizeA B}{\mu(1-\gamma)^2}\right)}{\mu}\right)\right).
    \end{align*}
    If we set
    $\delta_1 = p/2t^2$ in step $t$, we get that event \eqref{eq:RR-finite-samples-GD-hGD} 
    holds with probability at least $1-p/2t^2$ in round $t$.
    Via a union bound over all rounds, we get that event~\eqref{eq:RR-finite-samples-GD-hGD}
    holds with probability at least $1-p$ in all rounds.

    Let $\hat{g}(d_{t+1}, P_t, r_t)$ be the result after one round, i.e.
    \begin{equation*}
    \hat{g}(d_{t+1}, P_t, r_t) = (\hGD(d_{t+1}, F_{t+1}), 
    \cP(d_{t+1}, P_t, r_t), \cR(d_{t+1}, P_t, r_t))\ .
    \end{equation*}
    I.e. it holds that $(d_{t+2}, P_{t+1}, r_{t+1}) = \hat{g}(d_{t+1}, P_t, r_t)$.

    We analyze
    \begin{align}
        \on{dist}(\hat{g}(d_{t+1}, P_t, r_t), (d_S, P_S, r_S)) =
        %&\ \ \ \| \hGD(d_{t+1}, F_{t+1}) - d_S\|_2 \nonumber\\
        %&+ \|\cP(d_{t+1}, P_t, r_t) - P_S \|_2 \nonumber\\
        %&+ \|\cR(d_{t+1}, P_t, r_t) - r_S \|_2 \nonumber\\ 
        %=
        &\ \ \ \| \hGD(d_{t+1}, F_{t+1}) - d_S\|_2 \nonumber\\
        &+ \|\cP(d_{t+1}, P_t, r_t) - \cP(d_S, P_S, r_S)\|_2 \nonumber
        + \|\cR(d_{t+1}, P_t, r_t) - \cR(d_S, P_S, r_S)\|_2 \nonumber\\
        \begin{split}
        \leq &\ \ \ \| \hGD(d_{t+1}, F_{t+1}) - d_S\|_2 \\
        &+ \iota_d\|d_{t+1}-d_S\|_2 
        + \epsilon_{p}\|P_t-P_S\|_2
        + \epsilon_{r}\|r_t-r_S\|_2
            \label{dist_gh_dsPsrs_first}
        \end{split}
    \end{align}
    where the last inequality is due to Assumption~\ref{assumption_sensitivity}.

    It remains to analyze $\|\hGD(d_{t+1}, F_{t+1}) - d_S\|_2$.
    Using equation~\eqref{eq:RR-finite-samples-GD-hGD}, we see that 
    \begin{align}
        \| \hGD(d_{t+1}, F_{t+1}) - d_S\|_2 &\leq 
        \|\hGD(d_{t+1}, F_{t+1}) - \GD(P_{t+1}, r_{t+1})\|_2 
        + \|\GD(P_{t+1}, r_{t+1}) - d_S\|_2\nonumber\\
        &\leq \frac{6\sqrt{\sizeS^{1.5}(B+\sqrt{\sizeA})\mu}}{(1-\gamma)^{1.5}}\frac{1}{\sqrt{\lambda}}
        +\|\GD(P_{t+1}, r_{t+1}) - \GD(P_S, r_S)\|_2 \label{gh_minus_ds}
    \end{align}

    Furthermore we can derive
    \begin{align}
        &\|\GD(P_{t+1}, r_{t+1}) - \GD(P_S, r_S)\|_2
        \leq
        \frac{\alpha}{\lambda}\|r_{t+1}-r_S\|_2
            +\frac{\beta}{\lambda}\norm{P_{t+1}-P_S}_2\nonumber\\
            =&
            \frac{\alpha}{\lambda}\|\cR(d_{t+1}, P_t, r_{t})-\cR(d_S, P_S, r_S)\|_2
            +\frac{\beta}{\lambda}
            \left\|\cP(d_{t+1}, P_t, r_{t})-\cP(d_S, P_S, r_S)\right\|_2\nonumber\\
        \begin{split}
        \leq&
        % r
            \frac{\alpha}{\lambda}(
            \iota_r\|d_{t+1}-d_S\|_2 + 
            \epsilon_{r,p}\|P_t-P_S\|_2 + \epsilon_{r,r}\|r_t-r_S\|_2)
            \\
        % P
            &+\frac{\beta}{\lambda} 
             ( \iota_p\|d_{t+1}-d_S\|_2 + 
            \epsilon_{p,p}\|P_t-P_S\|_2 + \epsilon_{p,r}\|r_t-r_S\|_2)
            \label{norm_G_t+1_G_S}
        \end{split}
    \end{align}
    where the first inequality follows from lemma~\ref{lem:GG_alphabeta}, the equality uses
    the facts that $P_{t+1} = \cP(d_{t+1}, P_t, r_t)$, $r_{t+1} = \cR(d_{t+1}, P_t, r_t)$,
     $P_{S} = \cP(d_{S}, P_S, r_S)$ and $r_{S} = \cR(d_S, P_S, r_S)$,
     and the last inequality follows from Assumption~\ref{assumption_sensitivity}.

    Inserting \eqref{norm_G_t+1_G_S} into \eqref{gh_minus_ds} and the result 
    into \eqref{dist_gh_dsPsrs_first}, we get

    \begin{align}
        \begin{split}
        \on{dist}(\hat{g}(d_{t+1}, P_t, r_t), (d_S, P_S, r_S))
        \leq& \frac{6\sqrt{\sizeS^{1.5}(B+\sqrt{\sizeA})\mu}}{(1-\gamma)^{1.5}}\frac{1}{\sqrt{\lambda}}\\
            &+ \left(\left(\frac{\alpha}{\lambda}+1\right)\iota_r+\left(\frac{\beta}{\lambda}
            +1\right)\iota_p\right)\|d_{t+1}-d_S\|_2 \\
            &+ \left(\left(\frac{\alpha}{\lambda}+1\right)\epsilon_{r,p}+\left(\frac{\beta}{\lambda}
            +1\right)\epsilon_{p,p}\right)\|P_t-P_S\|_2 \\
            &+ \left(\left(\frac{\alpha}{\lambda}+1\right)\epsilon_{r,r}+\left(\frac{\beta}{\lambda}
            +1\right)\epsilon_{p,r}\right)\|r_t-r_S\|_2
            \label{dist_gh_dsPsrs}
        \end{split}
    \end{align}

    We now introduce a new parameter $\zeta\in (0,1-x_r-x_p)$, which is mentioned in the theorem.
    We set $\mu = \frac{\zeta^2\delta^2\lambda (1-\gamma)^3}{36\sizeS^{1.5}(B+\sqrt{\sizeA})}$.

    This allows us to rewrite \eqref{dist_gh_dsPsrs} into 
    \begin{align}
        \on{dist}(\hat{g}(d_{t+1}, P_t, r_t), (d_S, P_S, r_S))
        \leq& \zeta\delta\nonumber\\
            &+ \left(\left(\frac{\alpha}{\lambda}+1\right)\iota_r+\left(\frac{\beta}{\lambda}+1\right)
            \iota_p\right)\|d_{t+1}-d_S\|_2 \nonumber\\
            &+ \left(\left(\frac{\alpha}{\lambda}+1\right)\epsilon_{r,p}+\left(\frac{\beta}{\lambda}+1\right)
            \epsilon_{p,p}\right)\|P_t-P_S\|_2 \nonumber\\
            &+ \left(\left(\frac{\alpha}{\lambda}+1\right)\epsilon_{r,r}+\left(\frac{\beta}{\lambda}+1\right)
            \epsilon_{p,r}\right)\|r_t-r_S\|_2
            \nonumber\\
            \begin{split}
                \leq& \zeta\delta
                + \left(\left(\frac{\alpha}{\lambda}+1\right)x_{r}+\left(\frac{\beta}{\lambda}+1\right)
                x_{p}\right)\\
                &\cdot\left(\|d_{t+1}-d_S\|_2 +\|P_t-P_S\|_2+\|r_t-r_S\|_2\right)
                \label{dist_gh_dsPsrs_simpler}
            \end{split}
    \end{align}
    where we select $(x_p, x_r)\in\{(\iota_p, \iota_r), 
    (\epsilon_{p,p}, \epsilon_{r,p}),(\epsilon_{p,r}, \epsilon_{r,r})\}$ to be the pair
    maximizing $\left(\frac{\alpha}{\lambda}+1\right)x_r 
    + \left(\frac{\beta}{\lambda}+1\right)x_p$.

    Note that by this formulation of $\mu$, the bound on $m_t$ becomes
    \begin{align*}
        m_t \ge& 
        \left(\frac{36\sizeS^{1.5}(B+\sqrt{\sizeA})}{\zeta^2\delta^2\lambda (1-\gamma)^3}\right)^2
        \Bigg( \sizeA\ln\left(\frac{4t^2}{p}\right) +  \ln\left(\frac{432\sizeS^{2.5}(B+\sqrt{\sizeA})}{
            \zeta^2\delta^2\lambda(1-\gamma)^5}\right) \\
        &+ 2 \sizeA \ln\left(\frac{36\sizeS^{1.5}(B+\sqrt{\sizeA})\ln\left(\frac{108\sizeS^{3.5}\sizeA B(B+\sqrt{\sizeA})}{
            \zeta^2\delta^2\lambda(1-\gamma)^5}\right)}{\zeta^2\delta^2\lambda (1-\gamma)^3}\right)\Bigg)
    \end{align*}

    We now apply lemma~\ref{lem:contraction-case-distinction} on the sequence 
    $\{(d_{t+1}, P_t, r_t)\}_{t\in\mathbb{N}}$ using~\eqref{dist_gh_dsPsrs_simpler}.
    We can do this, because
    by our assumption, we know that $\lambda > \frac{\alpha x_r+\beta x_p}{1
    -\zeta-x_r-x_p}$ and $1>\zeta+x_r+x_p$, so
    \begin{align*}
        &\zeta+\Big(\frac{\alpha}{\lambda}+1\Big)x_{r}+
        \Big(\frac{\beta}{\lambda} +1\Big)x_{p}\\
        <& \zeta + x_r +x_p + \frac{\alpha x_r 
        (1-\zeta - x_r - x_p)}{\alpha 
        x_r + \beta x_p} 
        + \frac{\beta x_p(1-\zeta - x_r - x_p)}{\alpha 
        x_r + \beta x_p} = 1\ .
    \end{align*}
    The bound stated in the Theorem follows by the application of lemma~\ref{lem:contraction-case-distinction} and 
    the fact that $\norm{d_{t+1} - d_S}_2 \leq \on{dist}((d_{t+1}, P_t, r_t), (d_S, P_S, r_S))$.
\end{proof}

We make use of the following argument, which is often used in the performative prediction setting~\citep{PZM+20,BHK20,MTR23}.
\begin{lemma}\label{lem:contraction-case-distinction}
    Let $(\mathcal{M}, \dist)$ be a metric space
    and $x_1, x_2 \geq 0$ with $x_1 + x_2 < 1$.
    Assume that $\{p_i\}_{i\in\mathbb{N}}$ is a sequence of points in $\mathcal{M}$ such that 
    there exists a unique $p_S\in \mathcal{M}$ with
    \begin{equation*}
    \dist(p_{i+1}, p_S) \leq x_1 \delta + x_2 \dist(p_i, p_S)\ \ \ \text{ for all }i\geq 0\ .
    \end{equation*}
    Then for $n\geq \frac{\ln\left(\dist(p_0, p_S)/\delta\right)}{
    \ln(1/(x_1+x_2))}$, it holds that $\dist(p_n, p_S)\leq \delta $.
\end{lemma}
\begin{proof}
    We see this via the following case distinction.
    Let $i\geq 0$ be arbitrary.
    \begin{description}
    \item{\casebycase{1}{$\dist(p_i, p_S)\geq \delta$}}
        Then 
        \begin{equation*} 
        \dist(p_{i+1}, p_S) \leq \dist(p_i, p_S) (x_1 + x_2)\ .
        \end{equation*}
    \item{\casebycase{2}{$\dist(p_i, p_S)< \delta$}}
        Then 
        \begin{equation*} 
        \dist(p_{i+1}, p_S) \leq \delta (x_1 + x_2)\ .
        \end{equation*}
    \end{description}
    By this case distinction, via induction we get that
    $\dist(p_i, p_S)\leq \max((x_1+x_2)^i\dist(p_0, p_S), \delta)$.
    In particular for $n = \frac{\ln\left(\dist(p_0, p_S)/\delta\right)}{
    \ln(1/(x_1+x_2))}$ 
    it holds that 
    \begin{equation*}
    \dist(p_n, p_S)\leq \max((x_1+x_2)^n\dist(p_0, p_S), \delta) \leq \delta\ .
    \end{equation*}
\end{proof}


\section{Proofs for Delayed Repeated Retraining (DRR) (Section~\ref{sec.drr})}
\label{appdx.sec.drr}
\subsection{DRR in the Exact Setting (Theorem~\ref{thm:delayed_rr_standard-simple})}
\label{appdx.sec.drr-exact}
We show a more general version of Theorem~\ref{thm:delayed_rr_standard-simple}.
\begin{theorem}\label{thm:delayed_rr_standard}
    Suppose Assumption~\ref{assumption_sensitivity} holds 
    and $\lambda>\frac{2\iota\phi}{1-\epsilon}$, 
    where $\phi := \max(\alpha, \beta)$ and $\alpha, \beta$ as in Definition~\ref{def.alphaBeta}.
    Then with $d_i$ being calculated by DRR in the exact setting, with
    $k=\ln^{-1}\left(\frac{1}{\epsilon}\right)\ln\left(\frac{\distpr}{\delta\iota}\right)$,
    it holds that
    \begin{equation*}
    \norm{d_i - d_S}_2 \leq \delta
    \text{\quad for all } i\geq \ln\left(\frac{\norm{d_0 - d_S}_2}{\delta}\right)/
    \ln\left(\frac{\lambda(1-\epsilon)}{2\phi\iota}\right)\ .
    \end{equation*}
\end{theorem}
We first discuss how Theorem~\ref{thm:delayed_rr_standard-simple} follows from Theorem~\ref{thm:delayed_rr_standard}.
Assumption~\ref{assumption-simplicity} ensures that $\beta \geq \alpha$, $\epsilon_p = \epsilon_r = \epsilon$ and $\iota \leq \epsilon$.
We bound $\norm{d_0-d_S}_2\leq \frac{2}{1-\gamma}$.
Choosing $\lambda=2e\iota\beta(1-\epsilon)^{-1}$ then provides the desired bounds.

For proving Theorem~\ref{thm:delayed_rr_standard}, we use arguments similar to the ones \cite{BHK20} use for proving Theorem~8.
\begin{proof}[Proof of Theorem~\ref{thm:delayed_rr_standard}]
    Let $P_0$ and $r_0$ be some arbitrary initial probability transition and reward function respectively.
    Denote by $(\tilde{P}_d, \tilde{r}_d)$ the transition
    probability and reward function after $k$
    repeated deployments of $d$.
    
    Note that $d_{i+1} = \GD(\tilde{P}_{d_i}, \tilde{r}_{d_i})$ and $d_S = \GD(P_S, r_S)$.

    Lemma~\ref{lem:GG_alphabeta} gives 
    \begin{align}
    \|d_{i+1}-d_S\|_2 &= \|\GD(\tilde{P}_{d_i}, \tilde{r}_{d_i}) - \GD(P_S, r_S)\|_2
        \leq \frac{\alpha}{\lambda}\|\tilde{r}_{d_i} - r_S\|_2
        + \frac{\beta}{\lambda}\|\tilde{P}_{d_i} - P_S\|_2\nonumber\\
        &\leq \frac{\phi}{\lambda}\dist((\tilde{P}_{d_i}, \tilde{r}_{d_i}),(P_S, r_S))
        \label{phi_dist_di_dS}
    \end{align}
    
    We can decompose 
    \begin{align}
        \dist((\tilde{P}_{d_i}, \tilde{r}_{d_i}), (P_S, r_S))\leq 
        \dist((\tilde{P}_{d_i}, \tilde{r}_{d_i}),(P_{d_i}, r_{d_i}))+
        \dist((P_{d_i}, r_{d_i}),(P_S, r_S))\label{Pr_tilde_Pr_S}
    \end{align}
    The first term of~\eqref{Pr_tilde_Pr_S} can be bounded by lemma~\ref{lem:Pr_d_Pr_tilde},
    the second term by lemma~\ref{lem:fixed_Pr_d}:
    \begin{align}
        \dist((\tilde{P}_{d_i}, \tilde{r}_{d_i}), (P_S, r_S))\leq 
        \frac{\iota}{1-\epsilon}\delta + 
        \frac{\iota}{1-\epsilon}\|d_i - d_S\|_2
        \label{Pr_tilde_Pr_S_better}
    \end{align}

    Using~\eqref{phi_dist_di_dS} and~\eqref{Pr_tilde_Pr_S_better} we get
    \begin{equation}
        \|d_{i+1}-d_S\|_2 \leq 
        \frac{\phi\iota}{\lambda(1-\epsilon)}\delta + 
        \frac{\phi\iota}{\lambda(1-\epsilon)}\|d_i - d_S\|_2
        \label{dist_di_dS_delta}
    \end{equation}
    We can apply lemma~\ref{lem:contraction-case-distinction} on $\{d_i\}_{i\in\mathbb{N}}$, since 
    $\frac{2\phi\iota}{\lambda(1-\epsilon)}<1$ holds due to the assumptions on $\lambda$.
    Lemma~\ref{lem:contraction-case-distinction} bounds the number of iterations until $d_i$ lies within distance $\delta$ of $d_S$, and the statement of the theorem follows from this bound.
\end{proof}

We now describe and prove the lemmas used in the proof of Theorem~\ref{thm:delayed_rr_standard}.
\begin{lemma}[similar to lemma~3 of~\cite{BHK20}]
    Suppose Assumption~\ref{assumption_sensitivity} holds.

    Let $d, d'\in D$ be arbitrary occupancy measures and let $P := P_{d}, r := r_{d}$
    (and respectively $P' := P_{d'}, r':=r_{d'}$) be the probability transition and reward functions to which the system
    asymptotically converges, if $d$ (respectively $d'$) is applied repeatedly.
    It holds that
    \begin{align}
        \|P - P'\|_2 + \|r - r'\|_2\leq \frac{\iota}{1-\max(\epsilon_p, \epsilon_r)}\|d-d'\|_2
    \end{align}
    \label{lem:fixed_Pr_d}
\end{lemma}
\begin{proof}
    Because of Assumption~\ref{assumption_sensitivity}, it holds that

    \begin{align*}
        &\|P - P'\|_2 + \|r - r'\|_2 \\
        &= \|\Pc(d, P, r)  - \Pc(d', P', r')\|_2
        + \|\Rc(d, P, r)  - \Rc(d', P', r')\|_2 \\
        &\leq \iota\|d-d'\|_2
        + \epsilon_{p} \|P-P'\|_2
        + \epsilon_{r}\|r-r'\|_2\\
        &\leq \iota\|d-d'\|_2
        + \max(\epsilon_{p},\epsilon_{r})( \|P-P'\|_2
        + \|r-r'\|_2)
    \end{align*}
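    where the equality holds because $(P, r)$ and $(P', r')$ are the long-term transition
    probabilities and reward functions for $d$ and $d'$ respectively.
    The first inequality holds because of Assumption~\ref{assumption_sensitivity}, and the second one bounds both $\epsilon_p$ and $\epsilon_r$ by their maximum.

    The statement of the lemma follows by moving the last term to the left-hand side and dividing by $1-\max(\epsilon_p, \epsilon_r)$, which requires $\max(\epsilon_p, \epsilon_r)<1$.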
\end{proof}

\begin{lemma}[similar to lemma~4 of~\cite{BHK20}]\label{lem:Pr_d_Pr_tilde}
    Assume Assumption~\ref{assumption_sensitivity} holds with 
    $\epsilon_{p}, \epsilon_{r}<1$.
    Given a policy $\pi$, denote by $(\tilde{P}_\pi, \tilde{r}_\pi)$ the transition
    probability and reward function after $k=\ln^{-1}(\frac{1}{\epsilon})\ln\left(\frac{\dist((P_0, r_0),(P_1, r_1))}{\nu}\right)$ deployments of $\pi$, for any initial
    probability transition function~$P_0$ and reward function~$r_0$. 
    It holds that
    \begin{equation*}
    \norm{P_\pi - \tilde{P}_\pi}_2+\norm{r_\pi - \tilde{r}_\pi}_2 \leq \frac{\nu}{1-\epsilon}\ .
    \end{equation*}
\end{lemma}
\begin{proof}
    Using Proposition~\ref{prop.contraction}, we see that
    \begin{align*}
        &\dist((\tilde{P}_\pi, \tilde{r}_\pi), (P_\pi, r_\pi)) (1-\epsilon)\leq
        \epsilon^{k} \dist((P_0, r_0),(P_\pi, r_\pi)) (1-\epsilon)\nonumber\\
        \leq& \epsilon^{k} \dist((P_0, r_0),(P_\pi, r_\pi))
            -\epsilon^{k} \dist((P_1, r_1),(P_\pi, r_\pi))\nonumber\\
        \leq& \epsilon^{k} \dist((P_0, r_0),(P_1, r_1)).
    \end{align*}
    Therefore 
    \begin{align}
        \dist((\tilde{P}_\pi, \tilde{r}_\pi), (P_\pi, r_\pi))\leq
        \frac{\epsilon^{k}}{1-\epsilon} \dist((P_0, r_0),(P_1, r_1)).
        \label{dist_Pr_tilde_Pr_pi}
    \end{align}
    Using $k\geq\ln^{-1}\left(\frac{1}{\epsilon}\right)\ln\left(\frac{\dist((P_0, r_0),(P_1, r_1))}{\nu}\right)$,
    we get $\epsilon^k \leq \frac{\nu}{\dist((P_0, r_0),(P_1, r_1))}$.
    If we insert this into~\eqref{dist_Pr_tilde_Pr_pi}, we get the desired bound.
\end{proof}

\subsection{DRR with Finite Samples (Theorem~\ref{thm:finite-samples-drr-standard-simple})}
\label{appdx.sec.drr-finite}
We show a more general version of Theorem~\ref{thm:finite-samples-drr-standard-simple}.
\begin{theorem}
    Let $d_i$ be computed by finite sample DRR with $k = 
    \ln^{-1}\left(\frac{1}{\epsilon}\right)\ln\left(\frac{5\distpr}{
    \delta \iota }\right)$.
    Suppose Assumption~\ref{assumption_sensitivity} holds and Assumption~\ref{assumption-offline-rl}  holds for $k$ and parameter $B$.
    Furthermore assume $\lambda>\max\left(5.76 \xi\mu, \xi\mu + \frac{\iota\phi }{(1-\epsilon)} 
    \left(1+\frac{1}{4.8\xi\mu}\right) \right),$
    with $\xi$ as defined above.
    %$\xi =\frac{36\sizeS^{1.5}(B+\sqrt{A})}{\delta^2(1-\gamma)^3}$   
    Furthermore assume that
    \begin{align*}
        m_i \geq \frac{1}{\mu^2} \Bigg(& \sizeA\ln\left(\frac{4i^2}{p}\right) +  \ln\left(\frac{12\sizeS}{(1-\gamma)^2\mu}\right)
        \\
        &+ 2 \sizeA \ln\left(\frac{\ln\left(\frac{3\sizeS^2\sizeA B}{(1-\gamma)^2\mu}\right)}{\mu}\right)\Bigg)\ .
    \end{align*}
    Then for any $\delta > 0$, we have
    \begin{equation*}
    \norm{d_i - d_S}_2 \leq \delta 
    \end{equation*}
    \begin{equation*}\quad\text{\quad for all } i\geq \frac{\ln\left(\frac{\norm{d_1-d_S}_2}{\delta}\right)}{ 
    \ln\left(\left( \sqrt{\frac{\xi\mu}{\lambda}} 
    + \frac{1.2\iota\phi }{\lambda(1-\epsilon)}\right)^{-1}
    \right)} + 1.
    \end{equation*}
    Here $\mu>0$ can be chosen arbitrarily and defines a trade-off between the
    conditions on the number
    of samples $m_i$ and on the regularization factor $\lambda$.
    % in the end (or in between) add: 'where $\epsilon \defeq \max(\epsilon_p, \epsilon_r)$.'
    \label{thm:finite-samples-drr-standard}
\end{theorem}
We first show how Theorem~\ref{thm:finite-samples-drr-standard-simple} follows from Theorem~\ref{thm:finite-samples-drr-standard}.
Assumption~\ref{assumption-simplicity} ensures that $\beta \geq \alpha$, $\epsilon_p = \epsilon_r = \epsilon$ and $\iota \leq \epsilon$.

For Theorem~\ref{thm:finite-samples-drr-standard-simple}, we use $\mu = \frac{\lambda}{10\xi}$.
We bound
\begin{align*}
    \lambda > \max\left(1,
   \frac{40\iota\beta}{9(1-\epsilon)},
   \frac{1.2\iota\beta}{(1-\epsilon)
   \left(\frac{\epsilon}{4}+\frac{3}{4} - \frac{1}{\sqrt{10}}\right)
   %\left(1+\frac{\epsilon}{2} - \frac{1}{\sqrt{10}}\right)
   }
   \right)
\end{align*}
We then derive the bound on $i$ in the following way.
For the denominator of the bound on $i$, we can then derive
$\sqrt{\frac{\xi\mu}{\lambda}} + \frac{1.2\iota\phi}{\lambda(1-\epsilon)}  =
    \frac{1}{\sqrt{10}} + \frac{1.2\iota\beta}{\lambda(1-\epsilon)}  \leq \frac{\epsilon+3}{4}$.
For the numerator of the bound on $i$, we use $\norm{d_1 - d_S}_2 \leq \frac{2}{1-\gamma}$.

The bound on the number of samples follows from the fact that 
    $\frac{1}{\mu^2} =\frac{100\xi^2}{\lambda^2}
    = {\mathcal{O}}\left(
    \frac{\sizeS^3(B+\sqrt{\sizeA})^2}{\delta^4(1-\gamma)^6\lambda^2} \right)$.
\begin{proof}[Proof of Theorem~\ref{thm:finite-samples-drr-standard}]
    In general, we bound
    \begin{align}
        \|d_{i+1} - d_S\|_2 \leq
        \underbrace{\|d_{i+1} - d_{i+1}^*\|_2}_{T_1} +
        \underbrace{\|d_{i+1}^* - d_S\|_2}_{T_2}
        \label{eq:delayed-finite-samples-decomp}
    \end{align}
    where $d_{i+1}^*$ is the occupancy measure optimizing 
 the exact Lagrangian after $k$ deployments of $\pi_{d_i}$, i.e. $d_{i+1}^* = \GD(P_{(i+1)\cdot k }, r_{(i+1)\cdot k })$.

    We can apply lemma~\ref{lem:bound-GD-hGD}, 
    since Assumption~\ref{assumption-offline-rl} holds.
    Let $F_t$ be the samples of round $t$.
    By setting $\delta_1 = p/2i^2$ we get with probability at least $1-p/2i^2$ in step $i$,
    \begin{align}
        T_1 = \norm{\hGD(d_i, F_{(i+1)\cdot k }) - \GD(P_{(i+1)\cdot k }, r_{(i+1)\cdot k })}_2 \leq 
        \frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} } 
        \frac{1}{\sqrt{\lambda}},
        \label{eq:bound-T1-finite-delayed}
    \end{align}
    if
    \begin{align*}
        \abs{F_{(i+1)\cdot k}}\ge \frac{1}{\mu^2} \left( \sizeA\ln(4i^2/p) +  \ln(12\sizeS/((1-\gamma)^2\mu)) 
        + 2 \sizeA \ln(\ln(3\sizeS^2\sizeA B/((1-\gamma)^2\mu)) /\mu)\right)
    \end{align*}
    By a union bound over all rounds, we get that \eqref{eq:bound-T1-finite-delayed} holds with
    probability $1-p$ for every $i\in\mathbb{N}$.
    
    To bound $T_2$, we can apply lemma~\ref{lem:combine_lems} with 
    parameter $\nu$, to get
    \begin{align*}
    T_2 =
    \norm{d_{i+1}^* - d_S}_2 \leq \frac{\phi\nu}{\lambda(1-\epsilon)} +
    \frac{\phi\iota}{\lambda(1-\epsilon)}\norm{d_i - d_S}_2\ .
    \end{align*}
    We determine $\nu$ later in the proof.

    Inserting those bounds on $T_1$ and $T_2$ into~\eqref{eq:delayed-finite-samples-decomp},
    we get
    \begin{align}
        \|d_{i+1} - d_S\|_2 &\leq 
        \frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} } 
        \frac{1}{\sqrt{\lambda}}
        + \frac{\phi\nu}{\lambda(1-\epsilon)} 
        + \frac{\phi \iota}{\lambda(1-\epsilon)}\norm{d_i - d_S}_2
        =x_1\delta + x_2 \norm{d_i - d_S}_2
        \label{eq:fin_samples_last_d_di+1dS_by_didS}
    \end{align}
    where we define
    $x_1 := \frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} \sqrt{\lambda}\delta} 
        + \frac{\phi\nu}{\lambda(1-\epsilon)\delta}$
        and $x_2 := \frac{\phi \iota}{\lambda(1-\epsilon)}$.
        
    Note that we can write $x_1+x_2$ as follows
    \begin{align}
    x_1+x_2 &= \frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} \sqrt{\lambda}\delta} 
        + 
        \frac{(\frac{\nu}{\delta} + \iota)\phi }{\lambda(1-\epsilon)}
        \label{eq:edrr_last_rnd_x1_x2}
    \end{align}
    We now derive conditions on $\lambda$ for when $x_1+x_2<1$, because then we can apply
    lemma~\ref{lem:contraction-case-distinction} to bound the iterations until which 
    the sequence of $\{d_i\}_{i\in\mathbb{N}_{\geq 1}}$ converges.
    To this end, we can apply lemma~\ref{lem:bound_sqrtx_a_b} with $x=\lambda$, 
    $a=\frac{6\sqrt{ \sizeS^{1.5}(B + \sqrt{\sizeA}) \mu }}{(1-\gamma)^{1.5} 
    \delta}$, $b=\frac{(\frac{\nu}{\delta} + \iota)\phi }{1-\epsilon}$ and $y=2.4$, 
    to get that $x_1 + x_2 < 1$ holds, if 
    \begin{equation*}\lambda>\max\left(5.76 a^2, a^2 + \frac{(\frac{\nu}{\delta} + \iota)\phi }{1.2(1-\epsilon)} 
    +\frac{(\frac{\nu}{\delta} + \iota)\phi }{2.4^2(1-\epsilon)a^2} \right)\ .\end{equation*}
    We get the bound for $\lambda$ stated in the Theorem by setting $\nu = 0.2\delta\iota$.
    
    Thus we can apply lemma~\ref{lem:contraction-case-distinction} on
    the sequence $\{d_i\}_{i\in \mathbb{N}_{\geq 1}}$, to see
    that if 
    $
    i\geq \ln\left(\frac{\norm{d_1-d_S}_2}{\delta}\right)/ \ln\left(1/(x_1 + x_2)\right)+1$,
    it holds that $\norm{d_i - d_S}_2 \leq \delta$.
    The Theorem then follows from substituting $x_1+x_2$ using 
    equation~\eqref{eq:edrr_last_rnd_x1_x2} and $\nu = 0.2\delta\iota$.
\end{proof}

For the proof, we used the following lemmas.
\begin{lemma}
    Let $a, b, x \geq 0$ and $y>0$ be arbitrary.
    If $x>\max(y^2 a^2, a^2 + \frac{2b}{y} + \frac{b}{y^2a^2})$,
    it holds that 
    \begin{align}
    1 > \frac{a}{\sqrt{x}} + \frac{b}{x}\ .
    \label{eq:sqrt_x_bound_lem}
    \end{align}
    \label{lem:bound_sqrtx_a_b}
\end{lemma}
\begin{proof}
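    When we multiply both sides of~\eqref{eq:sqrt_x_bound_lem} with $\sqrt{x}$ and square 
    the resulting term, we see
    that~\eqref{eq:sqrt_x_bound_lem} is equivalent to
    % NOTE(review): squaring $\sqrt{x} > a + b/\sqrt{x}$ gives $b^2/x$ as the last term, not $b/x$.
    % The lemma statement and the conditions on \lambda derived from it should be re-checked.
    \begin{align}\label{eq.xa22absqrtx}
        x > a^2 + 2\frac{ab}{\sqrt{x}} + \frac{b}{x}\ .
    \end{align}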
    If we assume that $x>y^2a^2$, we get
    \begin{equation*}
        a^2 + 2\frac{ab}{\sqrt{x}} + \frac{b}{x} < a^2 + \frac{2b}{y} + \frac{b}{y^2a^2}<x\ .
    \end{equation*}
    This shows that equation~\eqref{eq.xa22absqrtx} and thus also equation~\eqref{eq:sqrt_x_bound_lem} hold.
\end{proof}
\begin{lemma}
    Suppose Assumption~\ref{assumption_sensitivity} holds with $\iota_d,\epsilon_p, 
    \epsilon_r<1$.
    Let $d$ be some occupancy measure and $P_0, r_0$ be some initial probability transition and reward
    functions.
    Let $P_t, r_t$ be the probability transition and reward function after $t>0$ deployments
    of $\pi_d$.
    Then for $d' = \GD(P_k, r_k)$ with $k=\ln^{-1}\left(\frac{1}{\epsilon}\right)\ln\left(\frac{\dist((P_0, r_0),(P_1, r_1))}{\nu}\right)$, it holds that
    \begin{equation*}
    \norm{d' - d_S}_2 \leq \frac{\phi\nu}{\lambda(1-\epsilon)} +
    \frac{\phi\iota}{\lambda(1-\epsilon)}\norm{d - d_S}_2\ ,
    \end{equation*}
    where $\phi = \max(\alpha, \beta)$ and $\alpha,\beta$ from Definition~\ref{def.alphaBeta}.
    \label{lem:combine_lems}
\end{lemma}
\begin{proof}
Note that 
\begin{align}
\norm{d' - d_S}_2 \leq \norm{d' - \GD(P_d, r_d)}_2 + \norm{\GD(P_d, r_d) - d_S}_2
\label{eq:bound_dprime_dS}
\end{align}
Using lemmas~\ref{lem:GG_alphabeta} and~\ref{lem:Pr_d_Pr_tilde}, we can bound
\begin{align}
\norm{d' - \GD(P_d, r_d)}_2 \leq \frac{\phi}{\lambda}\dist((P_k, r_k), (P_d, r_d))
\leq \frac{\phi\nu}{\lambda(1-\epsilon)}\ .
\label{eq:bound_dprime_GDPdrd}
\end{align}
Furthermore, by lemmas~\ref{lem:GG_alphabeta} and~\ref{lem:fixed_Pr_d} we see that 
\begin{align}
\norm{\GD(P_d, r_d) - d_S}_2 \leq \frac{\phi}{\lambda}\dist((P_d, r_d), (P_S, r_S))
\leq \frac{\phi\iota}{\lambda(1-\epsilon)}\norm{d - d_S}_2\ .
\label{eq:bound_GDPdrd_dS}
\end{align}
Inserting~\eqref{eq:bound_dprime_GDPdrd} and~\eqref{eq:bound_GDPdrd_dS} 
into~\eqref{eq:bound_dprime_dS} gives the desired bound.
\end{proof}

\section{Proof for MDRR (Theorem~\ref{thm:edrr-mixed-response-simple})}
\label{appdx.sec.mdrr}
\subsection{Preparations for the Proof}
For our derivations, we need an exact version of the empirical Lagrangian~\eqref{eq:lagrangian-finite-mdrr}.
To this end, consider the following optimization problem, 
which works with multiple reward and probability transition functions from different rounds.
\begin{align}
    \label{eq:optimization-mixed}
        \max_{d\ge 0}\ &  \sum_{s,a} d(s,a) \bar{r}_i(s,a) - \frac{\lambda}{2}\norm{d}_2^2\\
       \textrm{s.t. } & \sum_a d(s,a) = \rho(s) + \gamma \cdot \sum_{s',a} d(s',a) \bar{P}_i(s',a,s)\ \forall s\nonumber
\end{align}
where we define
$\bar{r}_i := \sum_{\subround =1}^{k} \frac{m_{ik+\subround }}{\numsam_i} r_{ik+\subround }$ and
$\bar{P}_i := \sum_{\subround =1}^{k} \frac{m_{ik+\subround }}{\numsam_i} P_{ik+\subround }$ where $m_{ik+\subround }\geq 0$ is arbitrary 
and $\numsam_i = \sum_{\subround =1}^{k} m_{ik+\subround }$.
Equation~\eqref{eq:optimization-mixed} defines an objective for a mixture of probability transition
and reward functions of the rounds in which the learner repeatedly deployed $\pi_{d_i}$.
Each reward and probability transition is weighted by a weight $\frac{m_{ik+\subround }}{\numsam_i}$.
This optimization problem does not use finite samples, but the true reward
and probability transition functions. 

We can now show that the Lagrangian of~\eqref{eq:optimization-mixed} looks 
similar to the empirical Lagrangian~\eqref{eq:lagrangian-finite-mdrr} of MDRR.
\begin{align}
    &\calL^M(d, h, i) = d^\top \bar{r}_i- \frac{\lambda}{2} \norm{d}_2^2 
    + \sum_s h(s) \bigg(
    %\nonumber \\&
    -\sum_a d(s,a) + \rho(s) 
    + \gamma \cdot \sum_{s',a} d(s',a) \bar{P}_i(s|s',a)
     \bigg)
     \nonumber\\&
    = d^\top \sum_{\subround =1}^{k} \frac{m_{ik+\subround }}{\numsam_i}r_{ik+\subround } - \frac{\lambda}{2} \norm{d}_2^2 
    + \sum_s h(s) \bigg(
    %\nonumber\\&
    -\sum_a d(s,a) + \rho(s) 
    + \gamma \cdot \sum_{s',a} d(s',a) \sum_{\subround =1}^{k} \frac{m_{ik+\subround }}{\numsam_i}{P}_{i\cdot k+\subround }(s|s',a)
     \bigg)
     \nonumber\\&
    = - \frac{\lambda}{2} \norm{d}_2^2 
    + \sum_s h(s) \rho(s) + \sum_{\subround =1}^{k} \sum_{s,a}\frac{m_{ik+\subround }}{\numsam_i} d(s,a) \bigg(
    %\nonumber \\&
    r_{ik+\subround }(s,a) - h(s) + \gamma\sum_{s'}P_{i\cdot{}k+\subround }(s'|s,a)h(s')
    \bigg)\nonumber
    \\
    \begin{split}
    &= - \frac{\lambda}{2} \norm{d}_2^2 
    + \sum_s h(s) \rho(s) 
    % \\&
    + \sum_{\subround =1}^{k} \sum_{s,a} \bar{d}_{ik+\subround }(s,a) \frac{m_{ik+\subround }}{\numsam_i}\frac{d(s,a)}{\bar{d}_{ik+\subround }(s,a)} 
    \bigg(
    % \\&
    r_{ik+\subround }(s,a) - h(s) + \gamma\sum_{s'}P_{i\cdot{}k+\subround }(s'|s,a)h(s')
    \bigg)\label{eq:LD_formulation_bar_d}
    \end{split}
\end{align}
We then show a kind of closeness of $\calL^M$ and $\hat{\calL}^M$ in the following lemma.
The lemma is a more general version of lemma~10 from \cite{MTR23}.
The proof ideas follow theirs.
\begin{lemma}\label{lem:emperical-lagrangian-drr}
    Suppose we are given an occupancy measure $d$ with $\max_{s,a} d(s,a) / \bar{d}_{ik+\subround }(s,a) \leq B$ for all 
    $\subround \in [k]$, a vector $h$ with $\norm{h}_2\leq H$,
    and $m_{ik+\subround } = w_\subround  \numsam_i$ with 
    $\numsam_i \geq \frac{1}{\eta^2}
    \left(\sizeA\ln\left( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\right) +\ln\left(1+\frac{2H}{\eta}\right)
    +\frac{\ln\left(2/\delta_1\right)}{\sizeS}
    \right)$. Furthermore assume $w_\subround \geq 0$ and $\sum_{\subround =1}^{k} w_\subround =1$.
    Then the following
    bound holds with probability at least $1-\delta_1$.
    
    \begin{align*}
    &\abs{\hat{\calL}^M(d,h; i) - \calL^M(d,h; i)} \le 
    \frac{6(H+1)\sqrt{\sizeS}(B+\sqrt{\sizeA})\eta}{1-\gamma}
    \end{align*}
    for any $\eta>0$.
\end{lemma}
\begin{proof}
    To simplify notation in this proof, we drop the `$i\cdot k$' in the subscript and only use $\subround $, since we always consider the same iteration $i$.

    Note that $\frac{m_\subround }{\numsam_i} = w_\subround $.

    We see that the expected value of the $\hat{\calL}^M$ equals $\calL^M$ as follows
    \begin{align*}
        &\E[\hat{\calL}^M(d, h, i)] \\
        =& - \frac{\lambda}{2} \norm{d}_2^2 
    + \sum_s h(s) \rho(s) + \sum_{\subround =1}^{k} \sum_{l=1}^{\abs{F_\subround }}\frac{1}{\abs{F_\subround }}w_\subround \E_{(s,a,s')\sim M^\subround }\left[
    \frac{d(s,a)}{\bar{d}_\subround (s,a)}\cdot\frac{r_\subround (s,a) - h(s) + \gamma h(s')}{1-\gamma}
    \right]\\
    =& 
    - \frac{\lambda}{2} \norm{d}_2^2 
    + \sum_s h(s) \rho(s) + \sum_{\subround =1}^{k} \sum_{s,a} \bar{d}_\subround (s,a) w_\subround \frac{d(s,a)}{\bar{d}_\subround (s,a)} 
    \left(r_\subround (s,a) - h(s) + \gamma\sum_{s'} P_\subround (s'|s,a)h(s')
    \right)\\
    =& \calL^M(d,h,i)
    \end{align*}
    where we use the notation $(s,a,s')\sim M^\subround $ to indicate that the tuple $(s,a,s')$ is distributed via the MDP in round $\subround $
    of this iteration.
    
    By the assumptions of this lemma, we see that
    \begin{align*}
        \abs{\frac{1}{1-\gamma}\frac{d(s, a)}{\bar{d}_\subround (s,a)}(r_\subround (s,a) - h(s) + \gamma 
        h(s'))}\leq \frac{B(H(1+\gamma)+1)}{1-\gamma}
    \end{align*}   
    By this, we can apply Hoeffding's inequality to get
    \begin{align*}
        \P\left(\abs{\hat\calL^M(d,h,i) - \calL^M(d,h,i)}\geq \frac{2B(H(1+\gamma)+1)}{1-\gamma}
        \sqrt{\sum_{\subround =1}^{k}\frac{w_\subround^2}{m_\subround } \frac{\ln(2/\delta_1)}{2}}
        \right)\leq \delta_1
    \end{align*}
    We now extend this bound to any occupancy measure $d$ and 
    $h\in \mathcal{H} = \{h:\norm{h}_2\leq H\}$.
    In order to do this, we first construct an $\eta$-net for the set of possible $h$s, 
    $\mathcal{H} := \{h\in\mathbb{R}^{\sizeS}:\norm{h}_2\leq H\}$ and for the set of possible occupancy measures
    $\mathcal{D}$ which formally equals $\mathcal{D} = \left\{d: \frac{d(s,a)}{\bar{d}_\subround (s,a)}\leq B\ \ \text{for all}\ (s,a,\subround )
    \in \sizeS\times \sizeA\times [k]\right\}$.
    
    For $\mathcal{H}$, we can use lemma~5.2 from~\cite{Ver10} to get a set $\mathcal{H}_\eta$ of size
    at most $\left(1+\frac{2H}{\eta}\right)^{\sizeS}$, such that for all
    $h\in \mathcal{H}$, there exists an $h_\eta\in \mathcal{H}_\eta$ for which it holds that 
    $\norm{h-h_\eta}_2\leq \eta$.
    
    For $\mathcal{D}$ we choose a multiplicative $\eta$-net as follows.
    For each pair $(s,a)$ we choose grid points $\bar{d}_\subround (s,a)$, $(1+\eta)\bar{d}_\subround (s,a)$, \dots, 
    $(1+\eta)^p \bar{d}_\subround (s,a)$ with $p=\frac{\ln(B/\bar{d}_\subround (s,a))}{\ln(1+\eta)}$.
    Note that $\bar{d}_\subround $ could be arbitrarily small, but without loss of generality, we can assume that 
    $\bar{d}_\subround (s,a)\geq \frac{\eta}{4\sizeS\sizeA BH}$. This is because if we ignore all $(s,a,\subround )$ tuples 
    with $\bar{d}_\subround (s,a) < \frac{\eta}{4\sizeS\sizeA BH}$ in the sum in the second line of~\eqref{eq:LD_formulation_bar_d}, 
    the error we introduce to $\calL^M$ is at most $\eta/4$.
    Using this insight, we can thus choose $p=\frac{2\ln(\sizeS\sizeA BH/\eta)}{\ln(1+\eta)}$.
    So we can choose an $\eta$-net $\mathcal{D}_\eta$ of size at most $\left(\frac{2 \ln(\sizeS\sizeA BH/\eta) }{\ln(1+\eta)}\right)^{\sizeS\sizeA} \le \left( \frac{2 \ln(\sizeS\sizeA BH/\eta) }{\eta}\right)^{\sizeS\sizeA}$, such that
    for every $d\in\mathcal{D}$, there exists an $\tilde{d}\in \mathcal{D}_\eta$ such that $\frac{d(s,a)}{\tilde{d}(s,a)}\leq B$.
    
    With a union bound over the elements of $\mathcal{H}_\eta$ and $\mathcal{D}_\eta$, we have that 
    for all $d\in\calD_\eta$ and $h\in\calH_\eta$, 
    %with probability at least $1-\delta_1$, 
    \begin{align}
    \begin{split}
    &\P\Bigg(\abs{\hat\calL^M(d,h,i) - \calL^M(d,h,i)}\\
    &\geq \frac{B(H(1+\gamma)+1)}{1-\gamma}
    \sqrt{\sum_{\subround =1}^{k}\frac{w_\subround^2}{m_\subround } \left(\sizeS\sizeA\ln\left( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\right) +\sizeS\ln\left(1+\frac{2H}{\eta}\right)
    +\ln\left(\frac{2}{\delta_1}\right)\right) }
    \Bigg)\leq \delta_1
    \label{eq:eps-net-bound-lagrangian}
    \end{split}
    \end{align}
    
    We next extend this bound to all elements in $\calD$ and $\calH$.
    For every $d\in \calD$ and $h\in\calH$ there exists $\tilde{d}\in\calD_\eta$ and 
    $\tilde{h}\in\calH_\eta$ such that $\max_{s,a}d(s,a)/\tilde{d}(s,a)\leq \eta$
    and $\norm{h-\tilde{h}}_2\leq \eta$.
    Let $\calL^M_0(d,h;i) =\calL^M(d,h; i) +\frac{\lambda}{2} \norm{d}_2^2 - \sum_s h(s) \rho(s)$ 
    and $\hat\calL_0^M(d,h;i)$ analogously.

    Then
    \begin{align}
    \begin{split}
    &\abs{\hat{\calL}^M(d,h; i) - \calL^M(d,h; i)} \le \abs{\hat{\calL}_0^M(d,h; i) - \hat{\calL}^M_0(\tilde{d},\tilde{h}; i)}\\
    &+ \abs{\hat{\calL}^M(\tilde{d},\tilde{h}; i) - \calL^M(\tilde{d},\tilde{h}; i)} + \abs{{\calL}_0^M(\tilde{d},\tilde{h}; i) - {\calL}_0^M({d},{h}; i)}
    \label{eq:bound_estimation_lagrangian}
    \end{split}
    \end{align}
    Using lemma~11 from~\cite{MTR23} we can bound
    \begin{align*}
        \abs{\hat{\calL}^M_0(d,h; i) - \hat{\calL}^M_0(\tilde{d},\tilde{h}; i)}=&
    \sum_{\subround =1}^{k} w_\subround \Bigg|\Bigg(\sum_{(s,a,r,s')\in F_\subround }
    \frac{d(s,a)}{\bar{d}_\subround (s,a)}
    \frac{r - h(s) + \gamma h(s')}{m_\subround (1-\gamma)}\\
    &-\sum_{(s,a,r,s')\in F_\subround }
    \frac{\tilde{d}(s,a)}{\bar{d}_\subround (s,a)}
    \frac{r - \tilde{h}(s) + \gamma \tilde{h}(s')}{m_\subround (1-\gamma)}\Bigg)\Bigg|\\
    \leq& \frac{4BH\sqrt{\sizeS} \eta}{1-\gamma}
    \end{align*}
    and
    \begin{align*}
    \abs{{\calL}_0^M(d,h; i) - {\calL}_0^M(\tilde{d},\tilde{h}; i)}
    =& \Bigg|\sum_{s,a} d(s,a) 
        \Bigg(\underbrace{\sum_{\subround =1}^{k} w_\subround  r_\subround (s,a)}_{=\bar{r}(s,a)} - h(s) 
        + \gamma\sum_{s'}h(s')\underbrace{\sum_{\subround =1}^{k} w_\subround  P_\subround (s'|s,a)}_{=\bar{P}(s'|s,a)}
        \Bigg)\\
        &-\sum_{s,a} \tilde{d}(s,a) 
        \Bigg(\underbrace{\sum_{\subround =1}^{k} w_\subround  r_\subround (s,a)}_{=\bar{r}(s,a)} - \tilde{h}(s) 
        + \gamma\sum_{s'}\tilde{h}(s')\underbrace{\sum_{\subround =1}^{k} w_\subround  P_\subround (s'|s,a)}_{=\bar{P}(s'|s,a)}
        \Bigg)\Bigg|\\
        \leq& \frac{6 \sqrt{\sizeS\sizeA} H \eta}{1-\gamma}\ .
    \end{align*}
    Inserting these bounds and the bound from~\eqref{eq:eps-net-bound-lagrangian} 
    into~\eqref{eq:bound_estimation_lagrangian}, we get
    \begin{align*}
    &\abs{\hat{\calL}^M(d,h; i) - \calL^M(d,h; i)} \le \\
    &\frac{B(H(1+\gamma)+1)}{1-\gamma}
    \sqrt{\sum_{\subround =1}^{k}\frac{w_\subround^2}{m_\subround } \left(\sizeS\sizeA\ln\left( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\right) +\sizeS\ln\left(1+\frac{2H}{\eta}\right)
    +\ln\left(\frac{2}{\delta_1}\right)\right) }\\
    &+\frac{4BH\sqrt{\sizeS} \eta}{1-\gamma}
    +\frac{6 \sqrt{\sizeS\sizeA} H \eta}{1-\gamma}
    \end{align*}
    In particular, if we use $m_{ik+\subround }=\numsam_i w_\subround $, we get
    \begin{align*}
    &\abs{\hat{\calL}^M(d,h; i) - \calL^M(d,h; i)} \le \\
    &\frac{2B(H+1)}{1-\gamma}
    \sqrt{\frac{1}{\numsam_i}\left(\sizeS\sizeA\ln\left( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\right) +\sizeS\ln\left(1+\frac{2H}{\eta}\right)
    +\ln\left(\frac{2}{\delta_1}\right)\right)} \\
    &+\frac{4BH\sqrt{\sizeS} \eta}{1-\gamma}
    +\frac{6 \sqrt{\sizeS\sizeA} H \eta}{1-\gamma}   
    \end{align*}
    If we now choose $\numsam_i \geq \frac{1}{\eta^2}
    \left(\sizeA\ln\left( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\right) +\ln\left(1+\frac{2H}{\eta}\right)
    +\frac{\ln\left(2/\delta_1\right)}{\sizeS}
    \right)$, we get 
    \begin{align*}
    &\abs{\hat{\calL}^M(d,h; i) - \calL^M(d,h; i)} \le 
    \frac{6(H+1)\sqrt{\sizeS}(B+\sqrt{\sizeA})\eta}{1-\gamma}\ .
    \end{align*}
\end{proof}

We need some further definitions and then go on to show the theorem on MDRR.

\begin{definition}
    We define $\MR_{\boldsymbol{w}}^k\left(d_i, P_{i\cdot k}, r_{i\cdot k}\right)$ to be the solution to~\eqref{eq:optimization-mixed}.

     Furthermore we define $\what{\MR}_{\boldsymbol{w}}^k\left(d_i, P_{i\cdot k}, r_{i\cdot k}\right)$ to be the occupancy measure $d$ optimizing the empirical Lagrangian for MDRR, i.e.
    \begin{align}\label{eq:mixed-response-lagrangian-optimization}
        \max_d \min_h \hat{\calL}^M(d,h,i)\ .
    \end{align}
    
    After deploying $\pi_{d_i}$ for $k$ rounds, the learner updates its occupancy measure by
    $d_{i+1} =\what{\MR}_{\boldsymbol{w}}^k\left(d_i, P_{i\cdot k}, r_{i\cdot k}\right)$.
\end{definition}

\subsection{Formal Statement and Proof of Theorem~\ref{thm:edrr-mixed-response-simple} (MDRR)}
\label{appdx.sec.mdrr-theorem-sec}
We now show a more general version of Theorem~\ref{thm:edrr-mixed-response-simple}.
\begin{theorem}
    Let $d_i$ be computed by MDRR with
    $k\geq 
    \frac{\ln\left(\frac{\epsilon(v-1)}{v\epsilon-1}\right)+\ln\left(
    \frac{5(1-\epsilon)\distpr}{\iota\delta}\right)
    }{\ln\left(1/\epsilon\right)}$.
    Suppose Assumption~\ref{assumption_sensitivity} holds and Assumption~\ref{assumption-offline-rl}  holds for $k$ and parameter $B$.
    Furthermore assume that $\lambda > \max\left(6.08 \xi\eta, \frac{19}{18}\xi\eta + 
    \frac{\phi\iota}{1-\epsilon}\left(1+\frac{1}{5.0\bar{6}%4.8\frac{19}{18}
    \xi\eta}\right)
    \right)$ with 
    $\xi$ being defined as above.

    Further let
    $\numsam_i \geq \frac{1}{\eta^2}
    \bigg(\sizeA\ln\bigg( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\bigg) 
    +\ln\bigg(1+\frac{2H}{\eta}\bigg)
    +\frac{\ln(4i^2/p)}{\sizeS}
    \bigg)$
    be the total number of samples in round $i$,
    where the number of samples is given by $m_{ik+\subround } = w_\subround  \numsam_i$ 
    with  $w_\subround  = \frac{v-1}{v^k-1}v^{\subround -1}$ and $v>\frac{1}{\epsilon}$.
    Then for any $\delta > 0$ and $p>0$, with probability 
    at least $1-p$, 
    \begin{equation*}
    \norm{d_i - d_S}_2 \leq \delta\end{equation*} 
    \begin{equation*} \text{\quad for all } i \geq \frac{\ln(\norm{d_1 - d_S}_2/\delta)}{\ln\left(1/\left(
    \sqrt{\frac{19\xi\eta}{18\lambda}}
    +\frac{1.2\phi\iota}{\lambda(1-\epsilon)}\right)\right)} +1\ .
    \end{equation*}
    Here $\eta>0$ and $v>\frac{1}{\epsilon}$ can be chosen arbitrarily.
    \label{thm:edrr-mixed-response}
\end{theorem}
The parameter $\eta>0$ defines a trade-off between the number of samples, number of iterations and the conditions on $\lambda$.
The parameter $v > \frac{1}{\epsilon}$ defines a trade-off
between the number of deployments per retraining and the required number of samples per deployment.

We first explain how Theorem~\ref{thm:edrr-mixed-response-simple} follows from Theorem~\ref{thm:edrr-mixed-response}.
Assumption~\ref{assumption-simplicity} ensures that $\beta \geq \alpha$, $\epsilon_p = \epsilon_r = \epsilon$, and $\iota \leq \epsilon$.
For Theorem~\ref{thm:edrr-mixed-response-simple}, we use
$\eta = \frac{\lambda}{10\xi}$ and $\lambda > \max\left( 1, 3.6 \frac{\beta\iota}{1-\epsilon},
\frac{1.2\beta\iota}{(1-\epsilon)\left(\frac{\epsilon+3}{4}-\sqrt{\frac{19}{180}}\right)}
\right)$.
We now bound
    $\sqrt{\frac{19\xi\eta}{18\lambda}} + \frac{1.2\phi\iota}{\lambda(1-\epsilon)}$
    in order to bound the number of retrainings $i$. We see that 
\begin{align*}
    \sqrt{\frac{19\xi\eta}{18\lambda}} + \frac{1.2\beta\iota}{\lambda(1-\epsilon)} = 
    \sqrt{\frac{19}{180}} + \frac{1.2\beta\iota}{\lambda(1-\epsilon)} < \frac{\epsilon+3}{4}\ ,
\end{align*}
where in the inequality, we use $\lambda> 
\frac{1.2\beta\iota}{(1-\epsilon)\left(\frac{\epsilon+3}{4}-\sqrt{\frac{19}{180}}\right)}$.

Inserting the bounds on $\lambda$ and $\eta$, we get the results described in Theorem~\ref{thm:edrr-mixed-response-simple}.

\begin{proof}[Proof of Theorem~\ref{thm:edrr-mixed-response}]
In general, we bound
\begin{align*}
    \norm{d_{i+1} - d_S}_2 \leq &
    \underbrace{\norm{\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}) - \MR_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik})}_2}_{T_1}\\
    &+ \underbrace{\norm{\MR_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}) -\GD(P_{d_i}, r_{d_i})}_2}_{T_2}
    + \underbrace{\norm{\GD(P_{d_i}, r_{d_i}) - d_S}_2}_{T_3}
\end{align*}
where $d_S$ is some stable occupancy measure.

We begin by bounding $T_1$.
For this we argue similarly to the proof of Theorem~3 in \cite{MTR23}.

Let $\hat{h}_{i+1}$ be the dual solution to $\hat{\calL}^M$ corresponding to $\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik})$, i.e.,
\begin{equation*}
(\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}), \hat{h}_{i+1}) = 
\argmax_d \argmin_h \hat{\calL}^M(d,h;i)
\end{equation*}

By strong duality, there has to exist an $h_{i+1}$ such that 
\begin{equation*}
(\MR_{\boldsymbol{w}}^k(d_i, P_{ik}, r_{ik}), h_{i+1}) = \argmax_d\argmin_h \calL^M(d,h;i)
\end{equation*}

Using lemma~4 of \cite{MTR23}, we can bound the $L_2$-norms of the dual solutions $\hat{h}_{i+1}$ and
$h_{i+1}$
by $\frac{3\sizeS}{(1-\gamma)^2}$.
We can thus consider the restricted set $\calH = \left\{h:\norm{h}_2\leq \frac{3\sizeS}{(1-\gamma)^2}\right\}$.
Then because Assumption~\ref{assumption-offline-rl} holds, we can apply lemma~\ref{lem:emperical-lagrangian-drr}
with $\delta_1 = p/2i^2$ and $H = 3\sizeS/(1-\gamma)^2$ to get,
\begin{align}
    \abs{\hat\calL^M(d_{i+1},h_{i+1};i) - \calL^M(d_{i+1},h_{i+1};i)} \leq
    \frac{19\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}{(1-\gamma)^3}
    \label{eq:bound-emperical-lagrangian-drr}
\end{align}
if
\begin{equation*}
    \numsam_i \geq \frac{1}{\eta^2}
    \left(\sizeA\ln\left( \frac{2 
    \ln(\sizeS\sizeA BH/\eta) }{\eta}\right) +\ln\left(1+\frac{2H}{\eta}\right)
    +\frac{\ln\left(4i^2/p\right)}{\sizeS}
    \right)\ .
\end{equation*}
Note that event \eqref{eq:bound-emperical-lagrangian-drr} holds with probability at least $1-\frac{p}{2i^2}$.
By a union bound over all rounds, the event holds with probability at least $1-p$ for all rounds.

The objective $\calL^M(\cdot, h_{i+1}, i)$ is $\lambda$-strongly concave.
Therefore, we have
\begin{align*}
&\calL^M(\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}), h_{i+1}; i) - 
\calL^M(\MR_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}), h_{i+1}; i) \\
\leq &
-\frac{\lambda}{2}\norm{\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}) - 
\MR_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik})}_2^2
\end{align*}
We therefore find by rearranging and using lemma~12 from \cite{MTR23},
\begin{align*}
    &T_1 = \norm{\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}) - 
\MR_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik})}_2\\
\leq& \sqrt{
\frac{2\left(\calL^M({\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}), h_{i+1}; i) - 
\calL^M(\what{\MR}_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}), h_{i+1}; i)
\right)}{\lambda}}\\
\leq& \frac{\sqrt{38\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}}{(1-\gamma)^{1.5}}\frac{1}{\sqrt{\lambda}}
\end{align*}

We now bound $T_2$ using lemma~\ref{lem:GG_alphabeta}
\begin{align*}
    T_2 &= \norm{\MR_{\boldsymbol{w}}^k(d_{i}, P_{ik}, r_{ik}) -\GD(P_{d_i}, r_{d_i})}_2
     = \norm{\GD(\bar{P}_i, \bar{r}_i) - \GD(P_{d_i}, r_{d_i})}_2 \\
     &\leq 
     \frac{\phi}{\lambda}\dist((\bar{P}_i, \bar{r}_i), (P_{d_i}, r_{d_i}))
\end{align*}
with $\phi = \max(\alpha, \beta)$, with $\alpha$ and $\beta$ from Definition~\ref{def.alphaBeta}.

We can further bound this using lemma~\ref{lem:barPr_Pr_di_bound}.
\begin{align*}
     \frac{\phi}{\lambda}\dist((\bar{P}_i, \bar{r}_i), (P_{d_i}, r_{d_i}))
     \leq 
     \frac{\phi}{\lambda}
     \frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
             v^{k}(v\epsilon-1)-v\epsilon+1}
        \dist((P_{ik}, r_{ik}),(P_{ik+1}, r_{ik+1}))
\end{align*}

We can now bound $T_3$ using lemmas~\ref{lem:GG_alphabeta} and~\ref{lem:fixed_Pr_d}.
\begin{align*}
    T_3 = \norm{\GD(P_{d_i}, r_{d_i}) - d_S}_2 \leq 
    \frac{\phi}{\lambda}\dist((P_{d_i},r_{d_i}), (P_S, r_S))
    \leq 
    \frac{\phi\iota}{\lambda(1-\epsilon)}\|d_i-d_S\|_2
\end{align*}

In total we get
\begin{align*}
    &\norm{d_{i+1} - d_S}_2 \leq \frac{\sqrt{38\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}}{(1-\gamma)^{1.5}}
    \frac{1}{\sqrt{\lambda}}
    +\frac{\phi}{\lambda}\dist((\bar{P}_i, \bar{r}_i), (P_{d_i}, r_{d_i}))
    + \frac{\phi\iota}{\lambda(1-\epsilon)}\norm{d_i - d_S}_2
    \\ \leq &
    % T_1
    \frac{\sqrt{38\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}}{(1-\gamma)^{1.5}}\frac{1}{\sqrt{\lambda}}
    % T_2
    +\frac{\phi}{\lambda}
     \frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
             v^{k}(v\epsilon-1)-v\epsilon+1}
        \dist((P_{ik}, r_{ik}),(P_{ik+1}, r_{ik+1}))
    % T_3
    + \frac{\phi\iota}{\lambda(1-\epsilon)}\norm{d_i - d_S}_2
    \\ = &
    x_1 \delta
    +x_2 \norm{d_i - d_S}_2
\end{align*}

where we define $x_1 = \frac{\sqrt{38\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}}{(1-\gamma)^{1.5}\sqrt{\lambda}\delta}+
\frac{\phi}{\lambda\delta} \frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
v^{k}(v\epsilon-1)-v\epsilon+1} \dist((P_{ik}, r_{ik}),(P_{ik+1}, r_{ik+1}))$
and $x_2 = \frac{\phi\iota}{\lambda(1-\epsilon)}$.

We now prove that after a certain number of update iterations $i$, the occupancy measure $d_i$ is in a 
$\delta$ radius around a stable occupancy measure $d_S$.
For this we can apply lemma~\ref{lem:contraction-case-distinction}, if we know that $x_1 + x_2 < 1$.

So we first derive criteria under which $x_1+x_2<1$ holds.

From the conditions of the Theorem it follows that $v\epsilon > 1$.
Using this we can derive that for any $z>0$, if 
$k>\frac{\ln\left(\frac{\epsilon(v-1)}{v\epsilon-1}\right)+\ln\left(1/z\right)
}{\ln\left(1/\epsilon\right)}$, then $\frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
v^{k}(v\epsilon-1)-v\epsilon+1} < z$.

We now bound
\begin{align*}
\frac{\phi}{\lambda\delta} \frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
v^{k}(v\epsilon-1)-v\epsilon+1} \dist((P_{ik}, r_{ik}),(P_{ik+1}, r_{ik+1})) \leq
\frac{0.2 \phi\iota}{\lambda(1-\epsilon)},
\end{align*}
which holds if 
\begin{equation*}
k\geq 
\frac{\ln\left(\frac{\epsilon(v-1)}{v\epsilon-1}\right)+\ln\left(
\frac{5(1-\epsilon)\dist((P_{ik}, r_{ik}),(P_{ik+1}, r_{ik+1}))}{\iota\delta}\right)
}{\ln\left(1/\epsilon\right)}\ .
\end{equation*}
We then have that 
\begin{equation*}
x_1+ x_2 \leq \frac{\sqrt{38\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}}{(1-\gamma)^{1.5}\sqrt{\lambda}\delta}
+\frac{1.2\phi\iota}{\lambda(1-\epsilon)}.
\end{equation*}
Using lemma~\ref{lem:bound_sqrtx_a_b}, with $x=\lambda$, 
$a=\frac{\sqrt{38\sizeS^{1.5}(B+\sqrt{\sizeA})\eta}}{(1-\gamma)^{1.5}\delta}$, $b=\frac{1.2\phi\iota}{1-\epsilon}$
and $y=2.4$ we get that
if $\lambda > \max\left(5.76 a^2, a^2 + \frac{\phi\iota}{1-\epsilon}\left(1+\frac{1}{4.8a^2}\right)
\right)$, then $1 > x_1 + x_2$.

We can then apply lemma~\ref{lem:contraction-case-distinction} to see that
for $i\geq \frac{\ln(\norm{d_1 - d_S}_2/\delta)}{\ln(1/(x_1+x_2))}+1$, it holds that $\norm{d_i - d_S}_2\leq \delta $.
\end{proof}
In the proof of Theorem~\ref{thm:edrr-mixed-response} we use the following lemma.
\begin{lemma}\label{lem:barPr_Pr_di_bound}
    If Assumption~\ref{assumption_sensitivity} holds with $\epsilon_p, \epsilon_r<1$, then if $\frac{m_{ik+t}}{\numsam_i} = \frac{v-1}{v^{k}-1} v^{t-1}$, it holds that
    \begin{equation*}
    \dist((\bar{P}_i,\bar{r}_i), (P_{d_i},  r_{d_i}))\leq 
            \frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
             v^{k}(v\epsilon-1)-v\epsilon+1}
        \dist((P_{ik}, r_{ik}),(P_{ik+1}, r_{ik+1}))\ .
    \end{equation*}
    where
    $\bar{r}_i := \sum_{\subround =1}^{k} \frac{v^{\subround -1} (v-1)}{v^{k}-1} r_{ik+\subround }$ and
    $\bar{P}_i := \sum_{\subround =1}^{k} \frac{v^{\subround -1} (v-1)}{v^{k}-1} P_{ik+\subround }$ for some $v>1$.
\end{lemma}
\begin{proof}
    \begin{align*}
        \dist((\bar{P}_i,\bar{r}_i), (P_{d_i},  r_{d_i}))
        \leq \sum_{\subround =1}^{k} \frac{v^{\subround -1}(v-1)}{v^k-1} 
        (\norm{P_{ik+\subround } - P_{d_i}}_2+ \norm{r_{ik+\subround } - r_{d_i}}_2)
    \end{align*}
    Note that if Assumption~\ref{assumption_sensitivity} holds with $\epsilon_p, \epsilon_r<1$,
    then the map $g_d$ is contractive with unique fixed point $(P_{d_i}, r_{d_i})$ 
    and Lipschitz coefficient $\epsilon$ (see Proposition~\ref{prop.contraction}).
    So we have for $v\epsilon \neq 1$:
    \begin{align*}
        &(1-\epsilon)\sum_{\subround =1}^{k} \frac{v^{\subround -1} (v-1)}{v^{k}-1} 
        (\norm{P_{ik+\subround } - P_{d_i}}_2+
         \norm{r_{ik+\subround } - r_{d_i}}_2)\\
        &\leq 
        (1-\epsilon)\sum_{\subround =1}^{k} \frac{\epsilon (\epsilon v)^{\subround -1} (v-1)}{v^k-1} 
        (\norm{P_{ik} - P_{d_i}}_2+
         \norm{r_{ik} - r_{d_i}}_2)\\
         &\leq
        \sum_{\subround =1}^{k} \frac{\epsilon(\epsilon v)^{\subround -1} (v-1)}{v^{k}-1} 
        (\norm{P_{ik} - P_{d_i}}_2 +
         \norm{r_{ik} - r_{d_i}}_2 - \norm{P_{ik+1} - P_{d_i}}_2 - \norm{r_{ik+1} - r_{d_i}}_2
         ) \\
         &\leq
        \frac{\epsilon(v-1)}{v^k-1} 
        (\norm{P_{ik} - P_{ik+1}}_2 +
         \norm{r_{ik} - r_{ik+1}}_2 ) \sum_{\subround =1}^{k} (\epsilon v)^{\subround -1} \\
         &=
        \frac{\epsilon(v-1)(v^k\epsilon^k-1)}{(v^k-1)(v\epsilon -1)} 
        (\norm{P_{ik} - P_{ik+1}}_2 +
         \norm{r_{ik} - r_{ik+1}}_2 ) \\
         &=
         \frac{v^{k+1}\epsilon^{k+1}-v\epsilon -v^k\epsilon^{k+1}+\epsilon}{
             v^{k+1}\epsilon-v^k-v\epsilon+1}
        (\norm{P_{ik} - P_{ik+1}}_2 +
         \norm{r_{ik} - r_{ik+1}}_2 ) \\
         &=
         \frac{v^{k}\epsilon^{k+1}(v-1)-v\epsilon +\epsilon}{
             v^{k}(v\epsilon-1)-v\epsilon+1}
        (\norm{P_{ik} - P_{ik+1}}_2 +
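         \norm{r_{ik} - r_{ik+1}}_2 )\ .
         % NOTE(review): this chain bounds $(1-\epsilon)$ times the weighted sum, so dividing by $(1-\epsilon)$
         % leaves a factor $\frac{1}{1-\epsilon}$ that does not appear in the lemma statement; please re-check.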
    \end{align*}
\end{proof}

