% !TEX root =  main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \newpage
% ~\\
% \newpage
\newcommand{\prelimsectioning}[1]{\paragraph{#1}}
\section{Preliminaries}\label{sec.formal-setting}
We follow \cite{PZM+20, BHK20} and \cite{MTR23} in defining the formal setting. 

\prelimsectioning{Markov Decision Processes}
We consider tabular Markov Decision Processes (MDPs), 
which consist of a finite state space $\setS$, finite action space
$\setA$, discount factor $\gamma$ and initial state distribution $\rho$. 
We assume that the reward and transition probability functions change over time, as a response to the policy which the learner deploys.
The learner deploys policy $\pi_t$ in round $t$ and the previous probability transition and reward function are $P_{t-1}$ and $r_{t-1}$.
They then change to $P_t = \Pc(\pi_t, P_{t-1}, r_{t-1})$ and $r_t = \Rc(\pi_t, P_{t-1}, r_{t-1})$ respectively,
according to the \emph{response models} $\Pc$ and $\Rc$.
Thus, the MDP in round $t$ is $M_t = (\setS, \setA, P_t, r_t, \rho)$.

When the learner deploys policy $\pi_{t}$, the probability of a trajectory $\tau = (s_k, a_k)_{k=0}^\infty$ being realized in round $t$
is given by $\P_t^{\pi_{t}}(\tau) = \rho(s_0) \prod_{k=0}^\infty \pi_{t}(a_k|s_k) P_t(s_k, a_k, s_{k+1})$.

Given policy $\pi$ and initial state distribution $\rho$, 
we denote the value function at round $t$ as $V_t^\pi(\rho)$.
It is defined as
\begin{equation*}
V_t^\pi(\rho) = \E_{\tau\sim \P_t^\pi}\left[\sum_{k=0}^\infty\gamma^kr_t(s_k,a_k)|\rho\right].
\end{equation*}

The learner in round $t$ has access to the past MDPs $M_0, \dots, M_{t-1}$, or a finite number
of samples thereof.

\prelimsectioning{Solution Concept}
We assume that when the learner deploys $\pi$ in every round, the MDP converges
to the \emph{limiting MDP} $M_\pi=(\setS, \setA, P_{\pi}, r_{\pi}, \rho)$, which is independent of the initial MDP.
Using this, we can define the \emph{performative value function} as
\begin{equation*}
V_{\pi'}^{\pi}(\rho) = \lim_{t\rightarrow \infty}V_t^\pi(\rho| \pi_i = \pi'\  \forall i)\ .
\end{equation*}
It is the value function of policy $\pi$ in the limiting MDP $M_{\pi'}$, i.e., the MDP obtained when $\pi'$ is deployed in every round.

One common solution concept in this setting is to find a \emph{performatively stable policy}, defined as follows.
\begin{definition}[Performatively Stable Policy]
We call a policy $\pi$ \emph{performatively stable}, 
if it is the best response to the MDP $M_\pi$.
That is, $\pi \in \argmax_{\pi'} V_{\pi}^{\pi'}(\rho)$.
\end{definition}
Given two performatively stable policies $\pi_1$ and $\pi_2$, their convex combination might not be performatively stable. Because of this, it is hard to use the standard formulation of RL.
This problem is alleviated by using the linear programming formulation of RL.
To describe this,
we define the long-term state-action occupancy measure of a policy $\pi$ in MDP $M_t$ as
$d_t^\pi(s,a) = \E_{\tau\sim \P_t^\pi}\left[\sum_{k=0}^\infty \gamma^k\one\{s_k=s, a_k=a\}\right]$.
When given occupancy measure $d$, one can consider the following policy $\pi^d$, which has occupancy measure $d$.
\begin{align}\label{eq:dtopi}
    \pi^d(a|s) = \left\{\begin{array}{cc}
        \frac{d(s,a)}{\sum_b d(s,b)} & \textrm{ if } \sum_a d(s,a) > 0 \\
        \frac{1}{\sizeA} & \textrm{ otherwise }
    \end{array} \right.
\end{align}
We consider that the learner parameterizes its policy by the occupancy measure and calculates the policy via~\eqref{eq:dtopi}.

In an unregularized setting, we would say that an occupancy measure $d_S^*$ is performatively stable if it is an optimal solution to the following linear program.
\begin{align}\label{eq:perf-stable-policy}
        &d_S^* \in \argmax_{d \ge 0} \sum_{s,a} d(s,a) r_{d_S^*}(s,a)\\
        \textrm{s.t.} & \sum_a d(s,a) = \rho(s) + \gamma \cdot \sum_{s',a} d(s',a) P_{d_S^*}(s',a,s) \ \forall s
        \nonumber
\end{align}
where we denote $P_d = P_{\pi_d}$ and $r_d = r_{\pi_d}$\ .
This describes an occupancy measure which is itself the best response against the current MDP.

But, similar to prior work, to make the theoretical analysis feasible, we assume the following regularized version of optimization problem~\eqref{eq:perf-stable-policy}.
A stable occupancy measure $d_S$ is defined by
\begin{align}
    \label{eq:perf-stable-policy-regularized}
        &d_S \in \argmax_{d\ge 0}\ \sum_{s,a} d(s,a) r_{d_S}(s,a) - \frac{\lambda}{2}\norm{d}_2^2\\
       \textrm{s.t. } & \sum_a d(s,a) = \rho(s) + \gamma \cdot \sum_{s',a} d(s',a) P_{d_S}(s',a,s)\ \forall s.
       \nonumber
\end{align}
Here $\lambda$ is a constant regularization factor which describes the strong-concavity of the objective.
This describes an occupancy measure which is itself the best response against a regularized objective of the current MDP.
If a learner updates their occupancy measure using the best response against an $L_2$-regularized objective, \eqref{eq:perf-stable-policy-regularized} describes an occupancy measure which would not change under such an update, i.e., it is stable.

In our results, we provide lower bounds on the regularization factor $\lambda$ that are sufficient to guarantee convergence.
Furthermore, in Appendix~\ref{appdx.apprx-unregularized} we show that \eqref{eq:perf-stable-policy-regularized} approximates the unregularized objective~\eqref{eq:perf-stable-policy}.

\prelimsectioning{Sensitivity Assumption}
We overload the notation to write the response models in the following form.
For every occupancy measure $d$, let $\Pc(d, P, r) = \Pc(\pi_d, P, r)$
and $\Rc(d, P, r) = \Rc(\pi_d, P, r)$.

For the learner to make use of the information from past rounds, we use the following sensitivity assumption, which is commonly used in performative prediction.
\begin{assumption}[sensitivity]\label{assumption_sensitivity}
    Consider some $\iota_p,\iota_r,\epsilon_{p,p},\epsilon_{p,r},\epsilon_{r,p},
    \epsilon_{r,r} \geq 0$ 
    with $\iota=\iota_{p} + \iota_{r}< 1$, $\epsilon_p=\epsilon_{p,p} + \epsilon_{r,p}< 1$ and 
    $\epsilon_r=\epsilon_{p,r} + \epsilon_{r,r}<1$.
    Assume
    \begin{align*}
        &\|\Pc(d, P, r) - \Pc(d', P', r') \|_2 \\ 
        &\leq \iota_{p}
        \|d-d'\|_2 + 
        \epsilon_{p,p}\|P-P'\|_2 + \epsilon_{p,r}\|r-r'\|_2 \text{ and}\\
    &\|\Rc(d, P, r) - \Rc(d',P', r') \|_2 \\ 
    &\leq \iota_{r}\|d-d'\|_2 + 
    \epsilon_{r,p}\|P-P'\|_2 + \epsilon_{r,r}\|r-r'\|_2
    \end{align*}
    for any occupancy measures $d,d'$, reward functions $r,r'$ and probability transition functions $P,P'$.
\end{assumption}
Assumption~\ref{assumption_sensitivity} ensures that when the learner deploys a new policy, the new MDP does not drift too far from the old MDP.
In Appendix~\ref{appdx.example_sensitivity} we discuss one example where this assumption commonly holds.

When $\epsilon_p, \epsilon_r <1$, the mapping from $(P, r)$ to $(\Pc(d,P,r), \Rc(d,P,r))$ is a contraction for any occupancy measure~$d$ 
(Proof in Appendix~\ref{appdx.sec.contraction}).
Therefore, if the learner deploys the same policy $\pi$ in every round, $P_t$ and $r_t$ asymptotically converge to some $P_\pi$ and $r_\pi$ respectively, so under Assumption~\ref{assumption_sensitivity} we do not need
to assume this convergence explicitly.

To simplify the exposition of the results in the main paper, we make the following
assumption, without explicitly restating it in the results.
\begin{assumption}\label{assumption-simplicity}
For the results in the main part of the paper, we assume that 
    $\epsilon_{p,p} = \epsilon_{p,r} = \epsilon_{r,p}= \epsilon_{r,r} = \frac{\epsilon}{2}$,
$\iota_p, \iota_r \leq \frac{\epsilon}{2}$ for some $\epsilon<1$ and 
            $\frac{9\gamma \sizeS}{(1-\gamma)^2}
        \geq 1$ .
\end{assumption}
Assumption~\ref{assumption-simplicity} is not critical -- as we show in the appendix, our results easily generalize when we do not assume it.

\prelimsectioning{Sample Generation Model}
We also consider finite-sample versions of the algorithms we propose.
For this we use the following sample generation model.
In round $t$, let $\bar{d}_t$ be the occupancy measure 
of $\pi_t$ under dynamics~$P_{t}$.
Note that this is different from the occupancy measure $d_t$ which the learner uses to calculate its policy~$\pi_{d_t}$, since $d_t$ was calculated using different dynamics.
We then define the normalized occupancy measure $\tilde{d}_t(s,a) = (1-\gamma)\bar{d}_t(s,a)$.
Each sample in round $t$ is a tuple $(s,a,r,s')$ and is generated in the following way. First, a state-action pair is sampled i.i.d.\ according
to $(s,a)\sim \tilde{d}_t$, then the reward is set to $r=r_{t}(s,a)$, and finally the next state is sampled as $s'\sim P_{t}(\cdot|s,a)$.
This is a standard model of sample generation in offline RL~\citep{munos2008finite, farahmand2010error, xie2021batch, MTR23}.