\section{Preliminaries}
\label{Section: Preliminaries}

We present standard notation and definitions related to deterministic Markov decision processes.

\paragraph{Deterministic Markov Decision Processes.} 
A deterministic Markov decision process~(DMDP) is a finite directed weighted graph $P = (\Vertices, \Edges, \weight)$ consisting of
\begin{compactitem}
    \item the set of vertices $\Vertices$, of size $n$; 
    \item the set of edges $\Edges \subseteq \Vertices \times \Vertices$, of size $m$, such that for all $\vertex \in \Vertices$, the set $\Edges(\vertex) \defas \{ \otherver \mid (\vertex, \otherver) \in \Edges \}$ is non-empty; and
    \item the weight function $\weight \colon \Edges \to \ZZ$ that assigns a weight $\weight(\vertex, \otherver)$ to every edge $(\vertex, \otherver) \in \Edges$.
\end{compactitem}
We denote the largest absolute weight by $W \defas \max \{ |\weight(\vertex, \otherver)|  \mid  (\vertex, \otherver) \in \Edges \}$. The size of $P$ is defined as $|P| \defas n + m +  \sum_{(\vertex, \otherver) \in \Edges} \lceil \log_2 (|\weight(\vertex, \otherver)| + 1) \rceil$. The vertices are indexed and have an ordering.


\paragraph{Steps and Runs.} 
Given an initial vertex $\vertex_0 \in \Vertices$, the process proceeds as follows.
In each step, the controller chooses the next vertex from the set $\Edges(\vertex)$, where $\vertex$ is the current vertex.
A \emph{run} is an infinite sequence of vertices $\run = \langle \vertex_0, \vertex_1, \ldots \rangle$ where for every step $t \ge 0$, the vertex $\vertex_{t+1} \in \Edges(\vertex_t)$. We denote by $\runs$ 
the set of all runs, and by $\runs_\vertex$ the set of all runs $\run = \langle \vertex_0, \vertex_1, \ldots \rangle$
where $\vertex_0 = \vertex$.

\paragraph{Mean-payoff Objectives.} 
An objective is a measurable function that assigns a real number to every run. For a run $\run = \langle \vertex_0, \vertex_1, \ldots \rangle$, the average for $t$ steps is $\avg_t(\run) \defas \frac{1}{t} \sum_{i = 0}^{t-1} \weight(\vertex_i, \vertex_{i+1})$. The $\liminf$ average is $\liminfavg(\run) \defas \liminf_{t \to \infty} \avg_t(\run)$. The objective of the controller is to maximize the $\liminf$ average of the run.
    

\paragraph{Positional Policies.}
Policies are recipes that specify how to choose the next vertex.
A \emph{positional} policy $\policy \colon \Vertices \to \Vertices$ for the controller is a policy which chooses
a vertex~$\policy(\vertex) \in \Edges(\vertex)$ whenever the run visits vertex~$\vertex$. 
We denote by $\policies^P$ the set of all positional policies.
In general, policies can depend on past history and not only the current vertex. 
However, for mean-payoff objectives, positional policies are as powerful as general policies~\citep{Puterman94}.
Hence, in the sequel, every policy is positional.

\paragraph{Runs Given Policies in DMDPs.}
We define $P^\policy$ as the restricted DMDP where the controller follows the policy $\policy$. Note that once the controller has fixed their policy, we obtain a graph where each vertex has exactly one outgoing edge.
Given an initial vertex $\vertex$, we obtain a run $P_\vertex^{\policy} = \langle \vertex_0, \vertex_1, \ldots \rangle$ such that $\vertex_0 = \vertex$, and for any step $t \ge 0$, $\vertex_{t+1} = \policy(\vertex_t)$. The obtained run $P_\vertex^{\policy}$ is a \emph{lasso-shaped} run that consists of a finite cycle-free path $\calP \defas \langle \vertex_0, \ldots, \vertex_{p} \rangle$ followed by a simple cycle $\C \defas \langle \vertex_p, \ldots, \vertex_{p + c - 1} \rangle$
repeated forever (so that $\vertex_{p + c} = \vertex_p$), where $\vertex_p$ is the \emph{head} of the cycle (the vertex with the least index in the cycle). The mean-payoff of the policy $\policy$ is defined as
\[
    \val^\policy(\vertex) \defas \liminfavg(P_\vertex^{\policy}) = \frac{1}{c} \sum_{i=0}^{c-1} \weight(\vertex_{p + i}, \vertex_{p + i + 1}) \,.
\]
We define the potential function as
\[
    \pot^\policy(\vertex) \defas \sum_{i=0}^{p-1} \left (\weight(\vertex_i, \vertex_{i+1}) - \val^\policy(\vertex) \right )\,.
\]
In words, the payoff $\val^{\policy}(\vertex)$ is the mean-payoff the controller obtains when following the policy $\policy$, and the potential $\pot^\policy(\vertex)$ is the relative distance from $\vertex$ to $\vertex_p$, where the mean-payoff is subtracted from the weight of each edge.

\paragraph{Value and Optimal Policies.}
The mean-payoff value for a vertex $\vertex$ is defined as
    $\val(\vertex) \defas \max_{\policy \in \policies^P} \val^{\policy}(\vertex)$.
A policy~$\policy$ for the controller is \emph{optimal} for mean-payoff objectives if, for all vertices $\vertex \in \Vertices$, we have $\val^\policy(\vertex) = \val(\vertex)$. 

\paragraph{Bellman Operator.} Given a policy $\policy$, we define the {\em appraisal} of an edge $(\vertex, \otherver)$ as a tuple
\[
    \appr^\policy(\vertex, \otherver) \defas (\val^\policy(\otherver), \weight(\vertex, \otherver) - \val^\policy(\otherver) + \pot^\policy(\otherver) ) \,. 
\]
The Bellman operator, which is an operator from $\policies^P$ to $\policies^P$, is defined as
\[\B(\policy)(\vertex) \defas \arg \max_{\otherver \in \Edges(\vertex)} \appr^\policy(\vertex, \otherver)\,.\]
The appraisals are compared lexicographically, and ties are resolved by first favoring $\otherver=\policy(\vertex)$, then vertices with the least index. For increased legibility, we will refer to the second term of the appraisal as \[\apprtwo^\policy(\vertex, \otherver) \defas \weight(\vertex, \otherver) - \val^\policy(\otherver) + \pot^\policy(\otherver)\,.\]


\paragraph{Howard's Policy Iteration.} Howard's policy iteration is a classical algorithm for computing the optimal policies in DMDPs with mean-payoff objectives. The algorithm starts with an arbitrary policy $\policy_0$. 
%\footnote{Frequently, the algorithm is defined to use starting strategy $\policy_0(\vertex) = \arg\max_{\otherver\in\Edges(\vertex)} \weight(\vertex, \otherver)$, where the highest weight outgoing edge is picked for each vertex \citep{Howard60}. Our lower bound also holds for this version of the algorithm, details are provided in \Cref{Section:Extensions}}. 
In each iteration, the algorithm locally improves the current policy: Starting with $\policy_k$ at iteration $k$, the algorithm computes the payoff and the potential of the policy $\policy_k$. Using these, it updates the policy using the Bellman operator defined above by setting $\policy_{k+1}=\B(\policy_k)$. The algorithm terminates if $\policy_{k+1}=\policy_k$, meaning no update to the policy was made. The correctness of Howard's algorithm is shown by \citet{derman1970finite} and \citet{Puterman94}.




% \smallskip\noindent{\em Notation.} For simplicity, we denote by $\discounted_\discfac(G^{\strategyone, \strategytwo})$ a vector whose $\vertex$-th coordinate is the discounted payoff for vertex $\vertex$, i.e., $\discounted_\discfac(G_\vertex^{\strategyone, \strategytwo})$.

% We recall the fundamental determinacy in positional strategies for TBGs with discounted-sum and mean-payoff objectives. 

% \begin{theorem}[\cite{condon1992ComplexityStochasticGames}]
% \label{Result: determinacy of discounted tbgs}
% For all TBGs $G$, initial vertices $v$, reward functions, and discount factors $\discfac$, we have 
% \[
% \max_{\strategyone \in \Strategyone^P} \min_{\strategytwo \in \Strategytwo^P} \discounted_\discfac(G_{\vertex}^{\strategyone, \strategytwo}) =
% \min_{\strategytwo \in \Strategytwo^P} \max_{\strategyone \in \Strategyone^P}  \discounted_\discfac(G_{\vertex}^{\strategyone, \strategytwo}) \,.
% \]
% \end{theorem}

% \begin{theorem}[\cite{condon1992ComplexityStochasticGames}]
% \label{Result: determinacy of mean-payoff tbgs}
% For all TBGs $G$, initial vertices $v$, and reward functions, we have 
% \[
% \max_{\strategyone \in \Strategyone^P} \min_{\strategytwo \in \Strategytwo^P} \liminfavg(G_{\vertex}^{\strategyone, \strategytwo}) =
% \min_{\strategytwo \in \Strategytwo^P} \max_{\strategyone \in \Strategyone^P}  \limsupavg(G_{\vertex}^{\strategyone, \strategytwo}) \,.
% \]
% \end{theorem}


% \smallskip\noindent{\em Overview.}
% Our analysis focuses on the difference of values between two lasso-shaped plays and uses a class of polynomials and the properties of their roots.
% In \Cref{section: polynomials}, we present the results related to upper and lower bounds on the roots of polynomials.
% In \Cref{section: si}, we present the improved analysis of the SI algorithm by employing the results of \Cref{section: polynomials}.
