%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{xr}


\externaldocument{mondal_582}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{subcaption}
\newtheorem{assumption}{Assumption}[section]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{example}{Example}[section]
\newtheorem{remark}{Remark}[section]


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Can Mean Field Control (MFC) Approximate Cooperative Multi Agent Reinforcement Learning (MARL) with Non-Uniform Interaction? (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, 2]{\href{mailto:<wmondal@purdue.edu>?Subject=Your UAI 2022 paper}{Washim~Uddin~Mondal}{}}
\author[1]{Vaneet Aggarwal}
\author[2]{Satish V. Ukkusuri}
% Add affiliations after the authors
\affil[1]{%
	School of Industrial Engineering\\
	Purdue University\\
	West Lafayette, Indiana, USA 47907
}
\affil[2]{%
	Lyles School of Civil Engineering\\
	Purdue University\\
	West Lafayette, Indiana, USA 47907
}

\begin{document}
	\maketitle
	\appendix
	\section{Proof of Corollary \ref{corollary_1}}
	The following inequalities hold $\forall x\in \mathcal{X}$, $\forall u\in \mathcal{U}$, $\forall \boldsymbol{\mu}_1\in\mathcal{P}(\mathcal{X})$, and $\forall \boldsymbol{\nu}_1\in\mathcal{P}(\mathcal{U})$.
	\begin{align*}
		|r(x, u, \boldsymbol{\mu}_1, \boldsymbol{\nu}_1)|&\leq |\boldsymbol{a}^T\boldsymbol{\mu}_1| + |\boldsymbol{b}^T\boldsymbol{\nu}_1| + |f(x, u)|\\
		&\leq |\boldsymbol{a}|_1|\boldsymbol{\mu}_1|_1 + |\boldsymbol{b}|_1|\boldsymbol{\nu}_1|_1 + |f(x, u)|\\
		& \overset{(a)}{=} |\boldsymbol{a}|_1 + |\boldsymbol{b}|_1 + |f(x, u)|
	\end{align*}
	
	Equality (a) follows from the fact that both $\boldsymbol{\mu}_1$ and $\boldsymbol{\nu}_1$ are probability distributions. As the sets $\mathcal{X}$, $\mathcal{U}$ are finite, there must exist $M_F>0$ such that, $|f(x, u)|\leq M_F$, $\forall x\in \mathcal{X}$, $\forall u\in \mathcal{U}$. Taking $M_R = |\boldsymbol{a}|_1 + |\boldsymbol{b}|_1 + M_F$, we can establish proposition (a).
	
	Proposition (b) follows from the fact that $\forall x\in \mathcal{X}$, $\forall u\in \mathcal{U}$, $\forall \boldsymbol{\mu}_1, \boldsymbol{\mu}_2\in\mathcal{P}(\mathcal{X})$, $\forall \boldsymbol{\nu}_1, \boldsymbol{\nu}_2\in\mathcal{P}(\mathcal{U})$, the following relations hold.
	\begin{align*}
		|r(x, u,& \boldsymbol{\mu}_1, \boldsymbol{\nu}_1) - r(x, u, \boldsymbol{\mu}_2, \boldsymbol{\nu}_2)|\\
		&\leq  |\boldsymbol{a}^T(\boldsymbol{\mu}_1-\boldsymbol{\mu}_2)| + |\boldsymbol{b}^T(\boldsymbol{\nu}_1-\boldsymbol{\nu}_2)|\\
		&\leq |\boldsymbol{a}|_1 |\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1 + |\boldsymbol{b}|_1 |\boldsymbol{\nu}_1 - \boldsymbol{\nu}_2|_1
	\end{align*}
	
	Taking $L_R = \max\{|\boldsymbol{a}|_1, |\boldsymbol{b}|_1\}$, we conclude the result.
	
	\section{Proof of Theorem \ref{theorem_1}}
	
	The following results are necessary to establish the theorem. 
	
	\subsection{Lipschitz Continuity}
	In the following three lemmas, we shall establish that the functions, $\nu^{\mathrm{MF}}$, $P^{\mathrm{MF}}$ and $r^{\mathrm{MF}}$ defined in $(\ref{nu_MF}), (\ref{mu_t_plus_1})$ and $(\ref{v_MF})$ are Lipschitz continuous. In all of these lemmas, the term $\Pi$ denotes the set of policies that satisfies Assumption \ref{assumption_3}. The proofs of these lemmas are delegated to Appendix \ref{proof_lemma_1}, \ref{proof_lemma_2}, and \ref{proof_lemma_3} respectively.
	
	\begin{lemma}
		If $\nu^{\mathrm{MF}}(.,.)$ is defined by $(\ref{nu_MF})$, then $\forall \boldsymbol{\mu}_1,\boldsymbol{\mu}_2\in \mathcal{P}(\mathcal{X})$, $\forall \pi\in \Pi$, the following inequality holds.
		\begin{align*}
			|\nu^{\mathrm{MF}}(\boldsymbol{\mu}_1, \pi) - \nu^{\mathrm{MF}}(\boldsymbol{\mu}_2, \pi)|_1 \leq (1+L_Q)|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1
		\end{align*}
		where $L_Q$ is defined in Assumption \ref{assumption_3}.
		\label{lemma_1}
	\end{lemma}
	
	\begin{lemma}
		If $P^{\mathrm{MF}}(.,.)$ is defined by $(\ref{mu_t_plus_1})$, then $\forall \boldsymbol{\mu}_1,\boldsymbol{\mu}_2\in \mathcal{P}(\mathcal{X})$, $\forall \pi\in \Pi$, the following inequality holds.
		\begin{align*}
			\begin{split}
				&|P^{\mathrm{MF}}(\boldsymbol{\mu}_1, \pi) - P^{\mathrm{MF}}(\boldsymbol{\mu}_2, \pi)|_1 \leq S_P|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\\
				\text{where}~&S_P\triangleq (1+L_Q) + L_P(2+L_Q).
			\end{split}
		\end{align*}
		The terms $L_P$, and $L_Q$ are defined in Assumption \ref{assumption_1}, and \ref{assumption_3} respectively.
		\label{lemma_2}
	\end{lemma}
	
	\begin{lemma}
		If $r^{\mathrm{MF}}(.,.)$ is defined by $(\ref{v_MF})$, then $\forall \boldsymbol{\mu}_1,\boldsymbol{\mu}_2\in \mathcal{P}(\mathcal{X})$, $\forall \pi\in \Pi$, the following inequality holds.
		\begin{align*}
			\begin{split}
				&|r^{\mathrm{MF}}(\boldsymbol{\mu}_1, \pi) - r^{\mathrm{MF}}(\boldsymbol{\mu}_2, \pi)|_1 \leq S_R|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\\
				\text{where}~&S_R\triangleq M_R(1+L_Q) + L_R(2+L_Q).
			\end{split}
		\end{align*}
		The terms $M_R, L_R$, and $L_Q$ are defined in Corollary \ref{corollary_1} and Assumption \ref{assumption_3} respectively.
		\label{lemma_3}
	\end{lemma}
	
	\subsection{Approximation Results}
	\label{appndx:approx}
	
	The following Lemma \ref{lemma_5}, \ref{lemma_6}, \ref{lemma_7} establish that the state, action distributions and the average reward of an $N$-agent system closely approximate their mean-field counterparts when $N$ is large. All of these results use Lemma \ref{lemma_4} as the key ingredient.
	
	\begin{lemma}
		\citep{mondal2021approximation} Assume that $\forall m\in [M]$, $\{X_{m,n}\}_{n\in[N]}$ are independent random variables that lie in the interval $[0, 1]$, and satisfy the following constraint: $\sum_{m\in[M]}\mathbb{E}[X_{m,n}]=1$, $\forall n\in [N]$. If $\{C_{m,n}\}_{m\in[M], n\in [N]}$ are constants that obey $|C_{m,n}|\leq C$, $\forall m\in [M]$, $\forall n\in [N]$, then the following inequality holds.
		\begin{align*}
			\sum_{m\in[M]} \mathbb{E} \Big| \sum_{n\in[N]} C_{m,n}(X_{m,n}-\mathbb{E}[X_{m,n}])\Big|\leq C\sqrt{MN}
		\end{align*}
		\label{lemma_4}
	\end{lemma}
	
	The proofs of Lemma \ref{lemma_5}, \ref{lemma_6}, and \ref{lemma_7} have been delegated to Appendix \ref{proof_lemma_5}, \ref{proof_lemma_6}, and \ref{proof_lemma_7} respectively.
	
	\begin{lemma}
		Assume $\{\boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N\}_{t\in \mathbb{T}}$ are empirical state and action distributions of an $N$-agent system defined by (\ref{mu}), and (\ref{nu}) respectively. If these distributions are generated by a sequence of policies $\boldsymbol{\pi} = \{\pi_t\}_{t\in \mathbb{T}}$, then $\forall t\in \mathbb{T}$ the following inequality holds. 
		\begin{align*}
			\mathbb{E}|\boldsymbol{\nu}_t^N-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N,\pi_t)|_1 \leq \dfrac{\sqrt{|\mathcal{U}|}}{\sqrt{N}}
		\end{align*} 
		where $\nu^{\mathrm{MF}}$ is defined in $(\ref{nu_MF})$.
		\label{lemma_5}
	\end{lemma}
	
	\begin{lemma}
		Assume $\{\boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N\}_{t\in \mathbb{T}}$ are empirical state and action distributions of an $N$-agent system defined by (\ref{mu}), and (\ref{nu}) respectively. If these distributions are generated by a sequence of policies $\boldsymbol{\pi} = \{\pi_t\}_{t\in \mathbb{T}}$, then $\forall t\in \mathbb{T}$ the following inequality holds. 
		\begin{align*}
			\mathbb{E}|\boldsymbol{\mu}_{t+1}^N-P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N,\pi_t)|_1 \leq \dfrac{C_P}{\sqrt{N}}\left[\sqrt{|\mathcal{X}|}+\sqrt{|\mathcal{U}|}\right]
		\end{align*} 
		where $P^{\mathrm{MF}}$ is defined in $(\ref{mu_t_plus_1})$, $C_P\triangleq 2+L_P$, and $L_P$ is given in Assumption \ref{assumption_1}.
		\label{lemma_6}
	\end{lemma}
	
	\begin{lemma}
		Assume $\{\boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N\}_{t\in \mathbb{T}}$ are empirical state and action distributions of an $N$-agent system defined by (\ref{mu}), and (\ref{nu}) respectively. Also, $\forall i\in [N]$, let $\{\boldsymbol{\mu}_t^{i,N}, \boldsymbol{\nu}_t^{i,N}\}$ be weighted state and action distributions defined by $(\ref{mu_i}), (\ref{nu_i})$. If these distributions are generated by a sequence of policies $\boldsymbol{\pi} = \{\pi_t\}_{t\in \mathbb{T}}$, then $\forall t\in \mathbb{T}$ the following inequality holds. 
		\begin{align*}
			&\mathbb{E}\left|\dfrac{1}{N}\sum_{i=1}^N r(x_t^i, u_t^i, \boldsymbol{\mu}_t^{i,N}, \boldsymbol{\nu}_t^{i,N})-r^{\mathrm{MF}}(\boldsymbol{\mu}_t^N,\pi_t)\right| \\
			&\hspace{1cm}\leq  C_R\dfrac{\sqrt{|\mathcal{U}|}}{\sqrt{N}}
		\end{align*} 
		where $r^{\mathrm{MF}}$ is given in $(\ref{v_MF})$, $C_R\triangleq |\boldsymbol{b}|_1 + M_F$ and $M_F$ is such that $|f(x,u)|\leq M_F$, $\forall x\in \mathcal{X}$, $\forall u\in \mathcal{U}$. The function $f(.,.)$ and the parameter $\boldsymbol{b}$ are defined in Assumption \ref{assumption_2}. We would like to mention that $M_F$ always exists since $\mathcal{X}, \mathcal{U}$ are finite.
		\label{lemma_7}
	\end{lemma}
	
	\subsection{Proof of the Theorem}
	
	Note that,
	\begin{align*}
		&|v_{\mathrm{MARL}}(\boldsymbol{x}_0, \boldsymbol{\pi}) - v_{\mathrm{MF}}(\boldsymbol{\mu}_0, \boldsymbol{\pi})|\\
		& \overset{(a)}{=}\Bigg|\sum_{t=0}^{\infty}\dfrac{1}{N}\sum_{i=1}^N \gamma^t\mathbb{E}[r(x_t^i,u_t^i,\boldsymbol{\mu}_t^{i,N},\boldsymbol{\nu}_t^{i,N})]\\
		&-\sum_{t=0}^{\infty}\gamma^tr^{\mathrm{MF}}(\boldsymbol{\mu}_t, \pi_t)\Bigg|\leq J_1 +J_2
	\end{align*}
	
	Equality (a) directly follows from the definitions $(\ref{v_MARL})$ and $(\ref{v_MF})$. The first term $J_1$ can be written as follows.
	\begin{align*}
		&J_1 \triangleq \sum_{t=0}^{\infty}\gamma^t\mathbb{E}\Bigg|\dfrac{1}{N}\sum_{i=1}^N [r(x_t^i,u_t^i,\boldsymbol{\mu}_t^{i,N},\boldsymbol{\nu}_t^{i,N})] - r^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)\Bigg|\\
		& \overset{(a)}{\leq} C_R \dfrac{\sqrt{|\mathcal{U}|}}{\sqrt{N}}\dfrac{1}{1-\gamma}
	\end{align*}
	
	Inequality (a) is a result of Lemma \ref{lemma_7}. The second term can be expressed as follows.
	\begin{align*}
		&J_2\triangleq \sum_{t=0}^{\infty}\gamma^t\mathbb{E}|r^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t) - r^{\mathrm{MF}}(\boldsymbol{\mu}_t, \pi_t)|\\
		&\overset{(a)}{\leq} S_R\sum_{t=0}^{\infty}\gamma^t\mathbb{E}|\boldsymbol{\mu}_t^N - \boldsymbol{\mu}_t|_1 
	\end{align*}
	
	Inequality (a) follows from Lemma \ref{lemma_3}. Observe that, $\forall t\in \mathbb{T}$,
	\begin{align*}
		&\mathbb{E}|\boldsymbol{\mu}_{t+1}^N-\boldsymbol{\mu}_{t+1}|_1\\
		&\leq \mathbb{E}|\boldsymbol{\mu}_{t+1}^N-P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)|_1 + \mathbb{E}|P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)-\boldsymbol{\mu}_{t+1}|_1\\
		&\overset{(a)}{\leq} \dfrac{C_P}{\sqrt{N}}\left[\sqrt{|\mathcal{X}|}+\sqrt{|\mathcal{U}|}\right] \\
		&\hspace{1cm}+ \mathbb{E}|P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)-P^{\mathrm{MF}}(\boldsymbol{\mu}_t, \pi_t)|_1\\
		&\overset{(b)}{\leq} \dfrac{C_P}{\sqrt{N}}\left[\sqrt{|\mathcal{X}|}+\sqrt{|\mathcal{U}|}\right] + S_P\mathbb{E}|\boldsymbol{\mu}_t^N-\boldsymbol{\mu}_t|_1\\
		&\overset{(c)}{\leq} \dfrac{C_P}{\sqrt{N}}\left[\sqrt{|\mathcal{X}|}+\sqrt{|\mathcal{U}|}\right] \dfrac{(S_P^{t+1}-1)}{S_P-1}
	\end{align*}
	
	Inequality (a) follows from Lemma \ref{lemma_6} and Eq. (\ref{mu_t_plus_1}) while (b) is a result of Lemma \ref{lemma_2}. Finally, inequality (c) can be derived by recursively applying (b). Therefore, the term $J_2$ can be upper bounded as follows.
	\begin{align*}
		J_2 \leq \dfrac{1}{\sqrt{N}}\left[\sqrt{|\mathcal{X}|}+\sqrt{|\mathcal{U}|}\right]\dfrac{S_RC_P}{S_P-1} \left[\dfrac{1}{1-\gamma S_P}-\dfrac{1}{1-\gamma}\right]
	\end{align*}
	
	This concludes the theorem. 
	
	\section{Proof of Lemma \ref{lemma_1}}
	\label{proof_lemma_1}
	
	The following inequalities hold true.
	\begin{align*}
		|&\nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi)-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)|_1 \\
		&= \left|\sum_{x\in\mathcal{X}}\pi(x,\boldsymbol{\mu}_1)\boldsymbol{\mu}_1(x) - \sum_{x\in\mathcal{X}}\pi(x, \boldsymbol{\mu}_2)\boldsymbol{\mu}_2(x)\right|_1\\
		&=\sum_{u\in\mathcal{U}}\left|\sum_{x\in\mathcal{X}}\pi(x,\boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x) - \sum_{x\in\mathcal{X}}\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\right|\\
		&\leq \sum_{u\in\mathcal{U}}\left|\sum_{x\in\mathcal{X}}\pi(x,\boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x) - \sum_{x\in\mathcal{X}}\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_1(x)\right|\\
		&+ \sum_{u\in\mathcal{U}}\left|\sum_{x\in\mathcal{X}}\pi(x,\boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_1(x) - \sum_{x\in\mathcal{X}}\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\right|\\
		&\leq \sum_{x\in \mathcal{X}}\boldsymbol{\mu}_1(x)\sum_{u\in\mathcal{U}}\left|\pi(x,\boldsymbol{\mu}_1)(u)-\pi(x,\boldsymbol{\mu}_2)(u)\right|\\
		&+ \sum_{x\in \mathcal{X}} \left|\boldsymbol{\mu}_1(x)-\boldsymbol{\mu}_2(x)\right| \sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_2)(u)\\
		&\overset{(a)}{\leq} L_Q|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1 \sum_{x\in\mathcal{X}} \boldsymbol{\mu}_1(x) + |\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\\
		&\overset{(b)}{=} (1+L_Q)|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1
	\end{align*}
	
	Inequality (a) is a consequence of the fact that $\pi\in \Pi$ and $\pi(x,\boldsymbol{\mu}_2)$ is a distribution. Finally, the equality (b) follows because $\boldsymbol{\mu}_1$ is a distribution. This concludes the result.
	
	
	\section{Proof of Lemma \ref{lemma_2}}
	\label{proof_lemma_2}
	
	Note the following inequalities.
	\begin{align*}
		&|P^{\mathrm{MF}}(\boldsymbol{\mu}_1, \pi) - P^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)|_1 \\
		&= \Bigg|\sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}P(x, u, \boldsymbol{\mu}_1, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi))\pi(x, \boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)\\
		&-\sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}P(x, u, \boldsymbol{\mu}_2, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi))\pi(x,\boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\Bigg|_1\\
		&\leq J_1 + J_2 
	\end{align*}
	where the term $J_1$ is as follows.
	
	\begin{align*}
		&J_1\triangleq \sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)\\
		&\times \Big| P(x, u, \boldsymbol{\mu}_1, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi)) - P(x, u, \boldsymbol{\mu}_2, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)) \Big|_1\\
		&\overset{(a)}{\leq} \sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)\\
		&\times L_P\Big\{|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1 + |\nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi)-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)|_1\Big\}\\
		&\overset{(b)}{\leq} L_P(2+L_Q)|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1
	\end{align*}
	
	Inequality (a) follows from Assumption \ref{assumption_1} whereas (b) uses Lemma \ref{lemma_1} and the fact that $\boldsymbol{\mu}_1$, $\pi(x,\boldsymbol{\mu}_1)$ are distributions. The term $J_2$ is given as follows.
	\begin{align*}
		J_2&\triangleq \sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}} \left| P(x, u, \boldsymbol{\mu}_2,\nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi))\right|_1\\
		&\times \Big|\pi(x, \boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)-\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\Big|\\
		&\overset{(a)}{=}\sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}  \Big|\pi(x, \boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)-\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\Big|\\
		&\leq \sum_{x\in \mathcal{X}}\boldsymbol{\mu}_1(x)\sum_{u\in\mathcal{U}}|\pi(x,\boldsymbol{\mu}_1)(u)-\pi(x,\boldsymbol{\mu}_2)(u)|\\
		&+\sum_{x\in \mathcal{X}}|\boldsymbol{\mu}_1(x)-\boldsymbol{\mu}_2(x)|\sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_2)(u)\\
		&\overset{(b)}{\leq} L_Q|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\sum_{x\in \mathcal{X}}\boldsymbol{\mu}_1(x) + |\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\\
		&\overset{(c)}{=}(1+L_Q)|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1
	\end{align*}
	
	Equality (a) uses the fact that $P(x, u, \boldsymbol{\mu}_2, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi))$ is a distribution. Inequality (b) follows from Assumption \ref{assumption_3} while equation (c) holds because $\boldsymbol{\mu}_1$ is a distribution.
	
	
	\section{Proof of Lemma \ref{lemma_3}}
	\label{proof_lemma_3}
	
	The following inequalities hold true.
	\begin{align*}
		&|r^{\mathrm{MF}}(\boldsymbol{\mu}_1, \pi) - r^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)|_1 \\
		&= \Bigg|\sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}r(x, u, \boldsymbol{\mu}_1, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi))\pi(x, \boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)\\
		&-\sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}r(x, u, \boldsymbol{\mu}_2, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi))\pi(x,\boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\Bigg|_1\\
		&\leq J_1 + J_2 
	\end{align*}
	where the term $J_1$ is given as follows.
	
	\begin{align*}
		&J_1\triangleq \sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)\\
		&\times \Big| r(x, u, \boldsymbol{\mu}_1, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi)) - r(x, u, \boldsymbol{\mu}_2, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)) \Big|\\
		&\overset{(a)}{\leq} \sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)\\
		&\times L_R\Big\{|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1 + |\nu^{\mathrm{MF}}(\boldsymbol{\mu}_1,\pi)-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi)|_1\Big\}\\
		&\overset{(b)}{\leq} L_R(2+L_Q)|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1
	\end{align*}
	
	Inequality (a) follows from Corollary \ref{corollary_1}(b) whereas (b) uses Lemma \ref{lemma_1} and the fact that $\boldsymbol{\mu}_1$, $\pi(x,\boldsymbol{\mu}_1)$ are distributions. The term $J_2$ is given as follows.
	\begin{align*}
		&J_2\triangleq \sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}} \left| r(x, u, \boldsymbol{\mu}_2,\nu^{\mathrm{MF}}(\boldsymbol{\mu}_2,\pi))\right|\\
		&\times \Big|\pi(x, \boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)-\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\Big|\\
		&\overset{(a)}{\leq}M_R\sum_{x\in\mathcal{X}}\sum_{u\in\mathcal{U}}  \Big|\pi(x, \boldsymbol{\mu}_1)(u)\boldsymbol{\mu}_1(x)-\pi(x, \boldsymbol{\mu}_2)(u)\boldsymbol{\mu}_2(x)\Big|\\
		&\leq M_R\sum_{x\in \mathcal{X}}\boldsymbol{\mu}_1(x)\sum_{u\in\mathcal{U}}|\pi(x,\boldsymbol{\mu}_1)(u)-\pi(x,\boldsymbol{\mu}_2)(u)|\\
		&+ M_R\sum_{x\in \mathcal{X}}|\boldsymbol{\mu}_1(x)-\boldsymbol{\mu}_2(x)|\sum_{u\in\mathcal{U}}\pi(x,\boldsymbol{\mu}_2)(u)\\
		&\overset{(b)}{\leq} M_RL_Q|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\sum_{x\in \mathcal{X}}\boldsymbol{\mu}_1(x) + M_R|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1\\
		&\overset{(c)}{=}M_R(1+L_Q)|\boldsymbol{\mu}_1-\boldsymbol{\mu}_2|_1
	\end{align*}
	
	Inequality (a) uses Corollary \ref{corollary_1}(a). Inequality (b) follows from Assumption \ref{assumption_3} while equation (c) holds because $\boldsymbol{\mu}_1$ is a distribution. This concludes the lemma.
	
	\section{Proof of Lemma \ref{lemma_5}}
	\label{proof_lemma_5}
	
	Applying the definitions of $\boldsymbol{\nu}_t^N$ and $\nu^{\mathrm{MF}}$, we can write the following.
	\begin{align}
		\begin{split}
			\mathbb{E}&|\boldsymbol{\nu}_t^N-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)|_1 \\
			&= \sum_{u\in\mathcal{U}} \mathbb{E}|\boldsymbol{\nu}_t^N(u)-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)(u)| \\
			&= \sum_{u\in\mathcal{U}} \mathbb{E}\left|\dfrac{1}{N}\sum_{i=1}^N\delta(u_t^i=u) - \sum_{x\in \mathcal{X}}\pi_t(x,\boldsymbol{\mu}_t^N)(u)\boldsymbol{\mu}_t^N(x)\right|	
		\end{split}
		\label{eq_13}
	\end{align}
	
	Similarly, using the definition of $\boldsymbol{\mu}_t^N$, we get,
	\begin{align}
		\begin{split}
			&\sum_{x\in\mathcal{X}}\pi_t(x,\boldsymbol{\mu}_t^N)(u)\boldsymbol{\mu}_t^N(x)\\
			&=\sum_{x\in \mathcal{X}}\pi_t(x,\boldsymbol{\mu}_t^N)(u)\dfrac{1}{N}\sum_{i=1}^N\delta(x_t^i=x)\\
			&=\dfrac{1}{N} \sum_{i=1}^N \sum_{x\in \mathcal{X}}\pi_t(x,\boldsymbol{\mu}_t^N)(u)\delta(x_t^i=x)\\
			&=\dfrac{1}{N}\sum_{i=1}^N \pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)
		\end{split}
	\end{align}
	
	Substituting into $(\ref{eq_13})$, we obtain the following.
	\begin{align*}	\mathbb{E}&|\boldsymbol{\nu}_t^N-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)|_1\\
		& = \dfrac{1}{N}\sum_{u\in\mathcal{U}} \mathbb{E} \left|\sum_{i=1}^N \left[\delta(u_t^i=u) - \pi_t (x_t^i, \boldsymbol{\mu}_t^N)(u)\right]\right|\\
		&\overset{(a)}{\leq} \dfrac{\sqrt{|\mathcal{U}|}}{\sqrt{N}}
	\end{align*}
	
	Inequality (a) is a consequence of Lemma \ref{lemma_4}. Particularly, we use the fact that $\forall u\in \mathcal{U}$, the random variables $\{\delta(u_t^i=u)\}_{i\in [N]}$ lie in $[0, 1]$, are conditionally independent given $\boldsymbol{x}_t\triangleq\{x_t^i\}_{i\in [N]}$ (thereby given $\boldsymbol{\mu}_t^N$), and satisfy the following constraints. 
	\begin{align*}
		\mathbb{E}\left[\delta(u_t^i=u)|\boldsymbol{x}_t\right] &= \pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)\\
		\sum_{u\in\mathcal{U}} \mathbb{E}\left[\delta(u_t^i=u)|\boldsymbol{x}_t\right] &=1, ~\forall i\in [N] 
	\end{align*}
	
	
	
	\section{Proof of Lemma \ref{lemma_6}}
	\label{proof_lemma_6}
	
	Using the definition of $P^{\mathrm{MF}}$, we get the following.
	\begin{align*}
		&P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t) \\
		&= \sum_{x\in \mathcal{X}}\sum_{u\in\mathcal{U}}P(x, u, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\pi_t(x, \boldsymbol{\mu}_t^N)(u)\boldsymbol{\mu}_t^N(x)\\
		&=\sum_{x\in \mathcal{X}}\sum_{u\in\mathcal{U}}P(x, u, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\pi_t(x, \boldsymbol{\mu}_t^N)(u)\\
		&\hspace{2cm}\times \dfrac{1}{N}\sum_{i=1}^N\delta(x_t^i=x)\\
		&=\dfrac{1}{N}\sum_{i=1}^N\sum_{u\in\mathcal{U}}P(x_t^i,u,\boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)
	\end{align*}
	
	Using the definition of $L_1$ norm, we can write the following.
	\begin{align*}
		&\mathbb{E}\left|\boldsymbol{\mu}_{t+1}^N-P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)\right|_1\\
		&=\sum_{x\in\mathcal{X}} \mathbb{E}\left|\boldsymbol{\mu}_{t+1}^N(x)-P^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)(x)\right|_1\\
		&=\dfrac{1}{N}\sum_{x\in\mathcal{X}}\mathbb{E}\Bigg| \sum_{i=1}^N \delta(x_{t+1}^i=x) \\
		& - \sum_{i=1}^N\sum_{u\in\mathcal{U}}P(x_t^i,u,\boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u) \Bigg|\\
		&\leq J_1 + J_2 + J_3
	\end{align*}
	
	The first term, $J_1$ is given as follows.
	\begin{align*}
		J_1 &\triangleq \dfrac{1}{N} \sum_{x\in\mathcal{X}} \mathbb{E}\left| \sum_{i=1}^N \left[\delta(x_{t+1}^i=x) - P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N)(x)\right]\right|\\
		&\overset{(a)}{\leq} \dfrac{\sqrt{|\mathcal{X}|}}{\sqrt{N}}
	\end{align*}
	
	Inequality (a) follows from Lemma \ref{lemma_4}. Specifically, we use the fact that, $\forall x\in \mathcal{X}$, the random variables $\{\delta(x_{t+1}^i=x)\}_{i\in[N]}$ lie in $[0, 1]$, are conditionally independent given $\boldsymbol{x}_t\triangleq \{x_t^i\}_{i\in[N]}$, $\boldsymbol{u}_t\triangleq \{u_t^i\}_{i\in [N]}$, (thereby given $\boldsymbol{\mu}_t^N$, $\boldsymbol{\nu}_t^N$) and satisfy the following.
	\begin{align*}
		\mathbb{E}[\delta(x_{t+1}^i=x)|\boldsymbol{x}_t, \boldsymbol{u}_t] &= P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N)(x),\\
		\sum_{x\in \mathcal{X}} 	\mathbb{E}[\delta(x_{t+1}^i=x)|\boldsymbol{x}_t, \boldsymbol{u}_t] &= 1, ~\forall i \in [N]
	\end{align*}
	
	The second term $J_2$ can be expressed as follows.
	\begin{align*}
		&J_2 \triangleq \dfrac{1}{N}\sum_{x\in \mathcal{X}}\mathbb{E}\Big|\sum_{i=1}^N P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N)(x) \\
		&- P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)\Big|\\
		&\leq \dfrac{1}{N}\sum_{i=1}^N \mathbb{E} \Big|P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \boldsymbol{\nu}_t^N)\\
		&- P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\Big|_1\\
		&\overset{(a)}{\leq} L_P\mathbb{E}|\boldsymbol{\nu}_t^N-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)|_1 \overset{(b)}{\leq} L_P\dfrac{\sqrt{|\mathcal{U}|}}{\sqrt{N}}
	\end{align*}
	
	Inequality (a) follows from Assumption \ref{assumption_1} whereas (b) results from Lemma \ref{lemma_5}. Finally, the term $J_3$ is defined as follows.
	\begin{align*}
		& J_3\triangleq \dfrac{1}{N}\sum_{x\in \mathcal{X}} \mathbb{E} \Bigg|\sum_{i=1}^N  P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)\\
		&-\sum_{i=1}^N\sum_{u\in\mathcal{U}}P(x_t^i,u,\boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)\Bigg|\\
		&\overset{(a)}{\leq} \dfrac{\sqrt{|\mathcal{X}|}}{\sqrt{N}}
	\end{align*}
	
	Relation (a) results from Lemma \ref{lemma_4}. Particularly we use the fact that $\forall x\in \mathcal{X}$,  $\{P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)\}_{i\in[N]}$ lie in the interval $[0, 1]$, are conditionally independent given $\boldsymbol{x}_t\triangleq\{x_t^i\}_{i\in[N]}$ (therefore, given $\boldsymbol{\mu}_t^N$), and satisfy the following constraints.
	\begin{align*}
		&\mathbb{E}[P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)|\boldsymbol{x}_t] \\
		& = \sum_{u\in\mathcal{U}} P(x_t^i, u, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x) \pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u), \\
		&\text{and }\sum_{x\in \mathcal{X}} \mathbb{E}[P(x_t^i, u_t^i, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))(x)|\boldsymbol{x}_t] = 1
	\end{align*}
	
	This concludes the Lemma.
	
	\section{Proof of Lemma \ref{lemma_7}}
	\label{proof_lemma_7}
	
	
	Note that,
	\begin{align*}
		&r^{\mathrm{MF}}(\boldsymbol{\mu}_t^N,\pi_t) \\
		&= \sum_{x\in \mathcal{X}}\sum_{u\in\mathcal{U}} r(x, u, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\pi_t(x,\boldsymbol{\mu}_t^N)(u)\boldsymbol{\mu}_t^N(x)\\
		&=\sum_{x\in \mathcal{X}}\sum_{u\in\mathcal{U}} r(x, u, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\pi_t(x,\boldsymbol{\mu}_t^N)(u)\\
		&\hspace{2cm}\times\dfrac{1}{N}\sum_{i=1}^N\delta(x_t^i=x)\\
		&=\dfrac{1}{N}\sum_{u\in\mathcal{U}}\sum_{i=1}^Nr(x_t^i, u, \boldsymbol{\mu}_t^N, \nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t))\pi_t(x_t^i,\boldsymbol{\mu}_t^N)(u)\\
		&\overset{(a)}{=}\dfrac{1}{N}\sum_{u\in\mathcal{U}}\sum_{i=1}^N \left[ \boldsymbol{a}^T\boldsymbol{\mu}_t^N + \boldsymbol{b}^T\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t) + f(x_t^i, u)\right]\\
		&\hspace{2cm}\times\pi_t(x_t^i,\boldsymbol{\mu}_t^N)(u)\\
		&\overset{(b)}{=}\boldsymbol{a}^T\boldsymbol{\mu}_t^N + \boldsymbol{b}^T\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t) \\
		&\hspace{1cm}+ \dfrac{1}{N}\sum_{u\in\mathcal{U}}\sum_{i=1}^N f(x_t^i, u)\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)
	\end{align*}
	
	Equality (a) follows from Assumption \ref{assumption_2} while (b) uses the fact that $\pi_t(x_t^i, \boldsymbol{\mu}_t^N)$ is a distribution. On the other hand, 
	\begin{align*}
		&\dfrac{1}{N}\sum_{i=1}^N r(x_t^i, u_t^i, \boldsymbol{\mu}_t^{i,N}, \boldsymbol{\nu}_t^{i,N})\\
		&=\dfrac{1}{N}\sum_{i=1}^N \left[\boldsymbol{a}^T\boldsymbol{\mu}_t^{i,N} + \boldsymbol{b}^T\boldsymbol{\nu}_t^{i,N} + f(x_t^i, u_t^i)\right]\\
		&=\dfrac{1}{N}\sum_{i=1}^N \Bigg[ \sum_{x\in \mathcal{X}} a(x)\boldsymbol{\mu}_t^{i,N}(x) + \sum_{u\in\mathcal{U}} b(u)\boldsymbol{\nu}_t^{i,N}(u)\Bigg]\\
		& + \dfrac{1}{N}\sum_{i=1}^N f(x_t^i, u_t^i)
	\end{align*}
	
	Now the first term can be simplified as follows.
	\begin{align*}
		&\dfrac{1}{N}\sum_{x\in \mathcal{X}} a(x) \sum_{i=1}^N \sum_{j=1}^N W(i,j)\delta(x_t^j=x)\\
		&=\dfrac{1}{N}\sum_{x\in \mathcal{X}} a(x) \sum_{j=1}^N \delta(x_t^j=x) \sum_{i=1}^N W(i, j)\\
		&\overset{(a)}{=}\sum_{x\in \mathcal{X}} a(x) \dfrac{1}{N}\sum_{j=1}^N \delta(x_t^j=x) = \boldsymbol{a}^T\boldsymbol{\mu}_t^N
	\end{align*}
	
	Equality (a) follows as $W$ is doubly-stochastic (Assumption \ref{assumption_4}). Similarly, the second term can be simplified as shown below.
	\begin{align*}
		&\dfrac{1}{N}\sum_{u\in \mathcal{U}} b(u) \sum_{i=1}^N \sum_{j=1}^N W(i,j)\delta(u_t^j=u)\\
		&=\dfrac{1}{N}\sum_{u\in \mathcal{U}} b(u) \sum_{j=1}^N \delta(u_t^j=u) \sum_{i=1}^N W(i, j)\\
		&\overset{(a)}{=}\sum_{u\in \mathcal{U}} b(u) \dfrac{1}{N}\sum_{j=1}^N \delta(u_t^j=u) = \boldsymbol{b}^T\boldsymbol{\nu}_t^N
	\end{align*}
	
	Equality (a) follows from Assumption \ref{assumption_4}. Therefore, we get,
	\begin{align*}
		&\mathbb{E}\left|\dfrac{1}{N}\sum_{i=1}^N r(x_t^i, u_t^i, \boldsymbol{\mu}_t^{i,N}, \boldsymbol{\nu}_t^{i,N}) - r^{\mathrm{MF}}(\boldsymbol{\mu}_t^N,\pi_t)\right|\\
		&\leq |\boldsymbol{b}|_1 \mathbb{E}|\boldsymbol{\nu}_t^N-\nu^{\mathrm{MF}}(\boldsymbol{\mu}_t^N, \pi_t)|_1 \\
		&+ \dfrac{1}{N} \mathbb{E}\left|\sum_{i=1}^N f(x_t^i, u_t^i) - \sum_{i=1}^N\sum_{u\in \mathcal{U}}f(x_t^i, u)\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)\right|
	\end{align*}
	
	Using Lemma \ref{lemma_5}, the first term can be upper bounded by $|\boldsymbol{b}|_1\sqrt{|\mathcal{U}|/N}$. The second term can be bounded as follows.
	\begin{align*}
		&\dfrac{1}{N} \mathbb{E}\left|\sum_{i=1}^N f(x_t^i, u_t^i) - \sum_{i=1}^N\sum_{u\in \mathcal{U}}f(x_t^i, u)\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)\right|\\
		&\leq\dfrac{1}{N} \sum_{u\in \mathcal{U}}\mathbb{E}\left|\sum_{i=1}^Nf(x_t^i, u)\left[\delta(u_t^i=u)-\pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u)\right]\right|\\
		&\overset{(a)}{\leq} M_F \dfrac{\sqrt{|\mathcal{U}|}}{\sqrt{N}}
	\end{align*}
	
	The term $M_F>0$ is such that $|f(x,u)|\leq M_F$, $\forall x\in\mathcal{X}$, $\forall u\in \mathcal{U}$. Such $M_F$ always exists since $\mathcal{X}$, and $\mathcal{U}$ are finite. Inequality (a) is a result of Lemma \ref{lemma_4}. In particular, we use the following facts to prove this result. The random variables $\{\delta(u_t^i=u)\}_{i\in [N]}$ are conditionally independent given $\boldsymbol{x}_t\triangleq\{x_t^i\}_{i\in[N]}$ (therefore, given $\boldsymbol{\mu}_t^N$), $\forall u\in \mathcal{U}$ and they lie in the interval $[0,1]$. Moreover,
	\begin{align*}
		&|f(x_t^i, u)| \leq M_F, \forall i\in [N], \forall u\in \mathcal{U},\\
		& \mathbb{E}[\delta(u_t^i=u)|\boldsymbol{x}_t] = \pi_t(x_t^i, \boldsymbol{\mu}_t^N)(u),\\
		&\sum_{u\in\mathcal{U}}\mathbb{E}[\delta(u_t^i=u)|\boldsymbol{x}_t] = 1
	\end{align*}
	
	
	
	
	\section{Sampling Procedure}
	\label{sampling_process}
	
	\begin{algorithm}
		\caption{Sampling Algorithm}
		\label{algo_2}
		\textbf{Input:} $\boldsymbol{\mu}_0$, $\boldsymbol{\pi}_{\Phi_j}$,
		$P$, $r$
		\begin{algorithmic}[1]
			\STATE Sample $x_0\sim \boldsymbol{\mu}_0$. 
			\STATE Sample $u_0\sim {\pi}_{\Phi_j}(x_0,\boldsymbol{\mu}_0)$ 
			\STATE  $\boldsymbol{\nu}_0\gets\nu^{\mathrm{MF}}(\boldsymbol{\mu}_0,\pi_{\Phi_j})$ where $\nu^{\mathrm{MF}}$ is defined in $(\ref{nu_MF})$.
			\STATE $t\gets 0$ 
			\STATE $\mathrm{FLAG}\gets \mathrm{FALSE}$
			\WHILE{$\mathrm{FLAG~is~} \mathrm{FALSE}$}
			{
				\STATE $\mathrm{FLAG}\gets \mathrm{TRUE}$ with probability $1-\gamma$.
				\STATE Execute $\mathrm{Update}$
				
			}
			\ENDWHILE
			
			\STATE $T\gets t$
			
			\STATE Accept   $(x_T,\boldsymbol{\mu}_T,u_T)$ as a sample. 
			
			\STATE $\hat{V}_{\Phi_j}\gets 0$, $\hat{Q}_{\Phi_j}\gets 0$
			
			\STATE $\mathrm{FLAG}\gets \mathrm{FALSE}$
			\STATE $\mathrm{SumRewards}\gets 0$
			\WHILE{$\mathrm{FLAG~is~} \mathrm{FALSE}$}
			{
				\STATE $\mathrm{FLAG}\gets \mathrm{TRUE}$ with probability $1-\gamma$.
				\STATE Execute $\mathrm{Update}$
				\STATE $\mathrm{SumRewards}\gets \mathrm{SumRewards} + r(x_t,u_t,\boldsymbol{\mu}_t,\boldsymbol{\nu}_t)$
				
			}
			\ENDWHILE
			
			\STATE With probability $\frac{1}{2}$, $\hat{V}_{\Phi_j}\gets \mathrm{SumRewards}$. Otherwise $\hat{Q}_{\Phi_j}\gets \mathrm{SumRewards}$.  
			
			\STATE $\hat{A}_{\Phi_j}(x_T,\boldsymbol{\mu}_T,u_T)\gets 2(\hat{Q}_{\Phi_j}-\hat{V}_{\Phi_j})$.
			
		\end{algorithmic} 
		\textbf{Output}: $(x_T,\boldsymbol{\mu}_T,u_T)$ and $\hat{A}_{\Phi_j}(x_T,\boldsymbol{\mu}_T,u_T)$
		
		
		\textbf{Procedure} $\mathrm{Update}$:
		
		\begin{algorithmic}[1]
			\STATE $x_{t+1}\sim P(x_t,u_t,\boldsymbol{\mu}_t,\boldsymbol{\nu}_t)$.
			\STATE  $\boldsymbol{\mu}_{t+1}\gets P^{\mathrm{MF}}(\boldsymbol{\mu}_t,\pi_{\Phi_j})$ where $P^{\mathrm{MF}}$ is defined in $(\ref{mu_t_plus_1})$.
			\STATE  $u_{t+1}\sim {\pi}_{\Phi_j}(x_{t+1},\boldsymbol{\mu}_{t+1})$
			\STATE $\boldsymbol{\nu}_{t+1}\gets\nu^{\mathrm{MF}}(\boldsymbol{\mu}_{t+1},\pi_{\Phi_j})$
			\STATE $t\gets t+1$
		\end{algorithmic}
		\textbf{EndProcedure}
	\end{algorithm}
	
	% NOTE: necessary when ptmx or no mathfont class option is given
	\providecommand{\upGamma}{\Gamma}
	\providecommand{\uppi}{\pi}
	
	\bibliography{mondal_582}
	
\end{document}
