\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{mymacros}
\usepackage[inline]{enumitem}
\usepackage{wrapfig}
\usepackage[]{algorithm2e}
\usepackage{booktabs}
\usepackage{url}
\usepackage{caption}
\usepackage{hyperref}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{metelli_688}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\allowdisplaybreaks[4]

\title{On the Relation between Policy Improvement \\ and Off-Policy Minimum-Variance Policy Evaluation \\ (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Add authors
\author[1]{\href{mailto:albertomaria.metelli@polimi.it}{Alberto Maria Metelli}{}}
\author[1]{\href{mailto:samuele.meta@mail.polimi.it}{Samuele Meta}{}}
\author[1]{\href{mailto:marcello.restelli@polimi.it}{Marcello Restelli}{}}
% Add affiliations after the authors
\affil[1]{%
    Dipartimento di Elettronica, Informazione e Bioingegneria\\
    Politecnico di Milano\\
    Milan, Italy
}
   
\newcommand{\algname}{MBPExPI\@\xspace}
\newcommand{\algnameext}{Minimum-Variance Policy Evaluation for Policy Improvement\@\xspace}
\newcommand{\alphaInterval}{[0,\infty]}%(0,1) \cup (1,\infty)}

\newcommand{\qqa}{\textbf{\textcolor{vibrantBlue}{(Q1)}}\@\xspace}
\newcommand{\qqb}{\textbf{\textcolor{vibrantTeal}{(Q2)}}\@\xspace}
\newcommand{\qqc}{\textbf{\textcolor{vibrantRed}{(Q3)}}\@\xspace}

\newcommand{\parref}[1]{(\ref{#1})}


\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

\definecolor{revisionColor}{rgb}{0.6, 0.4, 0.8}

\newcommand{\vvv}{\textcolor{black}{\bm{\xi}}}
\newcommand{\Cvvv}{\textcolor{black}{\bm{\Xi}}}

  
  \input{math_commands}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Proofs and Derivations}\label{apx:proofs}
In this appendix, we report the proofs and derivations that we have omitted from the main paper.


\subsection{Proofs of Section~\ref{sec:optRepOpt}}


\textbf{Proof of Proposition~\ref{prop:propIncreasing}}
\begin{proof}
Let us consider the following derivation:
\begin{align*}
\E_{x \sim \mathcal{I}_{h \circ f}[P]}[f(x)] - \E_{x \sim P}[f(x)] & = \int_{\Xs} \frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} f(x) \de x -  \E_{x \sim P}[f(x)] \\
& = \frac{\E_{x \sim P}[h(f(x))f(x)] -  \E_{x \sim P}[f(x)] \E_{x \sim P}[h(f(x))] }{\E_{x \sim P}[h(f(x))]} \\
& = \frac{\Cov_{x \sim P}[h(f(x)),f(x)]}{\E_{x \sim P}[h(f(x))]},
\end{align*}
where we have exploited the definition of $\mathcal{I}_{h \circ f}$ and the definition of covariance. The result is obtained by recalling that $h$ is increasing and the covariance between two increasing functions of the same random variable (\ie $h$ and the identity function)  is non-negative~\citep{cuadras2002covariance}.
\end{proof}

\textbf{Proof of Theorem~\ref{thr:convergence}}
\begin{proof}
We are going to actually prove a more general statement in which we consider a non-negative monotonically increasing function $h$ that is composed with function $f$, \ie $h \circ f$. The theorem statement can be obtained by setting $h$ to be the identity function.

	We start with (i). First of all, we observe that since $h$ is monotonically strictly-increasing it holds that $\Var_{x \sim P}[f(x)] = 0$ if and only if $\Var_{x \sim P}[h(f(x))] = 0$. $P$ is a fixed point of $\mathcal{I}_{h \circ f}$, \ie $P = \mathcal{I}_{h \circ f}[P]$ a.s. if and only if for all $x \in \Xs$ it holds a.s.:
	\begin{align*}
		p(x) = \frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]},
	\end{align*}
	that occurs if and only if either $p(x) = 0$ ($x \not\in \supp(P)$) or $h(f(x)) = \E_{x \sim P}[h(f(x))]$. ($\Rightarrow$) Whenever $p(x)$ is not zero, function $h(f(x))$ is a constant in $\supp(P)$ and, consequently, its variance under $P$ is zero. ($\Leftarrow$) Suppose that $\Var_{x \sim P}[h(f(x))]=0$, then $h(f(x)) = \E_{x \sim P}[h(f(x))]$ almost surely and, consequently $\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} = p(x)$ almost surely.
	Let us now consider (ii). First of all, we can easily observe that for every $k \in \Nat$:
	\begin{align*}
		\left( \mathcal{I}_{h \circ f}\right)^k[P](x) = \frac{p(x) f(x)^k}{\E_{x \sim P}[f(x)^k]}.
	\end{align*}
	Let $f^* = \max_{x \in \supp(P)} \{f(x)\}$, consider the function $g_k(x) = p(x) \left(\frac{f(x)}{f^*}\right)^k$ and the limit:
	\begin{align*}
		\lim_{k \rightarrow \infty} g_k(x) = \lim_{k \rightarrow \infty} p(x) \left(\frac{f(x)}{f^*}\right)^k = \begin{cases}
			p(x) & \text{if } x \in \Xs^* \\
			0 & \text{otherwise}
		\end{cases}.
	\end{align*}
	Thus, we have:
	\begin{align*}
	q_{\infty}(x) &= \lim_{k \rightarrow \infty}\left( \mathcal{I}_{h \circ f}\right)^k[P](x) = \lim_{k \rightarrow \infty} \frac{p(x) f(x)^k }{\int_{\Xs} p(x) f(x)^k \de x} \\
	& = \lim_{k \rightarrow \infty} \frac{g_k(x) }{\int_{\Xs} g_k(x) \de x} = \begin{cases}
			\frac{p(x)}{\int_{\Xs^*} p(x) \de x} & \text{if } x \in \Xs^* \\
			0 & \text{otherwise}
		\end{cases}.
	\end{align*}
	Thus, the support of $Q_{\infty}$ is given by $\Xs^*$. Consequently, the expectation of $f$ under $Q_{\infty}$ is given by:
	\begin{align*}
		\E_{x \sim Q_{\infty}} [f(x)] = \int_{\Xs} q_{\infty}(x) f(x) \de x = f^*.
	\end{align*}
\end{proof}


\textbf{Proof of Theorem~\ref{thr:trustRegion}}
\begin{proof}
We are going to actually prove a more general statement in which we consider a non-negative monotonically increasing function $h$ that is composed with function $f$, \ie $h \circ f$. The theorem statement can be obtained by setting $h$ to be the identity function.

	Let us consider the following derivation:
	\begin{align*}
		J := \int_{\Xs} \left(\left(\mathcal{I}_{h \circ f}[P]\right)(x)\right)^\alpha p(x)^{1-\alpha} \de x & = \int_{\Xs} \left(\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]}\right)^\alpha p(x)^{1-\alpha} \de x \\
		& = \frac{ \E_{x \sim P}[h(f(x))^\alpha]}{\E_{x \sim P}[h(f(x))]^\alpha}.
	\end{align*}
	By observing that $D_{\alpha}\left(\mathcal{I}_{h \circ f}[P] \| P\right) = \frac{1}{\alpha-1}\log J$, we obtain the result. For $\alpha=1$, we provide an independent derivation:
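	\begin{align*}
		D_{\text{KL}}(\mathcal{I}_{h \circ f}[P] \| P) & = \int_{\Xs} \frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} \log \frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))] p(x)} \de x\\
		& = \frac{\E_{x \sim P}[h(f(x)) \log h(f(x))]- \E_{x \sim P}[h(f(x))] \log \E_{x\sim P}[h(f(x))]}{\E_{x \sim P}[h(f(x))]} \\
		& \le \frac{\E_{x \sim P}[h(f(x)) \log h(f(x))]- \E_{x \sim P}[h(f(x))] \E_{x\sim P}[\log h(f(x))]}{\E_{x \sim P}[h(f(x))]} \\
		& = \frac{\Cov_{x \sim P}[h(f(x)), \log h(f(x))]}{\E_{x \sim P}[h(f(x))]},
	\end{align*}
	where the inequality follows from Jensen's inequality, \ie $\E_{x\sim P}[\log h(f(x))] \le \log \E_{x\sim P}[h(f(x))]$, and we exploited the definition of covariance in the last line.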
\end{proof}

\subsection{Proofs of Section~\ref{sec:constrPolicySpace}}

\textbf{Proof of Proposition~\ref{prop:boundMoment}}
\begin{proof}
We are going to actually prove a more general statement in which we consider a non-negative monotonically increasing function $h$ that is composed with function $f$, \ie $h \circ f$. The proposition statement can be obtained by setting $h$ to be the identity function.

	First of all, we observe that since $\E_{x \sim Q}\left[ \frac{p(x)}{q(x)}h(f(x)) \right] = \E_{x \sim P}[h(f(x))]$, for $\alpha \in [2,+\infty)$, the absolute central $\alpha$-moment is smaller than or equal to the (non-central) $\alpha$-moment. Thus, for $\alpha \in [2,+\infty)$, we have:
	\begin{align*}
	\E_{x \sim Q}& \left[ \left| \frac{p(x)}{q(x)} h(f(x)) - \E_{x \sim P} [h(f(x))] \right|^\alpha \right]  \le \E_{x \sim Q}\left[ \left( \frac{p(x)}{q(x)} h(f(x)) \right)^\alpha \right] \\
	& =  \int_{\Xs} \left(\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} \right)^\alpha q(x)^{1-\alpha} \de x \E_{x \sim P}[h(f(x))]^\alpha \\
	& = \int_{\Xs} \left((\mathcal{I}_{h \circ f}[P])(x) \right)^\alpha q(x)^{1-\alpha} \de x \E_{x \sim P}[h(f(x))]^\alpha \\
	&  = \exp \left\{(\alpha-1) \frac{1}{\alpha-1} \log \int_{\Xs} \left((\mathcal{I}_{h \circ f}[P])(x) \right)^\alpha q(x)^{1-\alpha} \de x \right\} \E_{x \sim P}[h(f(x))]^\alpha,
	\end{align*}
	where the first inequality follows from Lemma~\ref{lemma:tecLemma} with $y = \left(\frac{p(x)}{q(x)} h(f(x))\right) / \E_{x \sim P} [h(f(x))]$.
	By applying the definition of \Renyi divergences, we get the result.
\end{proof}

\textbf{Proof of Theorem~\ref{thr:thrImprovementConstrained}}
\begin{proof}
Let us consider the following derivation:
\begin{align}
	\E_{x \sim Q}& [h(f(x))^\alpha]  = \int_{\Xs} q(x) h(f(x))^\alpha \de x \notag \\
	& = \int_{\Xs} p(x) \frac{q(x)}{p(x)} h(f(x))^\alpha \de x \notag\\
	& = \int_{\Xs} p(x) h(f(x))^\alpha \de x + \int_{\Xs} p(x) \left(\frac{q(x)}{p(x)} - 1\right) h(f(x))^\alpha \de x \notag\\
	& \ge  \int_{\Xs} p(x) h(f(x))^\alpha \de x + \frac{1}{\alpha-1}\int_{\Xs} p(x) \left(1 - \left(\frac{p(x)}{q(x)}\right)^{\alpha-1} \right)h(f(x))^\alpha \de x \label{p:11001} \\
	& = \E_{x \sim P}[h(f(x))^\alpha] + \frac{1}{\alpha-1}\int_{\Xs} p(x) h(f(x))^\alpha \de x \notag\\
	& \quad - \frac{1}{\alpha-1}\int_{\Xs} p(x) \left(\frac{p(x)}{q(x)}\right)^{\alpha-1} h(f(x))^\alpha \de x \notag \\
	& =  \E_{x \sim P}[h(f(x))^\alpha] +  \E_{x \sim P}[h(f(x))]^\alpha \frac{1}{\alpha-1}\int_{\Xs} \left(\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} \right)^\alpha p(x)^{1-\alpha}\de x \notag\\
	& \quad - \E_{x \sim P}[h(f(x))]^\alpha \frac{1}{\alpha-1}\int_{\Xs} \left(\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} \right)^\alpha q(x)^{1-\alpha} \de x \notag \\
		& =  \E_{x \sim P}[h(f(x))^\alpha]  \notag\\
		& \quad + \E_{x \sim P}[h(f(x))]^\alpha \frac{1}{\alpha-1} \exp \left\{ (\alpha-1) \frac{1}{\alpha-1} \log \int_{\Xs} \left(\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} \right)^\alpha p(x)^{1-\alpha}\de x \notag \right\}\\
	 & \quad - \E_{x \sim P}[h(f(x))]^\alpha \frac{1}{\alpha-1} \exp \left\{ (\alpha-1) \frac{1}{\alpha-1} \log \int_{\Xs} \left(\frac{p(x) h(f(x))}{\E_{x \sim P}[h(f(x))]} \right)^\alpha q(x)^{1-\alpha} \de x \right\} \notag \\
	& =  \E_{x \sim P}[h(f(x))^\alpha] +  \frac{\E_{x \sim P}[h(f(x))]^\alpha}{\alpha-1} \left(e^{(\alpha-1){D_{\alpha}(\mathcal{I}_{h \circ f}[P] \| P)}} -e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{h \circ f}[P] \|Q)} \right),\notag
\end{align}
where line~\parref{p:11001} is derived from Lemma~\ref{lemma:technical}. The second inequality was provided in Proposition 6 of~\citet{ghosh2020operator}.
\end{proof}

\textbf{Proof of Theorem~\ref{thr:convergenceConstr}}
\begin{proof}
We are going to actually prove a more general statement in which we consider a non-negative monotonically increasing function $h$ that is composed with function $f$, \ie $h \circ f$. The theorem statement can be obtained by setting $h$ to be the identity function.

	Let us consider the sequence of distributions $(Q_{k})_{k \in \Nat}$, generated by the iterate in Equation~\parref{eq:probMomentProjection}, where possible ties are broken with an arbitrary tie-breaking rule $T_{k}$ (possibly different for every $k$). From Theorem~\ref{thr:thrImprovementConstrained}, we have for every $k \in \Nat$:
	\begin{align*}
		\E_{x \sim Q_{k+1}}\left[h(f(x))^\alpha \right] & -  \E_{x \sim Q_k}[h(f(x))^\alpha] \\
		& \ge \frac{\E_{x \sim Q_{k}}[h(f(x))]^\alpha}{\alpha-1} \left( e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{h \circ f} [Q_k]\| Q_k)} - e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \|Q_{k+1}) } \right) \ge 0,
	\end{align*}
	where we simply exploited that $Q_{k+1} \in \argmin_{Q \in \mathcal{Q}} \left\{D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \|Q)\right\} $. Thus, $\E_{x \sim Q_{k}}\left[h(f(x))^\alpha \right]$ is a non-decreasing function of $k$. Since $h \circ f$ is bounded, it must be that $\lim_{k \rightarrow \infty} \E_{x \sim Q_{k}}\left[h(f(x))^\alpha \right] = \mu_{\infty} < \infty$, which proves convergence.\footnote{Notice that the improvement holds also for $\alpha < 1$. Indeed, while it is true that $\frac{\E_{x \sim Q_{k}}[h(f(x))]^\alpha}{\alpha-1} < 0$, in such a case function $e^{(\alpha-1) (\cdot)}$ is decreasing in its argument.}
	
	Furthermore, being convergent, for $k\rightarrow \infty$ it must be that $\E_{x \sim Q_{k}}\left[h(f(x))^\alpha \right] = \E_{x \sim Q_{k+1}}\left[h(f(x))^\alpha \right]$ and consequently $D_{\alpha}(\mathcal{I}_{h \circ f} [Q_k]\| Q_k) = D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \|Q_{k+1})$. Therefore, even if the tie-breaking rule prescribes to select $Q_{k+1} \neq Q_k$ we could select $ Q_k$ instead, since it leads to the same divergence value. Consequently, being $ Q_k$ a solution, we can assert that it is a stationary point of the function $D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \| \cdot)$ (as well as $Q_{k+1}$):
	\begin{align*}
		 0 & = \nabla_{q(\cdot)} D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \| Q)\rvert_{Q=Q_k} \\
		 &  = \frac{1}{(\alpha-1) e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \| Q)} \E_{x \sim Q_k}[h(f(x))]^\alpha} \nabla_{q(\cdot)}  \int_{\Xs} h(f(x))^\alpha q_k(x)^\alpha q(x)^{1-\alpha} \de x \rvert_{Q=Q_k} \\
		 & = -\frac{1}{e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \| Q)} \E_{x \sim Q_k}[h(f(x))]^\alpha}  \int_{\Xs} h(f(x))^\alpha q_k(x)^\alpha q(x)^{-\alpha} \de x\rvert_{Q=Q_k} \\
		 & = -\frac{1}{e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{h \circ f}[Q_k] \| Q_k)} \E_{x \sim Q_k}[h(f(x))]^\alpha}  \int_{\Xs} h(f(x))^\alpha  \de x.
	\end{align*}
	We observe that the latter expression is zero if and only if the gradient of $\E_{x \sim Q}[h(f(x))^\alpha]$ \wrt $Q$ is zero. Indeed:
	\begin{align*}
		\nabla_{q(\cdot)}\E_{x \sim Q}[h(f(x))^\alpha] = \int_{\Xs} h(f(x))^\alpha  \de x.
	\end{align*}
	Thus, the process converges to a stationary point of $\E_{x \sim Q_k}[h(f(x))^\alpha]$.
\end{proof}

\textbf{Proof of Theorem~\ref{thr:thrImplicitTrustConstr}}
\begin{proof}
	The proof is a simple application of Lemma~\ref{lemma:convexBound}, by taking $Q \leftarrow P$, $Q^* \leftarrow Q^{\dagger}$, and  $P\leftarrow\mathcal{I}_{ f}[P]$.
\end{proof}


\subsection{Proofs of Section~\ref{sec:sampleBasedOptimization}}
\textbf{Proof of Theorem~\ref{thr:thrConcentration}}

\begin{proof}
	We start observing that each addendum of $\widehat{d}_{\alpha}\left( \mathcal{I}_{ f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right)$ is non-negative. Since all terms are i.i.d., we can apply unilateral Bernstein's inequality that allows achieving an exponential concentration. Thus, for every $\delta \in (0,1)$, with probability at least $1-\delta$ it holds that:
	\begin{align*}
		\E_{x \sim Q_{\textcolor{red}{\vvv}} } \left[ \left( \frac{q_{\textcolor{blue}{\vvv_{i}}}(x)}{q_{\textcolor{red}{\vvv}}(x)} f(x) \right)^\alpha \right] & \le  \widehat{d}_{\alpha}\left( \mathcal{I}_{f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right) \\
		& \quad + \sqrt{2 \Var_{x_i \sim \Phi_{i,j}}\left[ \widehat{d}_{\alpha}\left( \mathcal{I}_{f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right) \right] \log \frac{1}{\delta}}.
	\end{align*}
	Thus, it remains to provide a bound on the variance term. We exploit the fact that $f(x) \le \overline{m}$ and that each addendum represents an i.i.d. random variable:
	\begin{align*}
	\Var_{x_i \sim \Phi_{i,j}} & \left[ \widehat{d}_{\alpha}\left( \mathcal{I}_{f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right) \right]  \\
	& \le  \frac{1}{(nj)^2} \sum_{k\in [j]} \sum_{l \in[n]}  \E_{x_{k,l} \sim \Phi_{i,j}} \left[\left( \frac{q_{\textcolor{blue}{\vvv_i}}(x_{k,l})^\alpha}{\Phi_{i,j}(x_{k,l}) q_{\textcolor{red}{\vvv}}(x_{k,l})^{\alpha-1}}  f(x_{k,l})^\alpha\right)^2 \right] \\
	& \le \frac{{\overline{m}}^{2\alpha}}{(nj)^2} \sum_{k\in [j]} \sum_{l \in[n]}  \E_{x_{k,l} \sim \Phi_{i,j}} \left[\left( \frac{q_{\textcolor{blue}{\vvv_i}}(x_{k,l})^\alpha}{\Phi_{i,j}(x_{k,l}) q_{\textcolor{red}{\vvv}}(x_{k,l})^{\alpha-1}}  \right)^2 \right]\\
	& = \frac{{\overline{m}}^{2\alpha}}{nj}  \E_{x\sim \Phi_{i,j}} \left[\left( \frac{q_{\textcolor{blue}{\vvv_i}}(x)^\alpha}{\Phi_{i,j}(x) q_{\textcolor{red}{\vvv}}(x)^{\alpha-1}}  \right)^2 \right].
	\end{align*}
\end{proof}

\subsection{Technical Lemmas}

\begin{lemma}\label{lemma:tecLemma}
Let $\alpha \in [2,+\infty)$ and let $y$ be a non-negative random variable with expectation $1$. Then, it holds that $\E[|y-1|^\alpha]^{1/\alpha} \le  \E[y^\alpha]^{1/\alpha}$.
\end{lemma}

\begin{proof}
	When $y \ge 0$ and $\alpha \in [2,+\infty)$, it holds that $y^\alpha - |y-1|^\alpha \ge y-1$. Consequently, we have:
	\begin{align*}
	\E[|y-1|^\alpha] \le \E[y^\alpha-y+1] \le \E[y^\alpha].
	\end{align*}
\end{proof}

\begin{lemma}\label{lemma:technical}
	For every $x \ge 0$ and $\alpha \in (0,1) \cup (1, \infty)$, it holds that:
	\begin{align*}
		x-1 \ge \frac{1}{\alpha-1} \left( 1 - \frac{1}{x^{\alpha-1}}\right).
	\end{align*}
	Furthermore, for $\alpha = 1$, it holds that:
	\begin{align*}
		x-1 \ge \log x.
	\end{align*}
\end{lemma}

\begin{proof}
	Consider the auxiliary function $g_\alpha(x)=  x-1 - \frac{1}{\alpha-1} \left( 1 - \frac{1}{x^{\alpha-1}}\right)$. We are going to prove that the minimum of $g_\alpha(x)$ is zero. Suppose $\alpha > 1$, then $g_\alpha(0) = \infty$ and $g_\alpha(\infty) = \infty$. Thus, the minimum must lie in between and since function $g_\alpha$ is differentiable, we have:
	\begin{align*}
		\frac{\partial}{\partial x}g_\alpha(x) = 1-x^{-\alpha} = 0 \quad \implies \quad x = 1. 
	\end{align*}
	Thus, we have $g_\alpha(1) = 0$. Suppose now that $\alpha < 1$, we have $g_{\alpha}(0) = \frac{\alpha}{1-\alpha} > 0$  and $g_{\alpha}(\infty) = \infty$. Thus, again, the minimum must lie in between and with the same calculations as before, we conclude $g_\alpha(1) = 0$. The case $\alpha = 1$ is trivial.
\end{proof}


\begin{lemma}\label{lemma:convexBound}
Let $P \in \PM{\Xs}$ and let $\alpha \in [0,1)$. Let $\mathcal{Q} \subseteq \PM{\Xs}$ be an $(1-\alpha)$-convex~\citep[][Definition 4]{erven2014renyi} subset of distributions. Let $Q^* \in \mathcal{Q}$ be the $\alpha$-moment projection:
\begin{align*}
	Q^* = \argmin_{Q \in \mathcal{Q}} \left\{ D_{\alpha}(P \| Q) \right\}.
\end{align*}
If $Q^*$ exists, then for every $Q \in \mathcal{Q}$ it holds that:
\begin{align*}
	D_{\alpha}(P \| Q) \ge D_{\alpha}(P \| Q^*) + D_{\alpha}(Q^* \| Q).
\end{align*}
\end{lemma}

\begin{proof}
	The proof of the result is inspired by~\citep[][Theorem 14]{erven2014renyi}. Let $\lambda \in [0,1]$ and let us define $Q_{\lambda}$ as the $\left(1-\alpha,(1-\lambda,\lambda)\right)$-mixture of $Q^*$ and $Q$:
	\begin{align*}
		q_\lambda(x) = Z^{-1}_{\lambda} \left((1-\lambda)q^*(x)^{1-\alpha}+\lambda q(x)^{1-\alpha}\right)^{\frac{1}{1-\alpha}},\\
		Z_\lambda = \int_{\Xs} \left((1-\lambda)q^*(x)^{1-\alpha}+\lambda q(x)^{1-\alpha}\right)^{\frac{1}{1-\alpha}} \de x.
	\end{align*}
	Let us first observe that for $\lambda = 0$, we have $Q_0 = Q^*$ and $Z_0 = \int_{\Xs} q^*(x) \de x = 1$. Since $\mathcal{Q}$ is $(1-\alpha)$-convex and $Q^*$ is the minimizer over $\mathcal{Q}$, it holds that $\frac{\partial}{\partial \lambda} D_{\alpha}(P \|Q_{\lambda}) \rvert_{\lambda = 0} \ge 0$. First of all, we compute:
	\begin{align*}
		 \int_{\Xs} p(x)^\alpha q_{\lambda}(x)^{1-\alpha} \de x = Z_\lambda^{\alpha-1} \int_{\Xs} \left[(1-\lambda) p(x)^\alpha q^*(x)^{1-\alpha} + \lambda p(x)^\alpha q(x)^{1-\alpha} \right] \de x,
	\end{align*}
	and the derivative of the normalization constant:
	\begin{align*}
		\frac{\partial}{\partial \lambda}  Z_{\lambda} = \frac{1}{1-\alpha} \int_{\Xs} \left((1-\lambda)q^*(x)^{1-\alpha}+\lambda q(x)^{1-\alpha}\right)^{\frac{\alpha}{1-\alpha}} \left( q(x)^{1-\alpha} - q^*(x)^{1-\alpha} \right) \de x.
	\end{align*}
	The latter, for $\lambda=0$, becomes: $\frac{\partial}{\partial \lambda}  Z_{\lambda}\Big\rvert_{\lambda=0} = \frac{1}{1-\alpha}\left[ \int_{\Xs} q^*(x)^{\alpha} q(x)^{1-\alpha} \de x - 1\right]$. For ease of calculation, instead of directly operating on $D_{\alpha}(P\|Q_{\lambda})$, we consider:
	\begin{align*}
	\frac{\partial}{\partial \lambda} \int_{\Xs} p(x)^\alpha q_{\lambda}(x)^{1-\alpha} \de x & = Z_\lambda^{\alpha-1} \int_{\Xs} \left[- p(x)^\alpha q^*(x)^{1-\alpha} + p(x)^\alpha q(x)^{1-\alpha} \right] \de x \\
	& \quad + (\alpha-1) Z_\lambda^{\alpha-2} \frac{\partial}{\partial \lambda}  Z_{\lambda}\int_{\Xs} \left[(1-\lambda) p(x)^\alpha q^*(x)^{1-\alpha} + \lambda p(x)^\alpha q(x)^{1-\alpha} \right] \de x .
	\end{align*}
	We now evaluate it at $\lambda = 0$:
	\begin{align*}
	\frac{\partial}{\partial \lambda} \int_{\Xs} p(x)^\alpha q_{\lambda}(x)^{1-\alpha} \de x \Big\rvert_{\lambda = 0} & = - \int_{\Xs} p(x)^\alpha q^*(x)^{1-\alpha}  \de x + \int_{\Xs} p(x)^\alpha q(x)^{1-\alpha} \de x \\
	& \quad - \int_{\Xs} p(x)^\alpha q^*(x)^{1-\alpha} \de x \left[ \int_{\Xs} q^*(x)^{\alpha} q(x)^{1-\alpha} \de x -1 \right].
	\end{align*}
	For $\alpha > 1$, we require $\frac{\partial}{\partial \lambda} \int_{\Xs} p(x)^\alpha q_{\lambda}(x)^{1-\alpha} \de x \Big\rvert_{\lambda = 0} \ge 0$, to obtain:
	\begin{align*}
		\int_{\Xs} p(x)^\alpha q(x)^{1-\alpha} \de x \ge \int_{\Xs} p(x)^\alpha q^*(x)^{1-\alpha}\de x  \int_{\Xs} q^*(x)^\alpha q(x)^{1-\alpha} \de x.
	\end{align*}
	By applying the $\log$ function to both sides and multiplying by $\frac{1}{\alpha-1} > 0$ we get the result. Symmetrically, for $\alpha<1$, we require the converse $\frac{\partial}{\partial \lambda} \int_{\Xs} p(x)^\alpha q_{\lambda}(x)^{1-\alpha} \de x \Big\rvert_{\lambda = 0} \le 0$. Recalling that $\frac{1}{\alpha-1} < 0$, we obtain the desired result.
\end{proof}


\section{Closed Form of the Integral for Gaussians}\label{apx:closedGauss}
In this appendix, we derive a closed form for the integral involved in the computation of the bound of Theorem~\ref{thr:thrConcentration} in the case that all involved distributions are Gaussians and for $\alpha=2$. Let us introduce the notation:
\begin{align}
\mu = \mathcal{N}(\mathbr{\mu_\mu},\mathbr{\Sigma_\mu}), \qquad \phi = \mathcal{N}(\mathbr{\mu_\phi},\mathbr{\Sigma_\phi}) ,\qquad \nu = \mathcal{N}(\mathbr{\mu_\nu},\mathbr{\Sigma_\nu}).
\end{align}
We have to compute the following integral:
\begin{align*}
    \int_{\mathcal{X}} \frac{\mu^4(\mathbr{x})}{\phi(\mathbr{x})\nu(\mathbr{x})^2} \de \mathbr{x}.
\end{align*}
Let us start elaborating on the integrand function, denoting for properly sized vector $\mathbr{x}$ and matrix $\mathbr{S}$, $\|\mathbr{x}\|^2_{\mathbr{S}} = \mathbr{x}^T \mathbr{S}\mathbr{x}$ and $|\mathbr{S}|$ the determinant of $\mathbr{S}$:
\begin{align*}
    \frac{\mu^4(\mathbr{x})}{\phi(\mathbr{x})\nu(\mathbr{x})^2} & = \frac{(2\pi)^{-2k} |\mathbr{\Sigma_\mu}|^{-2} \exp \left(-2 \|\mathbr{x}-\mathbr{\mu_\mu}\|^2_{\mathbr{\Sigma_\mu}^{-1}} \right)}{(2\pi)^{-k/2} |\mathbr{\Sigma_\phi}|^{-1/2} \exp \left(-1/2 \|\mathbr{x}-\mathbr{\mu_\phi}\|^2_{\mathbr{\Sigma_\phi}^{-1}} \right)(2\pi)^{-k} |\mathbr{\Sigma_\nu}|^{-1} \exp \left(- \|\mathbr{x}-\mathbr{\mu_\nu}\|^2_{\mathbr{\Sigma_\nu}^{-1}} \right)} \\
    & =  \frac{(2\pi)^{-k/2} |\mathbr{\Sigma_\mu}|^{-2}}{|\mathbr{\Sigma_\phi}|^{-1/2}|\mathbr{\Sigma_\nu}|^{-1} }  \exp \left(-2 \|\mathbr{x}-\mathbr{\mu_\mu}\|^2_{\mathbr{\Sigma_\mu}^{-1}} +1/2 \|\mathbr{x}-\mathbr{\mu_\phi}\|^2_{\mathbr{\Sigma_\phi}^{-1}} + \|\mathbr{x}-\mathbr{\mu_\nu}\|^2_{\mathbr{\Sigma_\nu}^{-1}} \right).
\end{align*}
Now, we have to deal with the argument of the exponential:
\begin{align*}
    -2 & \|\mathbr{x}-\mathbr{\mu_\mu}\|^2_{\mathbr{\Sigma_\mu}^{-1}}  +1/2 \|\mathbr{x}-\mathbr{\mu_\phi}\|^2_{\mathbr{\Sigma_\phi}^{-1}} + \|\mathbr{x}-\mathbr{\mu_\nu}\|^2_{\mathbr{\Sigma_\nu}^{-1}} \\
    & = - \frac{1}{2} \mathbr{x}^T \underbrace{\left( 4 \mathbr{\Sigma_\mu}^{-1} - \mathbr{\Sigma_\phi}^{-1}- 2 \mathbr{\Sigma_\nu}^{-1} \right)}_{\mathbr{M}} \mathbr{x} + \underbrace{\left( 4 \mathbr{\Sigma_\mu}^{-1}  \mathbr{\mu_\mu} - \mathbr{\Sigma_\phi}^{-1} \mathbr{\mu_\phi} - 2 \mathbr{\Sigma_\nu}^{-1} \mathbr{\mu_\nu}\right)^T}_{\mathbr{b}^T} \mathbr{x} \\
    & \quad -\frac{1}{2} \underbrace{\left(4\mathbr{\mu_\mu}^T \mathbr{\Sigma_\mu}^{-1}  \mathbr{\mu_\mu} -  \mathbr{\mu_\phi}^T\mathbr{\Sigma_\phi}^{-1} \mathbr{\mu_\phi}  -2   \mathbr{\mu_\nu}^T \mathbr{\Sigma_\nu}^{-1} \mathbr{ \mu_\nu }\right)}_{\mathbr{c}}.
\end{align*}
We now proceed completing the square:
\begin{align*}
    \mathbr{x}^T \mathbr{M} \mathbr{x} - 2 \mathbr{b}^T \mathbr{x} = ( \mathbr{x} - \mathbr{M}^{-1}\mathbr{b})^T\mathbr{M}( \mathbr{x} - \mathbr{M}^{-1}\mathbr{b}) -\mathbr{b}^T  \mathbr{M}^{-1} \mathbr{b}.
\end{align*}
Thus, we have:
\begin{align*}
    -\frac{1}{2} \left(\mathbr{x}^T \mathbr{M} \mathbr{x} - 2 \mathbr{b}^T\mathbr{x} + \mathbr{c} \right) =   -\frac{1}{2}(\mathbr{x} - \mathbr{M}^{-1}\mathbr{b})^T \mathbr{M}(\mathbr{x} - \mathbr{M}^{-1}\mathbr{b}) +\frac{1}{2} \mathbr{b}^T \mathbr{M}^{-1} \mathbr{b} - \frac{1}{2} \mathbr{c}.
\end{align*}
Moreover, we observe that, provided that $\mathbr{M}$ is positive definite, the following expression is the density of a $k$-variate normal distribution with mean $\mathbr{M}^{-1}\mathbr{b}$ and covariance matrix $\mathbr{M}^{-1}$:
\begin{align*}
   (2\pi)^{-k/2} |\mathbr{M}^{-1}|^{-1/2} \exp\left( -\frac{1}{2}(\mathbr{x} - \mathbr{M}^{-1}\mathbr{b})^T \mathbr{M}(\mathbr{x} - \mathbr{M}^{-1}\mathbr{b}) \right).
\end{align*}
Thus, its integral is 1. Therefore, coming to the initial expression:
\begin{align*}
     \int_{\mathcal{X}} \frac{\mu^4(\mathbr{x})}{\phi(\mathbr{x})\nu(\mathbr{x})^2} \de \mathbr{x} & = \frac{(2\pi)^{-k/2} |\mathbr{\Sigma_\mu}|^{-2}}{|\mathbr{\Sigma_\phi}|^{-1/2}|\mathbr{\Sigma_\nu}|^{-1} } \left(  (2\pi)^{-k/2} |\mathbr{M}^{-1}|^{-1/2} \right)^{-1} \exp\left(\frac{1}{2} \mathbr{b}^T \mathbr{M}^{-1} \mathbr{b} - \frac{1}{2} \mathbr{c} \right) \\
     & = \frac{|\mathbr{\Sigma_\phi}|^{1/2}|\mathbr{\Sigma_\nu}| }{  |\mathbr{\Sigma_\mu}|^{2} |\mathbr{M}|^{1/2} } \exp\left(\frac{1}{2} \left( \mathbr{b}^T \mathbr{M}^{-1} \mathbr{b}- \mathbr{c} \right)\right).
\end{align*}  

\section{Gradient of the Objective Function of Theorem~\ref{thr:thrConcentration}}
In this appendix, we report the expression of the gradient of the right hand side of Theorem~\ref{thr:thrConcentration}:
\begin{align*}
(1-\alpha) \frac{1}{nj} & \sum_{k \in [j]} \sum_{l \in [n]} \frac{q_{\bm{\xi}_i}(x_{k,l})^\alpha}{\Phi_{i,j}(x_{k,l}) q_{\bm{\xi}}(x_{k,l})^{\alpha-1}}\left( \nabla_{\bm{\xi}} \log q_{\bm{\xi}}(x_{k,l}) \right) f(x_{k,l})^\alpha \\
&  - 2(\alpha-1) \overline{m}^\alpha \sqrt{\frac{\log(1/\delta)}{2nj \int_{\mathcal{X}} 
 \frac{q_{\bm{\xi}_i}(x)^{2\alpha}}{\Phi_{i,j}(x) q_{\bm{\xi}}(x)^{2(\alpha-1)}} \de x }} \int_{\mathcal{X}} \frac{q_{\bm{\xi}_i}(x)^{2\alpha}}{\Phi_{i,j}(x) q_{\bm{\xi}}(x)^{2(\alpha-1)}} \left( \nabla_{\bm{\xi}} \log q_{\bm{\xi}}(x) \right) \de x.
\end{align*}

The integral present in the second addendum can be either evaluated from samples (i.e., replacing the expectation with the sample mean) or computed exactly for common classes of distributions, e.g., Gaussian distributions, as we show in Appendix~\ref{apx:closedGauss}.

\section{Experimental Details}\label{apx:expDet}
In this appendix, we report the experimental details and additional experimental results.  


\paragraph{Infrastructure}
The experiments have been run on two machines:
\begin{itemize}
\item 2 x Intel(R) Xeon(R) CPU E7-8880 v4 @ 2.20GHz (22 cores, 44 threads, 55 MB cache) and 128 GB RAM;
\item 4 x Intel(R) Xeon(R) CPU E5-4610 v2 @ 2.30GHz (8 cores, 16 threads, 16 MB cache) and 256 GB RAM.
\end{itemize}

\paragraph{Environments}
The environments are the rllab implementations~\citep{duan2016benchmarking}, MIT license, \url{https://github.com/rll/rllab}. The Swimmer environment belongs to the MuJoCo suite~\citep{mujocoCit}, MuJoCo Personal License, \url{http://www.mujoco.org/}.

\paragraph{Algorithms}
The TRPO implementation is taken from baselines, MIT license, \url{https://github.com/openai/baselines}. For POIS we use the original implementation~\citep{metelli2018policy}, MIT license, \url{https://github.com/T3p/baselines}. 

\paragraph{Hyperparameters}
In order to properly compare the algorithms, a set of 20 seeds has been chosen. A subset of 5 seeds, underlined, was used to test the performances
during the tuning phase. Once the optimal hyperparameters were found, the experiments were extended to the other 15 seeds. In the following, we report the hyperparameter values for \algname. 

The \emph{shift return} refers to the need for making the return non-negative in order to perform the optimization of the $\alpha$-moment in \algname. This procedure is carried out independently at each algorithm iteration by subtracting the minimum return among the ones observed. The \emph{variance init} hyperparameter refers to the logarithm of the standard deviation. All experiments have been carried out with Gaussian policies whose mean is linear in the state variables and whose variance is constant over the state space.

\underline{Cartpole}
\begin{itemize}
	\item seeds: \underline{0}, 3, 11, 16, \underline{19}, \underline{42}, \underline{66}, \underline{72}, 84, 87, 90, 123, 222, 343, 404, 452, 542, 875, 943, 999
	\item max iters: 500
	\item policy: linear
	\item policy init: zeros
	\item capacity: 1
	\item inner: 1
	\item variance init: -1
	\item step size: 1 / gradient norm
	\item penalization: True
	\item delta: 0.75
	\item max offline iters: 10
\end{itemize}

\underline{Mountain Car}
\begin{itemize}
	\item seeds: \underline{0}, 3, 11, 16, \underline{19}, \underline{42}, \underline{66}, \underline{72}, 84, 87, 90, 123, 222, 343, 404, 452, 542, 875, 943, 999
	\item max iters: 500
	\item policy: linear
	\item policy init: zeros
	\item capacity: 1
	\item inner: 1
	\item variance init: -1
	\item step size: 2 / gradient norm
	\item penalization: True
	\item delta: 0.9
	\item max offline iters: 10
	\item shift return: True
\end{itemize}

\underline{Inverted Double Pendulum}
\begin{itemize}
	\item seeds: \underline{0}, 3, 11, 16, \underline{19}, \underline{42}, \underline{66}, \underline{72}, 84, 87, 90, 123, 222, 343, 404, 452, 542, 875, 943, 999
	\item max iters: 500
	\item policy: linear
	\item policy init: zeros
	\item capacity: 1
	\item inner: 1
	\item variance init: -1
	\item step size: 2 / gradient norm
	\item penalization: True
	\item delta: 0.99
	\item max offline iters: 10
\end{itemize}

\underline{Swimmer}
\begin{itemize}
	\item seeds: \underline{0}, 3, 11, 16, \underline{19}, \underline{42}, \underline{66}, \underline{72}, 84, 87, 90, 123, 222, 343, 404, 452, 542, 875, 943, 999
	\item max iters: 500
	\item policy: linear
	\item policy init: zeros
	\item capacity: 1
	\item inner: 1
	\item variance init: -0.6
	\item step size: 1 / gradient norm
	\item penalization: True
	\item delta: 0.99
	\item max offline iters: 10
	\item shift return: True
\end{itemize}

For POIS (both AB and PB) and TRPO, the same hyperparameter values have been used, except for the algorithm-specific ones, which have been tuned with the same protocol discussed above ($\delta_{\text{KL}} \in \{0.001, 0.01, 0.1, 1\}$). In particular, for POIS, we employ the line search procedure presented in the original paper for setting the step size. The following tables summarize the algorithm-specific hyperparameter values for the different algorithms and environments.

\begin{center}
\begin{tabular}{lccc}
\toprule
	Environment / Algorithm & \algname (delta) & AB-POIS (delta) & TRPO (max kl)\\
	\midrule
	Cartpole & 0.75 & 0.4 &0.01  \\
	Mountain Car & 0.9 & 0.9 &  0.01\\
	Inverted Double Pendulum & 0.99 & 0.1 & 0.001 \\
	Swimmer & 0.99 & 0.8  & 0.01 \\
	\bottomrule
\end{tabular}

\vspace{.5cm}

\begin{tabular}{lcc}
\toprule
	Environment / Algorithm & PB-POIS (delta) & PB-\algname (delta)\\
	\midrule
	Cartpole & 0.4 & 0.6\\
	Mountain Car & 1 & 0.00001\\
	Inverted Double Pendulum & 0.1 & 0.999999\\
	Swimmer & 0.4 & 0.4\\
	\bottomrule
\end{tabular}
\end{center}




\subsection{Noise Robustness}
As we have already observed, using the trajectory return $\mathcal{R}(\tau)$ as function $f$ no longer allows us to provide performance improvement guarantees. Nevertheless, we conjecture that the loss of this property is compensated by the variance reduction implicit in our approach. In the direction of empirically showing this aspect, we tested the parameter-based version of \algname in the Inverted Double Pendulum environment, with forced stochasticity in the environment. Specifically, whenever an action is prescribed by the policy, the actual action to be executed is obtained by adding white Gaussian noise with standard deviation $\sigma$. The results are shown in Figure~\ref{fig:apxNoise}. We observe that our algorithm is overall competitive with PB-POIS and, in the case of $\sigma=1$, significantly outperforms PB-POIS.

\begin{figure}
\includegraphics[width=\textwidth]{img/noise}
\caption{Learning curves comparing PB-POIS and PB-\algname with increasing magnitude of the noise (20 runs, 95\% c.i.).}\label{fig:apxNoise}
\end{figure}


\subsection{About Return Translation}
Our approach can be employed for non-negative functions $f$. Since in the PO experimental evaluation we employ $f = \mathcal{R}(\tau)$, we need the trajectory return to be non-negative. Under the assumption that the immediate reward is bounded, \ie $R(s,a) \in [R_{\min},R_{\max}]$ for all $(s,a) \in \mathcal{S} \times \mathcal{A}$, we can make the return function non-negative with a simple translation that preserves the optimality of policies:
\begin{align*}
	\overline{R}(\tau) = \mathcal{R}(\tau) -  \underbrace{R_{\min} \frac{1-\gamma^H}{1-\gamma}}_{- c_{\min}}, 
\end{align*}
where $R_{\min} \frac{1-\gamma^H}{1-\gamma}$ is the minimum achievable return. Of course, we can also perform the translation by using any constant $c \ge  c_{\min} = -R_{\min} \frac{1-\gamma^H}{1-\gamma}$ and still obtain a translated return that remains non-negative. It is worth noting, from Theorem~\ref{thr:trustRegion}, that the size of the trust region is larger as the constant approaches its minimum possible value.

For instance, we consider $\alpha=2$, $f \ge 0$, and we apply a further translation with $c \ge 0$. From Theorem~\ref{thr:trustRegion}, we have:
\begin{align*}
	D_2(I_{+c \circ f}[P] \|P) = \log \frac{\E_{x \sim P}[(f(x)+c)^2]}{\E_{x \sim P}[f(x)+c]^2} = \log \frac{\E_{x \sim P}[f(x)^2] + c^2 + 2c \E_{x \sim P}[f(x)]}{\E_{x \sim P}[f(x)]^2 +c^2 + 2c \E_{x \sim P}[f(x)]}.
\end{align*}
Since $\E_{x \sim P}[f(x)^2] \ge \E_{x \sim P}[f(x)]^2$, we have that this expression is maximized with the smallest value of $c$, \ie $c=0$.


\end{document}
