\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{mymacros}
\usepackage[inline]{enumitem}
\usepackage{wrapfig}
\usepackage[]{algorithm2e}
\usepackage{booktabs}
\usepackage{url}
\usepackage{caption}
\usepackage{hyperref}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\allowdisplaybreaks[4]


\title{On the Relation between Policy Improvement \\ and Off-Policy Minimum-Variance Policy Evaluation}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:albertomaria.metelli@polimi.it}{Alberto Maria Metelli}{}}
\author[1]{\href{mailto:samuele.meta@mail.polimi.it}{Samuele Meta}{}}
\author[1]{\href{mailto:marcello.restelli@polimi.it}{Marcello Restelli}{}}
% Add affiliations after the authors
\affil[1]{%
    Dipartimento di Elettronica, Informazione e Bioingegneria\\
    Politecnico di Milano\\
    Milan, Italy
}


\newcommand{\algname}{MBPExPI\@\xspace}
\newcommand{\algnameext}{Minimum-Variance Policy Evaluation for Policy Improvement\@\xspace}
\newcommand{\alphaInterval}{[0,\infty]}%(0,1) \cup (1,\infty)}

\newcommand{\qqa}{\textbf{\textcolor{vibrantBlue}{(Q1)}}\@\xspace}
\newcommand{\qqb}{\textbf{\textcolor{vibrantTeal}{(Q2)}}\@\xspace}
\newcommand{\qqc}{\textbf{\textcolor{vibrantRed}{(Q3)}}\@\xspace}

\newcommand{\parref}[1]{(\ref{#1})}


\definecolor{revisionColor}{rgb}{0.6, 0.4, 0.8}

\newcommand{\vvv}{\textcolor{black}{\bm{\xi}}}
\newcommand{\Cvvv}{\textcolor{black}{\bm{\Xi}}}

  
  \input{math_commands}

  \begin{document}
  

\maketitle

\begin{abstract}
	Off-policy methods are the basis of a large number of effective Policy Optimization (PO) algorithms. In this setting, Importance Sampling (IS) is typically employed for off-policy evaluation, with the goal of estimating the performance of a target policy, given samples collected with a different behavioral policy. However, in Monte Carlo simulation, IS represents a variance minimization approach. In this field, a suitable behavioral distribution is employed for sampling, allowing diminishing the variance of the estimator below the one achievable when sampling from the target distribution. In this paper, we analyze IS in these two guises in the context of PO. We provide a novel view of off-policy PO, showing a connection between the policy improvement and variance minimization objectives. Then, we illustrate how minimizing the off-policy variance can, in some circumstances, lead to a policy improvement, with the advantage, compared with direct off-policy learning, of implicitly enforcing a trust region. Finally, we present numerical simulations on continuous RL benchmarks, with a particular focus on the robustness to small batch sizes. 
\end{abstract}

\section{Introduction}
Policy Optimization methods~\citep[PO,][]{deisenroth2013survey} have been widely exploited in Reinforcement Learning~\citep[RL,][]{sutton2018reinforcement} with successful results in addressing, to name a few, continuous-control~\citep[\eg][]{peters2008reinforcement, lillicrap2015continuous}, robot manipulation~\citep[\eg][]{gu2017deep, chatzilygeroudis2020survey}, and locomotion~\citep[\eg][]{kohl2004policy, duan2016benchmarking}. Most of these algorithms employ the notion of \emph{trust region}~\citep{conn2000trust}, introduced ante litteram in the RL literature by the \emph{safe} RL approaches~\citep{kakade2002approximately, pirotta2013safe}, giving rise to a surge of effective algorithms, having TRPO~\citep{schulman2015trust} as the progenitor. The core of any RL algorithm, being value-based or policy-based, lies in the ability to employ the samples collected with the current (or \emph{behavioral}) policy to evaluate the performance of a candidate (or \emph{target}) policy~\citep{sutton2018reinforcement}. The skeleton rationale behind the usage of a trust region is to control the set of candidate policies whose performance can be accurately evaluated. Intuition suggests that if the candidate policy is \quotes{sufficiently close} to the current one, this \emph{off-policy} evaluation problem~\citep{precup2000elegibility} will provide a good estimate for the performance of the candidate policy. Formally, this idea has been studied in the field of Importance Sampling~\citep[IS,][]{owen2013monte} and the phenomenon is particularly apparent looking at the IS estimator variance, which grows exponentially with the \Renyi divergence~\citep{renyi1961measures} between the behavioral and the target policy~\citep{metelli2018policy, metelli2020importance, LiotetVMR22}. 
In this off-policy learning setting, IS is employed as a \emph{what-if} analysis tool~\citep{owen2013monte} and its role is \emph{passive}, as samples have been already collected with the current behavioral policy. In this sense, the trust region is an \emph{a-posteriori} remedy for the limitations of off-policy evaluation, for controlling the uncertainty injected by the IS procedure.

However, IS originated in the Monte Carlo simulation community~\citep{hesterberg1988advances, hammersley2013monte} as an \emph{active} tool for \emph{variance minimization}. While in off-policy learning, the behavioral policy is fixed and we look for the best target policy, whose performance we aim to estimate, here the roles are reversed. Indeed, in off-policy minimum-variance evaluation, the target policy is fixed and we search for the behavioral policy (from which to collect samples) that yields an IS estimate with the minimum possible variance~\citep{hammersley2013monte, kahn1953methods}. It might seem surprising, at first, that sampling from a policy, other than the target one, can lead to an estimator with less variance (even zero in some cases) \wrt the on-policy estimate. In this role, IS has been previously employed in RL, mainly to address rare events~\citep{frank2008jordan, ciosek2017offer} which naturally lead to high-variance estimates, when tackled on-policy. The idea of explicitly using IS as a variance reduction technique, with the goal of finding an optimal behavioral policy, was proposed by~\citet{hanna2017data} for evaluation and subsequently combined with policy gradient learning~\citep{hanna2018towards, hanna2019data}.


\textbf{Contributions}~~The goal of this paper is to investigate the relation between \emph{policy improvement} and \emph{off-policy minimum variance policy evaluation}. Intuitively, given a target policy, when the reward function is positive, one way to reduce the variance of the IS estimator is to assign larger probability to the trajectories that have a large impact on the mean, \ie those with high returns. Thus, in some circumstances, reducing the variance of the IS estimator moves the policy towards a policy improvement direction.
After having introduced the background (Section~\ref{sec:prelim}), we present the problem of finding the minimum-variance behavioral distribution (Section~\ref{sec:minVarDist}). Then, we study the properties of such a distribution in relation with policy improvement in two settings: unconstrained (Section~\ref{sec:optRepOpt}) and constrained (Section~\ref{sec:constrPolicySpace}). First, we assume that there are no restrictions for choosing the behavioral distribution. We show that the minimum-variance behavioral distribution, besides leading to the zero-variance estimator~\citep{kahn1953methods}, is guaranteed to yield a policy improvement, requiring the non-negativity of the reward only. Furthermore, we prove that this approach allows controlling the divergence between two consecutive distributions, thus enforcing an implicit \emph{trust region}. Although this provides a valuable starting point, the minimum-variance distribution might be unrealizable given the environment transition model, \ie there might be no policy inducing it. For this reason, we move to the scenario in which the distributions are constrained in a suitable space. In this setting, the zero-variance estimator might not be achievable. Furthermore, the presence of a constrained space introduces a bias in terms of policy improvement, still preserving the trust region enforcement. 
Finally, we provide numerical simulations on both \emph{action-based} and \emph{parameter-based} paradigms of policy optimization~\citep{metelli2018policy} to test the effects of minimum-variance policy evaluation in comparison with policy optimization. The simulations are conducted on continuous-control benchmarks, in comparison with POIS~\citep{metelli2018policy} and TRPO~\citep{schulman2015trust}, with a particular focus on the robustness to small batch sizes (Section~\ref{sec:sampleBasedOptimization}). The proofs of the results presented in the main paper are reported in Appendix~1.

\section{Preliminaries}\label{sec:prelim}
In this section, we provide the necessary background that will be employed in the paper.

\textbf{Mathematical Notation}~~Let $\Xs$ be a set, and let $\mathfrak{F}_{\Xs}$ be a $\sigma$-algebra over $\Xs$. We denote with $\PM{\Xs}$ the space of probability measures over $(\Xs, \mathfrak{F}_{\Xs})$. Let $P \in \PM{\Xs}$, whenever needed, we assume that $P$ admits a density function $p$. For a subset $\Ys \subseteq \Reals$, we denote with $\mathscr{B}(\Xs, \Ys)$ the space of measurable functions $f: \Xs \rightarrow \Ys$. Let $P,Q \in \PM{\Xs}$ be two probability measures such that $P \ll Q$, \ie $P$ is absolutely continuous \wrt $Q$, for every $\alpha \in \alphaInterval$, we define the \emph{$\alpha$-\Renyi divergence} as~\citep{renyi1961measures}: $D_\alpha(P\|Q) =  \frac{1}{\alpha-1} \log \int_{\Xs} p(x)^\alpha q(x)^{1-\alpha} \de x$. In the limit of $\alpha \rightarrow 1 $, the \Renyi divergence reduces to the KL-divergence $\kl(P\|Q)$, while for $\alpha \rightarrow \infty$, it corresponds to $\log \esssup_{x \sim Q} \left\{p(x)/q(x)\right\}$. Let $\alpha \in (0,+\infty)$, a set of probability measures $\mathcal{Q}$ is \emph{$\alpha$-convex}~\citep{erven2014renyi} if for every $P,Q\in \mathcal{Q}$ and $\lambda \in [0,1]$ it holds that the probability measure $Q_\lambda \coloneqq Z^{-1}_\lambda \left(\lambda P^\alpha + (1-\lambda) Q^\alpha \right)^{1/\alpha} \in \mathcal{Q}$, where $Z_\lambda$ is a normalization constant.

\textbf{Importance Sampling}~~Let $P,Q \in \PM{\Xs}$ with $P \ll Q$ and let $f \in \mathscr{B}(\Xs, \Reals)$. Importance Sampling~\citep[IS,][]{owen2013monte} allows estimating the expectation of $f$ under a \emph{target} distribution $P$, \ie $\E_{x \sim P}[f(x)]$ having samples $\{x_i\}_{i\in [n]}$ collected with a \emph{behavioral} distribution $Q$: $\widehat{\mu}_{P/Q} = \frac{1}{n} \sum_{i \in [n]} \frac{p(x_i)}{q(x_i)} f(x_i)$.
The IS estimator is unbiased~\citep{owen2013monte}, \ie $\E_{x_i \sim Q}[\widehat{\mu}_{P/Q}] = \E_{x \sim P}[f(x)]$, but it might suffer from large variance, due to the heavy-tailed behavior~\citep{metelli2018policy,MetelliRR21}. The properties of $\widehat{\mu}_{P/Q}$ and its transformations have been extensively studied in the literature~\citep[\eg][]{ionides2008truncated, thomas2015high, papini2019optimistic, metelli2020importance, kuzborskij2020confident, MetelliPDR21}.

\textbf{Policy Optimization}~~A Markov Decision Process~\citep[MDP,][]{puterman1994markov} is a 6-tuple $\mathcal{M} = (\Ss, \As, P, R, \gamma, D_0)$, where $\Ss$ is the state space, $\As$ is the action space, $P : \SAs \rightarrow \PM{\Ss}$ is the transition model, $R : \SAs \rightarrow [0,R_{\max}]$ is the reward function, $\gamma \in [0,1]$ is the discount factor, and $D_0 \in \PM{\Ss}$ is the initial state distribution. The agent's behavior is modeled by a \emph{parametric} policy $\pi_{\vtheta} : \Ss \rightarrow \PM{\As}$ belonging to a parametric policy space $\Pi_{\Theta} = \{\pi_{\vtheta} : \vtheta \in \Theta \subseteq \Reals^{d}\}$. The interaction between an agent and the MDP generates a \emph{trajectory} $\tau = (s_0,a_0,s_1,a_1, \dots, s_{H-1},a_{H-1},s_H)$ where $H \in \Nat$ is the trajectory length and $s_0 \sim D_0$, $a_t \sim \pi_{\vtheta}(\cdot|s_t)$, $s_{t+1} \sim P(\cdot|s_t,a_t)$ for all $t \in \{0,\dots,H-1\}$. Given a trajectory $\tau$, the \emph{return} is the discounted sum of the rewards $\mathcal{R}(\tau) = \sum_{t=0}^{H-1} \gamma^t R(s_t,a_t) $. For a policy $\pi_{\vtheta} \in \Pi_{\Theta}$, we denote with $p(\cdot|\vtheta)$ the induced trajectory distribution: $p(\tau|\vtheta) = D_0(s_0) \prod_{t=0}^{H-1} \pi_{\vtheta}(a_t|s_t) P(s_{t+1}|s_t,a_t)$. 
In the \emph{action-based} (AB) setting, an agent aims at finding a parametrization fulfilling: $\vtheta^* \in \argmax_{\vtheta \in \Theta}  \left\{ J(\vtheta)\right\}$, where:
\[J(\vtheta) = \E_{\tau \sim p(\cdot|\vtheta)} \left[\mathcal{R}(\tau) \right]\]
is the \emph{expected return}. $\pi_{\vtheta}$ must be stochastic to ensure exploration. Instead, in the \emph{parameter-based} (PB) setting, we consider a \emph{hyperpolicy} $\nu_{\vrho} \in \mathscr{P}({\Theta})$, belonging to a parametric hyperpolicy space $\mathcal{N}_{\mathcal{P}} = \{\nu_{\vrho} : \vrho \in \mathcal{P} \subseteq \Reals^{l} \}$, from which we sample the parameters $\vtheta$ of the policy. In this case, the policy $\pi_{\vtheta}$ can be deterministic since exploration is managed at the hyperpolicy level and the agent goal becomes to learn a hyperpolicy parametrization maximizing the expected return: $\vrho^* \in \argmax_{\vrho \in \mathcal{P}} \{J(\vrho)\}$, where:
\[J(\vrho) = \E_{\vtheta \sim \nu_{\vrho}}[J(\vtheta)].\]
In the paper, we keep the presentation as general as possible, introducing the results for arbitrary distributions. Then, we will particularize for the parametric PO setting.

\section{Minimum-Variance Behavioral Distribution}\label{sec:minVarDist}
In this section, we revisit the problem of finding a behavioral distribution $Q \in \PM{\Xs}$ that induces an IS estimate $\widehat{\mu}_{P/Q}$ with minimum variance, knowing the (fixed) target distribution $P \in \PM{\Xs}$ and function $f \in \mathscr{B}(\Xs, [0,\infty))$.\footnote{We restrict our attention to non-negative functions. From the RL perspective, this choice is w.l.o.g. since we can always define an equivalent non-negative reward function, by means of a translation of the original one.} Furthermore, we do not enforce any restriction on the possible forms of the behavioral distribution $Q \in \PM{\Xs}$. 
The problem and the corresponding well-known \emph{minimum-variance behavioral distribution} $Q^*$ are stated in the following for all $x \in \Xs$~\citep{kahn1950random}:
{\thinmuskip=1mu
\medmuskip=1mu
\thickmuskip=1mu
\begin{align}\label{eq:problem}
\min_{Q \in \PM{\Xs}} \mathop{\mathbb{V}\mathrm{ar}}_{x \sim Q} \left[  \frac{p(x)}{q(x)} f(x)\right]  \implies  q^*(x) =\frac{ p(x) f(x)}{\mathop{\mathbb{E}}_{x \sim P}[f(x)]}.
\end{align}
}%
We observe that the IS estimator $\widehat{\mu}_{P/Q^*}$ is non-stochastic, equal to the quantity we aim to estimate, \ie $\widehat{\mu}_{P/Q^*}=\E_{x \sim P}[f(x)]$. This suggests that the construction of $Q^*$ is infeasible as it requires knowledge of $\E_{x \sim P}[f(x)]$. Since $Q^*$ generates a non-stochastic estimator, it not only  leads to zero-variance but, clearly, simultaneously minimizes the absolute central moments of any order. A second, and most remarkable property, is that $Q^*$ is a \emph{performance improvement} \wrt $P$, \ie the expectation of $f$ under $Q^*$ is larger than the expectation of $f$ under the target $P$~\citep{owen2013monte}:
{\thinmuskip=1mu
\medmuskip=1mu
\thickmuskip=1mu
\begin{align}\label{eq:Improvement}
	\E_{x \sim Q^*}[f(x)] - \E_{x \sim P}[f(x)] = \frac{\mathbb{V}\mathrm{ar}_{x \sim P}[f(x)]}{\E_{x \sim P}[f(x)]} \ge 0.
\end{align}
}%
It is worth noting that the magnitude of the improvement is directly related to the reduction in variance $\mathbb{V}\mathrm{ar}_{x \sim P}[f(x)]$.
Equation~\parref{eq:Improvement} suggests an appealing connection between the problem of finding the minimum-variance behavioral distribution and the problem of finding a target distribution that maximizes the expectation $\E_{x \sim P}[f(x)]$, \ie policy optimization. 

Before proceeding, let us map this general setting to PO. In the action-based (AB) setting, $x$ is the trajectory $\tau$, $P$ and $Q$ are trajectory distributions $p(\tau|\vtheta)$ induced by policies $\pi_{\vtheta}$. Instead, in the parameter-based (PB) setting, $x$ is the pair $(\vtheta, \tau)$,  $P$ and $Q$ are joint distributions $\nu_{\vrho}(\vtheta)p(\tau|\vtheta)$ induced by hyperpolicies $\nu_{\vrho}$. In both cases, function $f$ is the trajectory return $\mathcal{R}(\tau)$. 

In the following two sections, we will delve into the properties of the minimum-variance distribution under two assumptions: (i) there are no restrictions in the choice of the behavioral distribution $Q \in \PM{\Xs}$ (Section~\ref{sec:optRepOpt}); (ii) the behavioral distribution must be chosen within a subset $Q \in \mathcal{Q} \subseteq \PM{\Xs}$ (Section~\ref{sec:constrPolicySpace}).



\section{Unconstrained Probability Distribution Space}\label{sec:optRepOpt}
In Section~\ref{sec:minVarDist}, we have seen that $Q^*$ is a performance improvement \wrt $P$. We now formalize this construction by defining the operator $\mathcal{I}_{f} : \PM{\Xs} \rightarrow \PM{\Xs}$:
\begin{align}\label{eq:operator}
	\left(\mathcal{I}_{f}[P]\right)(x) = \frac{p(x) f(x)}{\E_{x \sim P}[f(x)]}, \quad \forall x \in \Xs.
\end{align}
Thus, $\mathcal{I}_{ f}$ takes as input a target distribution $P\in \PM{\Xs}$, a function $ f \in \mathscr{B}(\Xs,[0,\infty))$, and outputs the minimum-variance behavioral distribution for the IS estimation of $\E_{x \sim P}[f(x)]$, \ie $Q^* = \mathcal{I}_{ f}[P]$. Intuitively, looking at Equation~\parref{eq:operator}, by iterating the application of $\mathcal{I}_{f}$, we will obtain distributions tending to assign larger probability mass to points $x \in \Xs$ with high values of $f(x)$. The following result, due to~\cite{ghosh2020operator}, generalizes Equation~\parref{eq:Improvement}, showing that not only $\mathcal{I}_{ f}[P]$ is a performance improvement \wrt $P$, but so is $\mathcal{I}_{h \circ f}[P]$, \ie the distribution obtained by applying the operator to the composition between a monotonic increasing function $h$ and $f$.

\begin{restatable}[Proposition 9 of~\cite{ghosh2020operator}]{prop}{propIncreasing}\label{prop:propIncreasing}
Let $P \in \PM{\Xs}$, $f \in \mathscr{B}(\Xs,[0,\infty))$, and $h : [0, \infty) \rightarrow [0, \infty)$ monotonic increasing. Then, $\mathcal{I}_{h \circ f}[P]$ is a performance improvement \wrt $P$:
\begin{align*}
		\mathop{\E}_{x \sim \mathcal{I}_{h \circ f}[P]}[f(x)] & - \mathop{\E}_{x \sim P}[f(x)] \\
		& = \frac{\mathop{\mathbb{C}\mathrm{ov}}_{x \sim P}[h(f(x)), f(x)]}{\mathop{\E}_{x \sim P}[h(f(x))]} \ge 0.
	\end{align*}
\end{restatable}
Note that, since $h$ is a monotonic increasing function, we have that $\mathbb{C}\mathrm{ov}_{x \sim P}[h(f(x)), f(x)] \ge 0$~\citep{cuadras2002covariance}.

\subsection{Convergence Properties}\label{sec:convProp}
We now analyze the effect of repeatedly applying operator $\mathcal{I}_{f}$. More formally, let us consider an initial distribution $P \in \PM{\Xs}$, and suppose we iterate the application of the operator $\mathcal{I}_{ f}$, generating the sequence of distributions $(Q_{k})_{k \in \Nat}$, where $Q_{0} = P$ and for every $k \in \Nat[0]$ we have $Q_{k} = \mathcal{I}_{ f}[Q_{k-1}] = \left(\mathcal{I}_{ f}\right)^k[P]$. The following result shows that, under certain conditions, the operator $\mathcal{I}_{ f} $ admits fixed points and the sequence $(Q_{k})_{k \in \Nat}$ converges to a distribution $Q_{\infty} $ that assigns probability only to the global maxima of $f$, restricted to the support of $P$, \ie $\supp(P)$.
\begin{restatable}[]{thr}{thrConvergence}\label{thr:convergence}
Let $P \in \PM{\Xs}$ and $f \in \mathscr{B}(\Xs,[0,\infty))$. Then, the following statements hold:
\begin{enumerate}[label=(\roman*),leftmargin=*,noitemsep,topsep=0pt]
	\item $P$ is a fixed point of $\mathcal{I}_{ f}$, \ie $\mathcal{I}_{ f}[P] = P$ a.s., if and only if $\mathbb{V}\mathrm{ar}_{x \sim P}[f(x)] = 0$;
	\item let $\mathcal{X}^{*} = \argmax_{x \in \supp(P)} \{f(x)\}$ be the set of maxima of $f$ restricted to the support of $P$. If $\mathcal{X}^{*}$ is non-empty and measurable then, the repeated application of $\mathcal{I}_{ f}$ converges to a distribution $Q_{\infty} = \lim_{k \rightarrow \infty} \left(\mathcal{I}_{ f} \right)^k [P]$ with support $\mathcal{X}^{*}$. In particular $
	 \E_{x \sim Q_\infty} [f(x)]  = \max_{x \in \supp(P)} \{f(x)\}$.
\end{enumerate}
\end{restatable}

As a corollary to point (i), any deterministic $P$ is a fixed point of $\mathcal{I}_{f}$. Furthermore, from point (ii), we deduce that if we select $P$ that assigns non-zero probability to all points in $\Xs$, \ie $\supp(P) = \Xs$, the iterated application of $\mathcal{I}_{ f}$ converges to the distribution $Q_\infty$ such that $ \E_{x \sim Q_{\infty}} [f(x)] = \max_{x \in \Xs}\{ f(x)\}$, \ie we are performing a global optimization of $f$. It is worth noting that the reasoning above can be generalized by performing an application of a strictly-increasing function $h : [0, \infty) \rightarrow [0, \infty)$ leading to the operator $\mathcal{I}_{h \circ f}$ preserving the same properties.

\subsection{Implicit Trust Region}\label{sec:implicitTrustRegion}
We now prove that we are able to naturally control the divergence between two consecutive distributions $Q_k$ and $Q_{k+1}=\mathcal{I}_{ f}[Q_{k}]$ with $k \in \Nat$, with the effect of enforcing an \emph{implicit} trust region. The following result shows how it is possible to obtain a bound on the $\alpha$-\Renyi divergence between two consecutive distributions.
\begin{restatable}[]{thr}{thrTrustRegion}\label{thr:trustRegion}
	Let $P \in \PM{\Xs}$ and $f \in \mathscr{B}(\Xs,[0,\infty))$. Then, for every $\alpha \in [0, \infty]$, it holds that:
	\begin{align*}
	D_{\alpha}\left(\mathcal{I}_{f}[P] \| P\right) = \frac{1}{\alpha-1} \log \frac{ \E_{x \sim P}[f(x)^\alpha]}{\E_{x \sim P}[f(x)]^\alpha}.
\end{align*}
In particular, for $\alpha=1$ it holds that:
\begin{align*}
	D_{\text{KL}}(\mathcal{I}_{ f}[P] \| P) \le \frac{\mathbb{C}\mathrm{ov}_{x \sim P}[f(x), \log f(x) ]}{\E_{x \sim P}[f(x)]}.
\end{align*}
\end{restatable}
For $\alpha = 2$, we obtain $D_{2}(\mathcal{I}_{ f}[P] \| P) = \log \frac{\E_{x \sim P}[f(x)^2]}{\E_{x \sim P}[f(x)]^2} \le \frac{\mathbb{V}\mathrm{ar}_{x \sim P}[f(x)]}{\E_{x \sim P}[f(x)]^2}$. Thus, the divergence is large when the variance of $f(x)$ is. The result is particularly remarkable as we are able to control the \Renyi divergences of \emph{any} order $\alpha \in  [0,\infty]$. This is a relevant achievement since the trust regions commonly used, like KL-divergence~\citep{schulman2015trust}, are unable to control higher-order divergences that can still be infinite.


\begin{figure*}
\centering
\includegraphics[width=\textwidth]{img/ackley}
\caption{The Ackley function (left), the expectation of the distribution $Q_k = (\mathcal{I}_{h \circ f})^k[P]$ (center), and the KL-divergence (right) between two consecutive distributions $Q_{k-1}$ and $Q_k$, with $h=(\cdot)^\beta$.}\label{fig:example}
\end{figure*}

\begin{example}
	We consider (a slight variation of) the one-dimensional Ackley function~\citep{ackley2012connectionist}: $f(x) = -5 + 20 \exp (-0.1414 |x| ) + \exp ( 0.5 ( \cos (2\pi x) +1) ) - e$, shown in Figure~\ref{fig:example} (left) and the class of increasing functions $(h \circ f)(x)= f(x)^\beta$ where $\beta \ge 0$. We consider an initial uniform distribution $P = \mathrm{Uni} \left([-5,5] \right)$. In Figure~\ref{fig:example}, we plot the expectation of distribution $Q_k=(\mathcal{I}_{h \circ f})^k[P]$ (center) and the KL-divergence between two consecutive distributions (right), as a function of the number of applications $k$, for the different $\beta$ values. We observe that convergence to the global optimum ($x^*=0$ and $f(x^*) = 15$) is faster for higher powers that also lead to larger trust regions. We can now appreciate the role of the increasing function $h$ that works as a regularizer with the effect of controlling the size of the trust region.
\end{example}



\section{Constrained Probability Distribution Space}\label{sec:constrPolicySpace}
The approach we have presented in Section~\ref{sec:optRepOpt} can be applied when there are \emph{no} restrictions on the class of distributions that can be played, \ie we can select $Q$ in the whole space $\PM{\Xs}$. 
However, in the action-based PO, we can intervene on the policy $\pi_{\vtheta}$ factors only of the distribution $p(\tau|\vtheta) = D_0(s_0) \prod_{t=0}^{H-1} \pi_{\vtheta}(a_t|s_t) P(s_{t+1}|s_t,a_t)$, leading to a constrained setting. Similarly, in the parameter-based PO, we can act on the hyperpolicy $\nu_{\vrho}$ while keeping the trajectory  distribution $p(\tau|\vtheta)$ fixed.

More in general, when considering a class of distributions $\mathcal{Q} \subseteq \PM{\Xs}$, even if $P \in \mathcal{Q}$, the distribution $\mathcal{I}_{ f}[P]$ might not belong to $\mathcal{Q}$.
Furthermore, while $\mathcal{I}_{ f}[P]$ minimizes \emph{all} absolute central $\alpha$-moments of the IS estimator, as it leads to a non-stochastic estimator (Section~\ref{sec:minVarDist}), there may exist different distributions in $\mathcal{Q}$ minimizing the different absolute central $\alpha$-moments:
\begin{align}\label{eq:probAlpha}
	\min_{Q \in \mathcal{Q}} \E_{x \sim Q}\left[ \left| \frac{p(x)}{q(x)} f(x) - \E_{x \sim P} [f(x)] \right|^\alpha \right].
\end{align}
Apart from $\alpha=2$, where the problem in Equation~\parref{eq:probAlpha} reduces to Equation~\parref{eq:problem}, for a general value of $\alpha \in [0, \infty]$, the optimization is not straightforward (\eg Equation~\parref{eq:probAlpha} is not differentiable for $\alpha \in (0,2)$). The following result shows that performing a \emph{moment projection} through the $\alpha$-\Renyi divergence is a reasonable surrogate for minimizing the absolute central $\alpha$-moments of Equation~\parref{eq:probAlpha}.
\begin{restatable}[]{prop}{propBoundAlpha}\label{prop:boundMoment}
Let $P \in \PM{\Xs}$ and $f \in \mathscr{B}(\Xs,[0,\infty))$. Then, for any $\alpha \in [2, \infty)$, it holds that:

\begin{align*}
& \underbrace{\E_{x \sim Q}\left[ \left| \frac{p(x)}{q(x)} f(x) - \E_{x \sim P} [f(x)] \right|^\alpha \right]}_{\text{absolute central $\alpha$-moment }} \\
& \qquad\qquad \le \underbrace{\E_{x \sim Q}\left[ \left( \frac{p(x)}{q(x)} f(x) \right)^\alpha\right]}_{\text{(non-central) $\alpha$-moment }} \\
& \qquad\qquad = e^{(\alpha-1) D_\alpha\left(\mathcal{I}_{ f}[P] \| Q\right) } \E_{x \sim P}[f(x)]^\alpha.
\end{align*}


\end{restatable}
Thus, having considered the subset of distributions $\mathcal{Q}\subseteq \PM{\Xs}$, whenever $\mathcal{I}_{ f}[P] \not\in \mathcal{Q}$, we replace it with the corresponding moment projection performed through the $\alpha$-\Renyi divergence:
\begin{align}\label{eq:probMomentProjection}
	Q^{\dagger} \in \argmin_{Q \in \mathcal{Q}} \left\{ D_{\alpha}(\mathcal{I}_{ f}[P] \| Q) \right\}.
\end{align}

\subsection{Performance Improvement}
In Proposition~\ref{prop:propIncreasing}, we have seen that $\mathcal{I}_{ f}[P]$ is a performance improvement \wrt $P$, evaluated under function $f$ (and also under the composition between $f$ and \emph{any} strictly-increasing function $h$). Unfortunately, when we move to a constrained set of distributions $\mathcal{Q} \subseteq \PM{\Xs}$, the performance improvement cannot be in general guaranteed for function $f$. However, as we shall see, the performance improvement still holds for a monotonic transformation of $f$, depending on the choice of $\alpha$.
\begin{restatable}[]{thr}{thrImprovementConstrained}\label{thr:thrImprovementConstrained}
Let $P \in \PM{\Xs}$ and $f \in \mathscr{B}(\Xs,[0,\infty))$. Let $\mathcal{Q} \subseteq \PM{\Xs}$, $Q \in \mathcal{Q}$, and $\alpha \in \alphaInterval$, then, it holds that:
\begin{align*}
\mathop{\E}_{x \sim Q}& \left[f(x)^\alpha \right]  -  \mathop{\E}_{x \sim P}[f(x)^\alpha] \ge \frac{\E_{x \sim P}[f(x)]^\alpha}{\alpha-1} \\
&\qquad \times \left( e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{f} [P]\| P)}  -e^{(\alpha-1)D_{\alpha}(\mathcal{I}_{ f}[P] \|Q)} \right)
\end{align*}
In particular, for $\alpha=1$, it holds that~\citep[][Proposition 6]{ghosh2020operator}:
\begin{align*}
	\mathop{\E}_{x \sim Q}& \left[f(x) \right] -  \mathop{\E}_{x \sim P}[f(x)] \ge\mathop{\E}_{x \sim P}[f(x)] \\
	& \qquad \times \left(D_{\text{KL}}(\mathcal{I}_{ f}[P] \| P) - D_{\text{KL}}(\mathcal{I}_{ f}[P] \|Q)  \right).
\end{align*}
\end{restatable}
While the inequality holds in general, the performance improvement is obtained provided that $D_\alpha\left(\mathcal{I}_{ f}[P] \| Q\right)  \le D_\alpha\left(\mathcal{I}_{ f}[P] \| P\right) $, which is always guaranteed when $P \in \mathcal{Q}$ and $Q=Q^\dagger$, being $Q^\dagger$ defined in Equation~\parref{eq:probMomentProjection} as the minimizer of the second divergence term. 
The theorem shows that by minimizing the $\alpha$-moment of the function $ f$, we are able to guarantee a performance improvement on the function $f(\cdot)^\alpha$. In particular, if we select $\alpha=1$, we obtain a guarantee on the performance improvement of function $f$. From the RL perspective, therefore, moving in the direction of minimizing the $\alpha$-moment provides an improvement for the expected $\alpha$-power of the return $\mathbb{E}_{\tau \sim p(\cdot|\vtheta)}[\mathcal{R}(\tau)^\alpha]$.


\subsection{Convergence Properties}
By using Equation~\parref{eq:probMomentProjection} as an iterate $Q_{k+1} \in \argmin_{Q \in \mathcal{Q}} \left\{ D_{\alpha}(\mathcal{I}_{f}[Q_k] \| Q) \right\}$ to generate a sequence of distributions $(Q_k)_{k \in \Nat}$, we are \emph{not} guaranteed to converge to any fixed-point distribution $Q_{\infty}$, differently from the unconstrained setting (Theorem~\ref{thr:convergence}). This is because the minimization might yield multiple solutions. Nevertheless, we are able to provide guarantees on the final divergence value and on the performance of the distributions $Q_k$.
\begin{restatable}[]{thr}{thrConvergenceConstr}\label{thr:convergenceConstr}
Let $P \in \PM{\Xs}$ and $f \in \mathscr{B}(\Xs,[0,\infty))$. Let $\mathcal{Q} \subseteq \PM{\Xs}$ and suppose that $ f$ is bounded from above, then, the iterate $Q_{k+1} \in \argmin_{Q \in \mathcal{Q}} \left\{ D_{\alpha}(\mathcal{I}_{ f}[Q_k] \| Q) \right\}$ (where possible ties are broken arbitrarily) satisfies:
\begin{enumerate}[label=(\roman*),leftmargin=*,noitemsep,topsep=0pt]
		\item the sequence of divergences $D_{\alpha}(\mathcal{I}_{ f}[Q_k]\|Q_k)$ is convergent;
		\item the sequence of expectations $\E_{x \sim Q_k}\left[f(x)^\alpha \right]$ is non-decreasing in $k \in \Nat$ and converges to a stationary point of $\E_{x \sim Q}\left[f(x)^\alpha \right]$ \wrt $Q \in\mathcal{Q}$.
	\end{enumerate}
\end{restatable}

The convergence of the sequences $D_{\alpha}(\mathcal{I}_{f}[Q_k]\|Q_k)$ and $\E_{x \sim Q_k}\left[f(x)^\alpha \right]$ follows from the performance improvement of Theorem~\ref{thr:thrImprovementConstrained}. Importantly, Theorem~\ref{thr:convergenceConstr} shows the convergence to a \emph{stationary point} of $\E_{x \sim Q}\left[f(x)^\alpha \right]$. If $\mathcal{Q}$ is a parametric space  $\mathcal{Q}_{\Cvvv} = \{Q_{\vvv} \in \PM{\Xs} : \vvv \in \Cvvv \subseteq \Reals^d \}$,\footnote{\textcolor{black}{In the action-based PO $\vvv  = \vtheta$ are the policy parameters and $\Cvvv = \Theta$, while in the parameter-based PO $\vvv  = \vrho$ are the hyperpolicy parameters and $\Cvvv = \mathcal{P}$.}} then we are guaranteed to stop when $\E_{x \sim Q_{\vvv}} \left[ \nabla_{\vvv} \log q_{\vvv}(x) f(x)^\alpha \right] = 0$, like for a general policy gradient method maximizing $f(x)^\alpha$~\citep{papini2018stochastic}. Compared to the result for the unconstrained distribution space (Theorem~\ref{thr:convergence}), we lose the convergence to a fixed point. This property can be recovered under the assumption that the iterate in Equation~\parref{eq:probMomentProjection} admits a unique solution for every $P$. In such a case, we will converge to a distribution $Q_{\infty} = \argmin_{Q \in \mathcal{Q}}\left\{ D_{\alpha}(\mathcal{I}_{ f}[Q]\|Q) \right\}$. 

\subsection{Implicit Trust Region}
In Theorem~\ref{thr:trustRegion}, we have proved that the $\alpha$-\Renyi divergence between $\mathcal{I}_{f}[P] $ and $P$ is bounded.  In this section, we study whether similar properties hold when we consider a limited set of distributions $\mathcal{Q} \subseteq \PM{\Xs}$. The following result shows that, under a particular form of convexity~\citep{erven2014renyi} of $\mathcal{Q}$, we are able to control the trust region as well.
\begin{restatable}[]{thr}{thrImplicitTrustConstr}\label{thr:thrImplicitTrustConstr}
Let $\alpha \in [0,1)$ and $f \in \mathscr{B}(\Xs,[0,\infty))$. Let $\mathcal{Q} \subseteq \PM{\Xs}$ be a $(1-\alpha)$-convex set~\citep[][Definition 4]{erven2014renyi}, $P \in \mathcal{Q}$, $Q^{\dagger} \in \argmin_{Q \in \mathcal{Q}} \left\{ D_{\alpha}(\mathcal{I}_{ f}[P] \| Q) \right\}$, then it holds that:
\begin{align*}
	D_{\alpha}\left(Q^{\dagger} \big\| P \right) \le D_{\alpha}\left(\mathcal{I}_{ f}[P] \| P \right) - D_{\alpha}\left(\mathcal{I}_{ f}[P] \big\| Q^{\dagger} \right).
\end{align*}
\end{restatable}
Therefore, we are always guaranteed that the trust region induced by $Q^{\dagger}$ is tighter compared to the one induced by $Q^* = \mathcal{I}_{ f}[P]$ computed in Theorem~\ref{thr:trustRegion}, \ie $D_{\alpha}\left(Q^{\dagger} \big\| P \right) \le D_{\alpha}\left(\mathcal{I}_{ f}[P] \| P \right)$.

A summary of the properties of the unconstrained and constrained settings is reported in Table~\ref{tabb}.


\begin{table*}
\resizebox{\textwidth}{!}{%  
	\begin{tabular}{l|C{5.5cm}C{3.2cm}C{3cm}C{1.8cm}}
	\toprule
	\textbf{Setting} & \textbf{Iterate} & \textbf{Performance improvement} & \textbf{Convergence}  & \textbf{In policy search?}\\
	\midrule
	Unconstrained & $Q_{k+1} = \mathcal{I}_f[Q_k]$ & Yes, on $h\circ f$ ($h$ any monotonic increasing) & Global optimum of $f$  & Not realistic\\
	Constrained & $Q_{k+1} \in \argmin_{Q \in \mathcal{Q}} D_\alpha(\mathcal{I}_f[Q_k] \| Q) $ & Yes, on $f(\cdot)^\alpha$ & Stationary point of $\E_{x \sim Q}[f(x)^\alpha]$ & Realistic\\
	\bottomrule
	\end{tabular}}
	\caption{Summary of the properties of the constrained and unconstrained settings.}\label{tabb} 
\end{table*}

\section{Numerical Simulations}\label{sec:sampleBasedOptimization}
In this section, we numerically validate the theoretical findings presented in the previous sections. To this end, we make use of a sample-based policy learning algorithm \emph{\algnameext} (\algname).
For the sake of generality, we consider a parametric distribution space $\mathcal{Q}_{\Cvvv} = \{Q_{\vvv} \in \PM{\Xs} : \vvv \in \Cvvv \subseteq \Reals^d \}$, a common setting met in PO.

\RestyleAlgo{ruled}
\begin{algorithm}[t]
\SetAlgoNoLine
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}
\Input{$\alpha$ divergence order, $h$ function, $f$ function, $\mathcal{Q}_{\Cvvv}$ distribution space, $\vvv_1 $ initial parameter, $n$ batch size}
\Output{final parameter $\vvv_{I+1} \in \Cvvv$}
\BlankLine
\For(\tcp*[r]{\textcolor{blue}{Optimization}}){$i=1,\dots,I$}{
	$\textcolor{red}{\overline{\vvv}_{i,1}} = \textcolor{blue}{\vvv_i}$
	\BlankLine
	\For(\tcp*[r]{\textcolor{red}{Evaluation}}){$j=1,\dots,J$}{
		Collect $\mathcal{D}_{i,j} = \{ (x_l, f(x_l)) \}_{l \in [n]}$ with $Q_{\textcolor{red}{\overline{\vvv}_{i,j}}}$\label{lalg:collect}\\
		\textcolor{black}{Using  $\left( \mathcal{D}_{i,k} \right)_{k \in [j]}$, perform $M$ steps of gradient descent on the objective of Theorem~\ref{thr:thrConcentration}}\hspace{-3.8cm}\label{lalg:optim}
	}
	$\textcolor{blue}{\vvv_{i+1}} = \textcolor{red}{\overline{\vvv}_{i,J+1}}$
}
\caption{\algname.}\label{alg:alg}
\end{algorithm}

\subsection{Algorithm}
The goal of \algname consists in illustrating what are the effects of employing the minimization of the $\alpha$-moment of the IS estimator to learn proficient policies. As we have seen in the previous section, this approach is guaranteed to yield performance improvement for $\alpha = 1$ only (in the constrained case). However, as we shall see, from an empirical perspective other choices of $\alpha$ would deliver surprisingly remarkable performances too.\footnote{While we limit our presentation to \emph{actor-only} algorithms, our framework can be applied to \emph{actor-critic} methods by setting, for instance, $f = Q_{\bm{w}}$ (i.e., the critic) and $q_{\bm{\xi}} = \pi_{\bm{\theta}}$ (i.e., the actor). Clearly, the convergence properties of such an approach would depend on the critic accuracy.}

The structure of \algname consists of two nested loops. The outer loop (\texttt{\textcolor{blue}{Optimization}}) acts on the target distribution $q_{\textcolor{blue}{\vvv_{i}}}$. At the end of each outer iteration $i \in [I]$, the target distribution $q_{\textcolor{blue}{\vvv_{i+1}}}$ is updated with the last behavioral distribution produced by the inner loop $q_{\overline{\vvv}_{i,J+1}}$. Instead, the inner loop (\texttt{\textcolor{red}{Evaluation}}) takes the target distribution provided by the outer loop $q_{\textcolor{blue}{\vvv_i}}$ and provides a new behavioral distribution. At each inner iteration $j \in [J]$, it collects samples $\mathcal{D}_{i,j}$ with the current behavioral distribution $q_{\textcolor{red}{\overline{\vvv}_{i,j}}}$ and employs them, together with all the samples collected so far $(\mathcal{D}_{i,k})_{k \in [j]}$, to compute the next  behavioral distribution $q_{\textcolor{red}{\overline{\vvv}_{i,j+1}}}$, with the goal of finding the behavioral distribution minimizing the absolute central $\alpha$-moment of the IS estimator (Equation~\ref{eq:probAlpha}). As we shall see, the optimization is performed using samples and by resorting to a penalized objective.

\textbf{Sample-based Optimization}~~The problem of finding the next behavioral distribution parameter $\textcolor{red}{\overline{\vvv}_{i,j+1}}$ using the samples collected so far $\left( \mathcal{D}_{i,k} \right)_{k \in [j]}$ is an off-policy learning problem. Let us define $\Phi_{i,j} = \frac{1}{j} \sum_{k \in [j]} q_{\textcolor{red}{\overline{\vvv}_{i,k}}}$ as the mixture of the $j$ behavioral distributions experienced so far in the inner loop. Instead of directly estimating ${D}_{\alpha}\left( \mathcal{I}_{ f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}\right)$, we refer to the (non-central) $\alpha$-moment, which is connected to the original objective through Proposition~\ref{prop:boundMoment}. Since we have samples coming from different behavioral distributions, we can use a \emph{multiple} IS estimator~\citep{veach1995optimally}:\footnote{Clearly, when $\alpha=1$, the expression does not depend on the behavioral distribution. Thus, for the sake of the algorithm, it makes sense to consider $\alpha>1$ only.} 
\begin{equation}
\begin{aligned}\label{eq:DEstimator}
	\widehat{d}_{\alpha}\left( \mathcal{I}_{ f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right) & =  \frac{1}{nj} \sum_{k\in [j]} \sum_{l \in[n]} \underbrace{\frac{q_{\textcolor{red}{\vvv}}(x_{k,l})}{\Phi_{i,j}(x_{k,l})}}_{\text{(a)}} \\
	& \quad \times\underbrace{\frac{q_{\textcolor{blue}{\vvv_i}}(x_{k,l})^\alpha}{q_{\textcolor{red}{\vvv}}(x_{k,l})^\alpha}  f(x_{k,l})^\alpha}_{\text{(b)}}.
\end{aligned}
\end{equation}
The (a) factor accounts for the fact that we are using samples collected with the mixture $\Phi_{i,j}$ to estimate an expectation under $q_{\textcolor{red}{\vvv}}$, whereas the (b) factor is the actual variable we want to compute the expectation of, \ie the $\alpha$-moment. It is simple to prove that the expectation of $\widehat{d}_{\alpha}$ is indeed the $\alpha$-moment~\citep{papini2019optimistic}. To minimize Equation~\parref{eq:DEstimator}, we employ a variance correction
to mitigate the effect of finite samples~\citep{metelli2018policy}, theoretically grounded in the following result.
\begin{restatable}[]{thr}{thrConcentration}\label{thr:thrConcentration}
Let $\mathcal{Q}_{\Cvvv} \subseteq \PM{\Xs}$ be a set of parametric distributions and let $\textcolor{red}{\vvv}, \textcolor{blue}{\vvv_{i}} \in \Cvvv$. If $\|f \|_{\infty} \le \overline{m}$ and the samples are independent, then, for every $\delta \in (0,1)$, with probability at least $1-\delta$ it holds that:
\begin{align*}
	\E_{x \sim Q_{\textcolor{red}{\vvv}}} & \left[ \left( \frac{q_{\textcolor{blue}{\vvv_{i}}}(x)}{q_{\textcolor{red}{\vvv}}(x)} f(x) \right)^\alpha \right]  \le  \widehat{d}_{\alpha}\left( \mathcal{I}_{ f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right) \\
	& \quad + \overline{m}^\alpha \sqrt{\frac{2\log \frac{1}{\delta}}{nj} \int_{\Xs} \frac{q_{\textcolor{blue}{\vvv_i}}(x)^{2\alpha}}{\Phi_{i,j}(x)q_{\textcolor{red}{\vvv}}(x)^{2(\alpha-1)}} \de x}.
\end{align*}
\end{restatable}
Some remarks are in order. First, the integral within the square root is an upper bound to the variance of the $\alpha$-moment estimator $\widehat{d}_{\alpha}\left( \mathcal{I}_{ f}[Q_{\textcolor{blue}{\vvv_{i}}}] \| Q_{\textcolor{red}{\vvv}}; \Phi_{i,j}\right)$. In particular, when $\textcolor{red}{\vvv} = {\textcolor{blue}{\vvv_{i}}}$, we obtain the exponentiated \Renyi divergence, as illustrated in~\citep{metelli2020importance}. When all involved distributions are Gaussians, it is possible to provide a closed-form tight bound on this quantity (Appendix~2). Second, unlike the results available in the literature about the concentration of the IS estimator, without corrections or transformations, we are able to provide an exponential concentration inequality (dependence on $\delta$ of the form $\log (1/\delta)$), instead of a polynomial concentration (dependence of the form $1/\delta$). This is due to the fact that we are dealing with random variables that are bounded from below by zero, which allows applying stronger unilateral Bernstein's concentration inequalities~\citep{boucheron2009concentration}.
The reader might object that to optimize the proposed objective function, designed to enforce an implicit trust region, we are actually introducing an additional correction term. This is necessary for theoretical purposes, but, as we shall see in Section~\ref{sec:exper}, the need for a penalization or constraint is significantly less relevant than in existing approaches, like TRPO~\citep{schulman2015trust}, or POIS~\citep{metelli2018policy}. The expression of the gradient of the right-hand side of Theorem~\ref{thr:thrConcentration} is reported in Appendix~3.


\begin{figure*}[t]
\centering
\includegraphics[width=\textwidth]{img/experiments}
\caption{\textcolor{black}{Average return as a function of the number of episodes for different environments and algorithms with batch size $n=100$, $\alpha=2$, and $J=1$ (20 runs $\pm$ 95\% bootstrapped c.i.).}}\label{fig:experiment}
\end{figure*}

\begin{figure*}[t]
\begin{minipage}{.55\textwidth}
  \centering
  \includegraphics[width=\textwidth]{img/batch-size}
\captionof{figure}{Average return as a function of the number of episodes in the Cartpole environment for different algorithms, batch-size $n$ and inner iterations $J$ (10 runs $\pm$ 95\% bootstrapped c.i.).}\label{fig:experimentBS}
\end{minipage}%
\hfill
\begin{minipage}{.42\textwidth}
  \centering
  \includegraphics[width=.88\textwidth]{img/power}
  \captionof{figure}{Average return as a function of the number of episodes in the Inverted Double Pendulum for different choices of $h=(\cdot)^\beta$ (5 runs $\pm$ 95\% bootstrapped c.i.).}\label{fig:experimentPW}
\end{minipage}
\end{figure*}



\textbf{Sample Collection}~~In the action-based setting (AB-\algname), we sample $n$ trajectories $\{\tau_{l}\}_{l \in [n]}$ independently with the policy $\pi_{\overline{\vtheta}_{i,j}}$ and we build the dataset $\mathcal{D}_{i,j} = \{(\tau_{l}, \mathcal{R}(\tau_l))\}_{l \in [n]}$. Instead, in the parameter-based setting (PB-\algname), we sample independently $n$ policy parameters $\{\vtheta_l\} _{l \in [n]}$ and for each of them we run policy $\pi_{\vtheta_l}$ once to generate trajectory $\tau_l$. The corresponding dataset is given by $\mathcal{D}_{i,j} = \{((\vtheta_l,\tau_{l}), \mathcal{R}(\tau_l))\}_{l \in [n]}$. For the AB case, the correction in  Theorem~\ref{thr:thrConcentration} is estimated from samples, as done for the \Renyi divergence in~\citep{metelli2018policy}, since it involves integrals between trajectory distributions, while the closed form exists for Gaussian distributions (Appendix~2).


\subsection{Results}\label{sec:exper}
In this section, we provide the simulation results on continuous control tasks. We first compare the learning performance of \algname with POIS~\citep{metelli2018policy} and TRPO~\citep{schulman2015trust} on four benchmarks. Then, we investigate in more depth two relevant aspects of \algname: its robustness to small batch sizes and the effect of applying a monotonic increasing transformation $h$ on function $f$. All experiments are conducted with Gaussian policies, linear in the state, with fixed variance. The experimental details are reported in Appendix~4. The code to reproduce the presented results is provided at: \url{https://github.com/albertometelli/uai2023}.

\textbf{Comparison with POIS and TRPO}~~In Figure~\ref{fig:experiment}, we show the average return as a function of the number of collected episodes, with a batch size $n=100$, using $\alpha=2$, and one inner iteration ($J=1$). In the Cartpole environment, we observe that the performance of AB-\algname is slightly above that of AB-POIS and PB-\algname, while the fastest learning curve is shown by PB-POIS. Instead, TRPO converges to a suboptimal policy that fails to keep the pole in the vertical position. In the Inverted Double Pendulum experiment, the gap between AB-\algname and both AB-POIS and TRPO is more evident. The PB versions outperform the AB ones with \algname slightly faster than POIS. In the Mountain Car domain, while AB-POIS, TRPO, and PB-\algname display a similar convergence speed, AB-\algname and PB-POIS reach the optimal performance faster. Finally, in the Mujoco Swimmer domain~\citep{mujocoCit}, AB-\algname and TRPO clearly outperform AB-POIS, although the fastest learning curves are displayed by the PB versions of POIS and \algname. 


\textbf{Robustness to Small Batch Sizes}~~
Based on the previous results, we further investigate the properties of \algname in terms of variance control. In the Cartpole domain, we test the robustness to the reduction of the batch size. In Figure~\ref{fig:experimentBS}, we show the average return as a function of the number of collected episodes for batch sizes $n \in \{11, 50\}$ and different numbers of inner iterations $J$. Also considering the $n=100$ case (Figure~\ref{fig:experiment}), we notice, as expected, that the variance of each setting increases overall as $n$ decreases. Nevertheless, \algname proves to be robust, always succeeding in reaching the optimal performance. Differently, POIS suffers from the reduced batch size, while TRPO always converges to the same suboptimal policy. The desirable behavior of \algname is indeed an effect of the kind of objective function we employ that explicitly accounts for the variance of the estimator, trying to minimize it, and, as we have shown in the previous sections, it allows enforcing an implicit trust region. Finally, a small number of inner iterations $J$ is beneficial for the stability.

\textbf{Effect of the Function $h$}~~We now investigate the effects of using a transformation function $h = (\cdot)^\beta$. Thus, instead of optimizing the expected return, we will optimize the expected $\beta$-power of the return. In Figure~\ref{fig:experimentPW}, we show the learning curves of the Inverted Double Pendulum for different values of $\beta$. We notice that for $\beta$ close to $1$ ($0.5$, $1$, $2$) the curves are not very dissimilar, while for too extreme powers ($0.1$ and $4$) the learning performance degrades. This example shows an interesting phenomenon, \ie even if we optimize a power of the return, within certain limits, we are still able to converge to a (near-)optimal policy.

\section{Discussion and Conclusions}\label{sec:discussion}
In this paper, we have studied the relation between policy improvement and off-policy minimum-variance policy evaluation. Specifically, we imported the role of IS as a variance reduction active tool, typical of the Monte Carlo simulation, to the off-policy learning setting. We have illustrated that minimizing the absolute central $\alpha$-moment of the IS estimator yields a performance improvement guaranteed on a power of the original objective function, \ie the expected return in RL. Although the performance improvement is ensured for the case of $\alpha=1$ only, we have empirically illustrated that even considering $\alpha > 1$, especially $\alpha=2$ (\ie minimizing the variance), delivers remarkable learning curves. This phenomenon is justified by the fact that minimizing the variance of the IS estimator, as proved theoretically, naturally induces a trust region, mitigating the need for an explicit penalization or constraint. Thus, the bias due to the fact that we are not providing a performance improvement for the expected return (but just for the expected $\alpha$-power of the return) is compensated by the reduced variance and enforced trust region. Furthermore, this method has proved to be remarkably robust to the reduction of the batch size. We believe that this work contributes to shedding light on an appealing facet of off-policy learning with possible new research opportunities. Future works include an extension of the convergence analysis to the sample-based setting and an experimentation with more complex policy architectures. Specifically, an interesting direction is to investigate the application to \emph{actor-critic} architectures.



\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This paper is supported by the FAIR (Future Artificial Intelligence Research) project, funded by the NextGenerationEU program within the PNRR-PE-AI scheme (M4C2, Investment 1.3, Line on Artificial Intelligence).
\end{acknowledgements}

% References


\bibliography{biblio}
\end{document}
