% Given a tuple of experience $(s,a,r,s',a')$, we write $\epsilon^\tau = S_\theta^{s, a}(\tau) - (r + \gamma z')$ for the distributional TD error for a fixed $\tau$ with $r \sim R$ and $z' \sim Z(s',a')$, and we define $\epsilon := \epsilon^{0.5}$. Ideally, after training, we should have $\mathbb{E}\left[ \epsilon \right] = 0$. However, it is likely that during training, we have $\mathbb{E}\left[ \epsilon \right] \neq 0$, as the policy being evaluated has not been fully fitted yet. Thus, acting greedily with respect to the mean may be suboptimal, therefore hurting exploitation. 

% Expectiles offer the opportunity to act optimally even if the TD error has not been fully reduced yet, by acting greedily with respect to a different expectile $\tau^{\mathrm{act}}$, such that $\mathbb{E}[\epsilon^{\tau^{\mathrm{act}}}] = 0$. An interesting property of expectiles is that the expectile indexed by $\tau$ corresponds to the mean of a particular reweighted regression problem, where values above it are rescaled by a factor $\tau$ and values below by a factor $1 - \tau$~\citep{expbible}. Therefore, to obtain the formula for $\tau^{\mathrm{act}}$, we can first write:
% \begin{equation}
%     \mathbb{E}[\epsilon^\tau] =  \tau \mathbb{E}[\, \epsilon \mid \epsilon \geq 0] \mathrm{Pr}(\epsilon \geq 0) + (1 - \tau) \mathbb{E}[\, \epsilon \mid \epsilon < 0] \mathrm{Pr}(\epsilon < 0).
% \end{equation}

% Then the rescaled error $\epsilon^{\tau^{\mathrm{act}}}$ is centered on $0$ if $\mathbb{E}[\epsilon^{\tau^{\mathrm{act}}}] = 0$, i.e., 
% \begin{equation}
%     \tau^{\mathrm{act}} \mathbb{E}[ \,|\epsilon| \mid \epsilon \geq 0] \mathrm{Pr}(\epsilon \geq 0) = (1 - \tau^{\mathrm{act}}) \mathbb{E}[ \,|\epsilon| \mid \epsilon < 0] \mathrm{Pr}(\epsilon < 0) ,
% \end{equation}
% %
% which, assuming $\epsilon \neq 0$, gives a formula for the selection of $\tau^{\mathrm{act}}$:
% \begin{equation}
% \label{tau-scheduler}
%     \tau^{\mathrm{act}} = \frac{1}{1 + \frac{\mathbb{E}[ \, |\epsilon| \mid \epsilon \geq 0] \mathrm{Pr}(\epsilon \geq 0)}{\mathbb{E}[ \,|\epsilon| \mid \epsilon < 0] \mathrm{Pr}(\epsilon < 0)}}.
% \end{equation}
% %
% In practice, given a batch of training errors $\epsilon$, we compute empirically the expectile fraction $\tau^{\mathrm{act}}$ and use it to act in the environment. In our case, we use soft actor-critic (SAC)~\citep{sac} as the backbone for our method, so we train the actor on the task of maximizing the corresponding expectile value of the $Z$-function, that is, given an actor $\pi_\psi$ parameterized by $\psi$ and a parameter $\alpha$ controlling the entropy of the desired policy, we maximize the following loss by gradient ascent:
% %
% \begin{equation}
%     \mathcal{L}^{\psi}(s,a) =  S_\theta(s, a_\psi, \tau^{\mathrm{act}}) - \alpha \log \pi_\psi(a_\psi \mid s), \textup{ with } a_\psi \sim \pi_\psi(\cdot \mid s).
% \end{equation}
% %
% For better robustness, we perform a soft update of $\tau^{\mathrm{act}}$ at each training step by Polyak averaging~\citep{polyak} with the same forgetting factor as for target networks of the $Z$-function. Also, we noticed in our experiments that individual errors could be very large and challenge the stability of our agent, so we clip $\mathbb{E}[ \, |\epsilon| \mid \epsilon \geq 0]$ and $\mathbb{E}[ \, |\epsilon| \mid \epsilon < 0]$ in Eq.~\ref{tau-scheduler}. We perform an ablation study in Appendix~\ref{ablation-scheduler}.


% \section{Ablation study for the action selection strategy}
% \label{ablation-scheduler}

% \begin{figure*}
%     \centering
%     \tabskip=0pt
%     \halign{#\cr
%       \hbox{%
%         \begin{subfigure}[b]{.49\textwidth}
%         \centering
%         \includegraphics[width=.95\textwidth]{figs/walker_ablation.png}
%         %\caption{Approximating a distribution with separate and dual training.}
%         \end{subfigure}%
%       }%
%       \hbox{%
%         \begin{subfigure}[b]{.49\textwidth}
%         \centering
%         \includegraphics[width=.95\textwidth]{figs/ant_ablation.png}
%         %\caption{Approximating a distribution with separate and dual training.}
%         \end{subfigure}%
%       }\cr
%       \hbox{%
%         \begin{subfigure}{.49\textwidth}
%         \centering
%         \includegraphics[width=.95\textwidth]{figs/humanoid_ablation.png}
%         %\caption{Tabular distributional RL with separate  and dual training.}
%         %\label{fig:separate_vs_dual_bellman}
%         \end{subfigure}%
%       }%
%       \hbox{%
%         \begin{subfigure}[b]{.49\textwidth}
%         \centering
%         \includegraphics[width=.95\textwidth]{figs/hopper_ablation.png}
%         %\caption{Approximating a distribution with separate and dual training.}
%         \end{subfigure}%
%       }\cr
%     }
%     \caption{Ablation study of the action selection strategy. Average return of $5$ seeds on the MuJoCo continuous control benchmark.}
%     \label{fig:mujoco-ablation}
% \end{figure*}

% We perform a short ablation study on the effect of the action selection strategy introduced in Section~\ref{sec:scheduler}, to assess whether the adaptive version brings an improvement over a mean selection.
% Our ablation study shows that our adaptive action selection strategy may be slightly more sample efficient than a mean selection on Walker2d-v4 and Humanoid-v4, albeit not significantly on our 5 seeds. However, it improves overall performance and sample efficiency on Hopper-v4.

% This confirms our initial idea that our adaptive action selection strategy can help in improving the sample efficiency, especially in harder environments such as Hopper.

% \vspace{3cm}
\subsection{Action selection strategy for environment interactions}
\label{sec:scheduler}
This section is concerned with the choice of statistic to be considered by the agent when interacting with the environment. Previous studies based on quantiles have considered the CVaR~\citep{codac} or the (potentially weighted) mean of quantiles~\citep{qr-dqn,fqf}, and previous work on expectile-based distributional RL used the expectile $0.5$~\citep{er-dqn}, as it is a natural estimator of the mean. We take a different approach by leveraging properties of the expectiles. Namely, expectile regression offers the possibility of adapting the action selection strategy, even when our goal is to optimize for the mean return. This allows us to propose a method that propagates the temporal difference error faster and, as a result, achieves better sample efficiency.


%\expectilewasserstein*

% \begin{proof}
% We first prove (i). We have:
% \begin{equation}  
%         W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) = \sum_{i=0}^{K-1} \frac{1}{K} \mathbb{E}_{X \sim \mathcal{E}_{\eta_1}} \left[ \left| X - E_{\eta_1} \left( \frac{2i + 1}{2K}\right) \right| \;\middle|\;  E_{\eta_1} \left( \frac{i}{K} \right) \leqslant X \leqslant E_{\eta_1} \left( \frac{i + 1}{K} \right)\right] 
% \end{equation}
% As shown in \citep{german-paper, expbible}, the expectile function is continuous and monotonically increasing on $[0,1]$. Therefore, $\forall \varepsilon > 0, \exists K \in \mathbb{N}$ such that 
% \begin{equation}  
% \begin{split}
% \forall x &\in \left[ E_{\eta_1} \left( \frac{i}{K} \right), E_{\eta_1} \left( \frac{i+1}{K} \right)\right] , \\ 
% &\left| x - E_{\eta_1} \left(\frac{2i+1}{2K} \right) \right| \leqslant \max \left[ \left| E_{\eta_1} \left(\frac{i}{K} \right) - E_{\eta_1} \left(\frac{2i+1}{2K} \right) \right| ; \left| E_{\eta_1} \left(\frac{i+1}{K} \right) - E_{\eta_1} \left(\frac{2i+1}{2K} \right) \right| \right] \leqslant \varepsilon   \; . 
% \end{split}
% \end{equation}
% Then, by bounding the expectation with a rectangle, we get:
% \begin{equation}  
% \begin{split}
% \forall \varepsilon >0, \exists K \in\mathbb{N} \;/\;\; W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) \leqslant \sum_{i=0}^{K-1} \frac{1}{K} \times \frac{\varepsilon}{K} = \frac{\varepsilon}{K} \;,
% \end{split}
% \end{equation}
% which, by definition, yields (i).

% (ii) results from (i) and the triangle inequality:
% \begin{itemize}[nosep, leftmargin=*]
%     \item $W_1(\Pi_{E}\eta_1, \Pi_{E}\eta_2) \leqslant W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) + W_1(\mathcal{E}_{\eta_1}, \mathcal{E}_{\eta_2}) + W_1(\mathcal{E}_{\eta_2}, \Pi_{E}\eta_2) = W_1(\mathcal{E}_{\eta_1}, \mathcal{E}_{\eta_2}) + o \left(\frac{1}{K}\right)$
%     \item $W_1(\Pi_{E}\eta_1, \Pi_{E}\eta_2) \geqslant \left| W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) - W_1(\mathcal{E}_{\eta_1}, \Pi_{E}\eta_2) \right|$

%     \begin{itemize}
%         \item If $W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) \leqslant W_1(\mathcal{E}_{\eta_1}, \Pi_{E}\eta_2)$, we have: 
%         \begin{equation}
%         \begin{split}
%             W_1(\Pi_{E}\eta_1, \Pi_{E}\eta_2) &\geqslant  W_1(\mathcal{E}_{\eta_1}, \Pi_{E}\eta_2) - W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) \\
%             &\geqslant \left| W_1(\mathcal{E}_{\eta_1}, \mathcal{E}_{\eta_2}) - W_1(\mathcal{E}_{\eta_2}, \Pi_{E}\eta_2) \right| - W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1})\\
%             & = W_1(\mathcal{E}_{\eta_1}, \mathcal{E}_{\eta_2}) + o \left(\frac{1}{K}\right)
%         \end{split}
%         \end{equation}
%         \item If $W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) \geqslant W_1(\mathcal{E}_{\eta_1}, \Pi_{E}\eta_2)$, we have  $W_1(\Pi_{E}\eta_1, \Pi_{E}\eta_2) \geqslant  W_1(\Pi_{E}\eta_1, \mathcal{E}_{\eta_1}) - W_1(\mathcal{E}_{\eta_1}, \Pi_{E}\eta_2) $
%     \end{itemize}
% \end{itemize}
% \end{proof}

%\romain{Text about what that means and how it relates to the following lemma.}
%\begin{corrolary}[EDP theorem]
%Consider the class of MDPs $\mathscr{M}$ with a fixed discount factor $\gamma \in [0, 1)$. Then the collection of expectile statistics $e_k(\eta) = E^{-1}_\eta \left(\frac{2k-1}{2K} \right)$ for $k = 1,\dots, K$ is $\varepsilon$-approximately Bellman-closed for $\mathscr{M}$, where $\varepsilon = \mathcal{O} \left( \frac{1}{K^2} \right)$.
%\end{corrolary}

%\begin{proof}
%\end{proof}


% \begin{equation}  
% \begin{split}
%         W_1(\Pi_{\mathcal{M}}\eta_1, \eta_1) &= \sum_{i=0}^{K-1} \frac{1}{K} \mathbb{E} \left[ \left| X - E_\eta \left(\mathrm{floor}^k \left( E^{-1}_\eta \left( F^{-1}_\eta \left(\frac{2i + 1}{2K} \right) \right)  \right)\right) \right| \mid F^{-1}_\eta \left(\frac{i}{K} \right) \leqslant X \leqslant F^{-1}_\eta \left(\frac{i+1}{K} \right) \right]\\
%         & = \frac{1}{K} \sum_{i=0}^{K-1}  \mathbb{E} \left[ \left| E_\eta(E^{-1}_\eta(X)) - E_\eta \left(\mathrm{floor}^k \left( E^{-1}_\eta \left( F^{-1}_\eta \left(\frac{2i + 1}{2K} \right) \right)  \right)\right) \right| \mid E^{-1}_\eta \left(F^{-1}_\eta \left(\frac{i}{K}\right)\right) \leqslant E^{-1}_\eta(X) \leqslant E^{-1}_\eta \left( F^{-1}_\eta\left(\frac{i+1}{K} \right)\right) \right] \\
%         & \leqslant \frac{1}{K} \sum_{i=0}^{K-1} \frac{1}{K}\left( E_\eta\left(E^{-1}_\eta \left( F^{-1}_\eta\left(\frac{i+1}{K} \right)\right)\right) - \min \left\{ E_\eta\left(E^{-1}_\eta \left( F^{-1}_\eta\left(\frac{i}{K}\right)\right)\right) \; ; \; E_\eta \left(\mathrm{floor}^k \left( E^{-1}_\eta \left( F^{-1}_\eta \left(\frac{2i + 1}{2K} \right) \right)  \right)\right) \right\} \right)\\
%         & \leqslant \frac{1}{K} \sum_{i=0}^{K-1} \frac{1}{K}\left( E_\eta\left(E^{-1}_\eta \left( F^{-1}_\eta\left(\frac{i+1}{K} \right)\right)\right) - E_\eta\left(E^{-1}_\eta \left( F^{-1}_\eta\left(\frac{i}{K}\right)\right) - \frac{1}{K}\right)\right)\\
%         & = \frac{1}{K}\left( E_\eta\left(E^{-1}_\eta(1)\right) - E_\eta\left(E^{-1}_\eta(0)\right) \right) + \frac{1}{K}\\
%         & = \frac{I + 1}{K}\\
% \end{split}
% \end{equation}


%  \medskip


%\textbf{Broader impact statement}. Automated decision making, when employed in resource allocation, has the potential to discriminate populations based on various factors~\citep{fan2022welfare} -- this is even more true for minorities, that are often not present enough in the training of such algorithms~\citep{minorityfaces}. While this drawback is present in most applications of RL, we believe that moving towards distributional reinforcement learning helps in accounting for edge cases. This is even more true with expectiles, that are notoriously good at representing threshold effects and outliers~\citep{expectile-blue}.
%Similarly to previous quantile-based methods, we use the $N = 201$ statistics sampled at each training step to generate $N$ samples from the target distribution. 



% In short, where quantiles are a generalisation of the median, expectiles are a generalisation of the mean. Interestingly, they learn different information about the distribution: quantiles are traditionally trained using an asymmetric L1 loss and are therefore robust to outliers, while expectiles rely on an asymmetric L2 loss, which finds more stable solutions and has better optimization properties. In particular, expectile regression produces the best linear unbiased estimator (BLUE) of any point within the range of the distribution, \textit{including all quantiles of the distribution}~\citep{expectile-blue}. Moreover, expectile regression provides a straightforward estimator of the mean expected return, i.e., the expectile $0.5$. Finally, expectiles present valuable properties for deep RL that we detail in Section \ref{sec:method}. 

%The properties listed above make expectile regression a desirable approach for distributional RL. However, in distributional temporal difference learning~\citep{distributional-book}, the distributional Bellman operator requires samples from the target distribution to compute the target during value-function training and, contrary to quantiles, obtaining such samples from expectile values is not obvious. Previous work~\citep{er-dqn} observed that a naive approach using expectile values as pseudo-samples lacks theoretical guarantees and makes the distribution collapse to the mean in practice. \citet{er-dqn} solved this issue by applying an imputation step that returns samples based on expectile values, and takes the form of finding an approximate solution to an optimization problem. This solution, although effective, is extremely slow, preventing practical applications at scale.

%Due to those drawbacks, quantile approaches are more commonly used than expectile ones. Yet, in order to attain satisfactory performance, quantile-based approaches trade the L1 loss for a Huber loss, thus foregoing their approximate Bellman closedness~\citep{distributional-book}. As we show in Section \ref{section:collapse}, this translates to a collapse of the estimated distribution!

%In this paper, we make use of the theoretical connection between quantiles and expectiles to learn both at the same time. By doing so, we prevent the estimated distribution from collapsing, without sacrificing computation time compared to a quantile-based approach.
