%%%%%%%% ICML 2024 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

%\documentclass[review]{uai2024}
% \documentclass[accepted]{myuai}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\documentclass[accepted]{uai2024}


% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs} % for professional tables
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\input{math_commands.tex}
\usepackage{algorithmic}
\usepackage{pgfplots}
\usepackage{hyperref}
\usepackage{rotating}
\usepackage{booktabs}

\usepackage{url}
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{float}
\usepackage{dsfont}
% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{algorithm}
\usepackage{algpseudocode}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\usepackage{pdfpages}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{mathtools}
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{graphicx}
\usepackage{subfig}
\usepackage{tikz}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{graphicx}
%\usepackage{subcaption}
\usepackage{mwe}
\usepackage{array}
\usepackage{xcolor}
\usepackage{pgfplots}

\usepackage{dblfloatfix}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}

\usepackage{pgf}
\usepackage{pgfplots}
\usetikzlibrary{calc}
\usepackage{dblfloatfix}
%% define \inputgraph
\newcommand{\inputgraph}[2]{% #1 = file, #2 = graph name
  \long\def\GRAPH ##1#2 {}%
  \input{#1}
}
\let\ENDGRAPH\endinput
%\renewcommand{\bibsection}{}
% Define bar chart colors
%
\definecolor{bblue}{HTML}{1f77b4}
\definecolor{rred}{HTML}{d62728}
\definecolor{ggreen}{HTML}{2ca02c}
\definecolor{oorange}{HTML}{ff7f0e}

	\definecolor{green}{rgb}{0.16, 0.67, 0.53}
\definecolor{blue}{rgb}{0.19, 0.55, 0.91}
\definecolor{red}{rgb}{0.8, 0.25, 0.33}
\definecolor{orange}{rgb}{0.91, 0.45, 0.32}

\makeatletter
\newcommand\footnoteref[1]{\protected@xdef\@thefnmark{\ref{#1}}\@footnotemark}
\makeatother
%% Self-defined macros
\usepackage{graphicx}
\usepackage{xcolor}

\DeclareRobustCommand{\legendsquare}[1]{%
  \textcolor{#1}{\rule{1ex}{1ex}}%
}
\newcommand{\indep}{\perp \!\!\! \perp}
\newcommand{\confirm}[1]{{\color{purple} Confirm: #1}}
\newcommand{\vR}{\boldsymbol{R}}
\newcommand{\AC}[1]{{\color{blue} Abhilash: #1}}
\newcommand{\ACMod}[1]{{\color{blue} #1}}
\newcommand{\NB}[1]{{\color{blue} #1}}
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand\PlaceText[3]{%
\begin{tikzpicture}[remember picture,overlay]
\LARGE
\node[outer sep=0pt,inner sep=0pt,anchor=south west] 
  at ([xshift=#1,yshift=-#2]current page.north west) {#3};
\end{tikzpicture}%
}
%\DeclareMathOperator*{\argmax}{arg\,max}
%\DeclareMathOperator*{\argmin}{arg\,min}
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert} 
\def\Expect{{\mathbb E}}
\def\Prob{{\mathbb P}}
\def\minimize{\mathop{\rm minimize}}
\def\maximize{\mathop{\rm maximize}}
\def\subto{{\rm subject \mbox{   }\rm to}}
\def\min{\mathop{\rm min}}
\def\max{\mathop{\rm max}}
\def\argmax{\mathop{\rm argmax}}
\def\argmin{\mathop{\rm argmin}}
\def\diag{\mathop{\rm diag}}
\def\sup{\mathop{\rm sup}}
\def\inf{\mathop{\rm inf}}
\newcommand{\ie}{{\it i.e., }}
\newcommand{\eg}{\textit{e.g.}}
\newcommand{\V}[1]{{{\boldsymbol #1}}}
\newcommand{\x}{\V{x}}
\newcommand{\X}{\mathcal{X}}
\newcommand{\e}{\V{e}}
\newcommand{\0}{\V{0}}
\renewcommand{\1}{\V{1}}
\newcommand{\Dist}{\mathcal{D}}
\newcommand{\mymbox}[1]{\mbox{\scriptsize #1}}
\renewcommand{\Re}{\mathbb{R}}
\newcommand{\quoteIt}[1]{``#1''}
\newcommand{\support}[1]{\1_{[#1]}}
\newcommand{\Halmos}{{\qed}}
\newcommand{\eye}{{I}} %I matrix
\newcommand{\network}{{\mathfrak{F}}}

\definecolor{lgreen}{HTML}{78C357}
\definecolor{dgreen}{HTML}{006400}
\definecolor{lblue}{HTML}{7BA9D0}
\definecolor{dblue}{HTML}{00008B}
\definecolor{newred}{HTML}{FF0000}
\definecolor{lred}{HTML}{FFC0CB}
\definecolor{salmon}{HTML}{FFA07A}
\definecolor{grey}{HTML}{808080}

\newcommand{\mxi}{{m}}
\newcommand{\mpsi}{{m}}
\newcommand{{\polx}}{{\boldsymbol{x}}}
\newcommand{\U}{{\mathcal{U}}}
\newcommand{\VaR}{{\mathop{\mbox{VaR}}}}
\newcommand{\Dxipsi}{{\mathcal{D}_{\psi \xi}}}
\newcommand{\D}{{\mathcal{D}}}
\newcommand{\Dxi}{{\mathcal{D}_{\xi}}}
\newcommand{\Dxik}{{\mathcal{D}_{\xi}^k}}
\newcommand{\gThetaE}{{V_E}}
\newcommand{\gTheta}{{V}}
\newcommand{\gThetaD}{{V_D}}
\newcommand{\alphaKM}{{\alpha_{K}}}
\newcommand{\alphaSVDD}{{\alpha_{S}}}
\newcommand{\piTheta}{{\theta}}
\newcommand{\aTheta}{{\theta}}
\newcommand{\esssup}{{\mathop{\mbox{esssup}}}}
\newcommand{\TotalVar}{{\mbox{TotalVar}}}
\newcommand{\GenVar}{{\mbox{GenVar}}}
\newcommand{\cvec}[0]{{perturbation vector}}
\newcommand{\xiname}[0]{{\cvec{}}}
\newcommand{\ECRO}[0]{TbS}
\newcommand{\EECRO}[0]{DTbS}
\newcommand{\CROES}[0]{{ETO-DbS}}
\newcommand{\CROCS}[0]{ETO-CPS}
\newcommand{\CROCCS}[0]{ETO-ACPS}

\renewcommand{\eqref}[1]{(\ref{#1})}


\newcommand{\EDmodified}[1]{{\color{red} #1}}
\newcommand{\EDcomments}[1]{{\EDmodified{Erick commented: #1}}}
%\newcommand{\EDcomments}[1]{{}}
\newcommand{\modified}[1]{{\color{blue} #1}}
\newcommand{\removed}[1]{{}}

\tikzstyle{arrow} = [thick,->,>=stealth]
\tikzstyle{startstop} = [rectangle, rounded corners, 
minimum width=3cm, 
minimum height=1cm,
%text-centered, 
draw=black]
\tikzstyle{end} = [rectangle, rounded corners, 
minimum width=3cm, 
minimum height=1cm,
%text-centered, 
draw=black]
\vspace{0.25cm}

\tikzstyle{arrow} = [thick,->,>=stealth]
\tikzstyle{startstop} = [rectangle, rounded corners, 
minimum width=1cm, 
minimum height=1cm,
%text-centered, 
draw=black]
\tikzstyle{end} = [rectangle, rounded corners, 
minimum width=1cm, 
minimum height=1cm,
%text-centered, 
draw=black]
\vspace{0.25cm}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2024} with \usepackage[nohyperref]{icml2024} above.
\usepackage{hyperref}




% If accepted, instead use the following line for the camera-ready submission:
% \usepackage[accepted]{icml2024}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}


% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}
  

\title{End-to-end Conditional Robust Optimization%\footnote{This article is under review at 40$^{th}$ Conference on Uncertainty in Artificial Intelligence (UAI 2024).}
}
\author[1]{\href{mailto:abhilash.chenreddy@hec.ca}{Abhilash Reddy Chenreddy}}%\footnote{Corresponding author\\  Email addresses: erick.delage@hec.ca (Erick Delage), abhilash.chenreddy@hec.ca (Abhilash Chenreddy)}}
\author[1]{\href{mailto:erick.delage@hec.ca}{Erick Delage}}
\affil[1]{GERAD and Department of Decision Sciences, HEC Montr\'eal, %Montr\'eal, Qu\'ebec, {H3T 2A7}, 
Canada}

\begin{document}
\maketitle

\begin{abstract}
The field of Contextual Optimization (CO) integrates machine learning and optimization to solve decision making problems under uncertainty. Recently, a risk sensitive variant of CO, known as Conditional Robust Optimization (CRO), combines uncertainty quantification with robust optimization in order to promote safety and reliability in high stake applications. Exploiting modern differentiable optimization methods, we propose a novel end-to-end approach to train a CRO model in a way that accounts for both the empirical risk of the prescribed decisions and the quality of conditional coverage of the contextual uncertainty set that supports them. While guarantees of success for the latter objective are impossible to obtain from the point of view of conformal prediction theory, high quality conditional coverage is achieved empirically by ingeniously employing a logistic regression differentiable layer within the calculation of coverage quality in our training loss.  We show that the proposed training algorithms produce decisions that outperform the traditional \quoteIt{estimate then optimize} approaches.
\end{abstract}

\section{Introduction}
In a standard machine learning setting, $\Psi \subseteq \mathbb{R}^m$ represents the input set and $\Xi \subseteq \mathbb{R}^m$ represents the output set, and we aim to learn a model $\network_\theta$  parameterized by $\theta$ that approximates the relationship between the input and output by minimizing a loss function $\mathcal{L}$. In real-world applications, we usually have a dataset of $M$ samples, $\Dxipsi:= \{( \psi_i, \xi_i)\}_{i=1}^M$ which are used to approximate the underlying input-output relationship learned by the model. For a new data sample $\psi \in \Psi$, the model trained on $\Dxipsi$ is used to predict a corresponding target $\xi = \network_{\theta}(\psi) $. Recently, there has been a growing interest in developing data-driven optimization solutions that integrate this learning process with the subsequent optimization process. In this context, one accounts for the fact that the prediction is used within a cost minimization problem $\hat{x}^*(\psi):=\arg\min_{x\in\X}c(x,\network_\theta(\psi))$, where $\X\subseteq \Re^n$ is the set of feasible decisions and $c(x,\xi)$ the cost function. The intent is to adapt the training procedure to produce an adapted decision with low out-of-sample expected cost $\Expect[c(\hat{x}^*(\psi),\xi)]$. 

When there is a mismatch between the training loss $\mathcal{L}$ and the cost function $c(x,\xi)$, a small error in predicting $\xi$ for a given $\psi$ can lead to highly suboptimal $x^*(\psi)$ (see \cite{elmachtoub2022smart}). Task-based (or decision-focused) learning (cf.\ \cite{mandi2023decisionfocused,donti2017task}) addresses this issue by training the model $\network_{\theta}$ directly on the performance of the policy $x^*(\psi)$. By trading off predictive performance in favor of task performance, the task-based approach can give near-optimal decisions.

In high stakes applications, a Decision Maker (DM) usually demonstrates a certain degree of risk aversion by requiring some level of protection against a range of plausible future scenarios. A natural risk averse variant of integrated learning and optimization takes the form of Conditional Robust Optimization (CRO) (see \cite{chenreddy2022data}), which integrates conformal prediction with robust optimization. Specifically, machine learning is first used to estimate an uncertainty set $
\U(\psi)$ for an observed context $\psi$. This set $\U(\psi)$, known to contain the realized $\xi$ with a high probability, 
\removed{
contextually adapted uncertainty set $
\U(\psi)$ known to contain with high probability the realized $\xi$, }
is then inserted into the conditional robust optimization model:
\begin{align}
    \label{eq:CRO}
    x^*(\psi) := \arg\min_{x \in \mathcal{X}} \max_{\xi \in \mathcal{U}(\psi)} c(x, \xi)
\end{align}
To this date, the methods proposed in the conditional robust optimization literature follow an Estimate Then Optimize (ETO) paradigm. Namely, data is first used to estimate the contextual uncertainty sets which are then calibrated to meet the required coverage levels. These sets are then used as input to the CRO problem to get the adapted robust decision $x^*(\psi)$. However, the process of calibrating uncertainty sets does not take into account the downstream optimization task, potentially resulting in misalignment between the loss function used in the initial estimation and the objective of robust optimization.
In this paper, we propose a novel end-to-end learning framework for conditional robust optimization that constructs the contextual uncertainty set by accounting for the downstream task loss. Our contributions can be described as follows:
\begin{itemize}
    \item We propose for the first time an end-to-end training algorithm to produce contextual uncertainty sets, $\mathcal{U}(\psi)$ that lead to reduced risk exposure for the solution of the down-stream CRO problem% end to end
    \item We introduce a novel joint loss function aimed at enhancing the conditional coverage of $\mathcal{U}(\psi)$ while improving the CRO objective
    \item We demonstrate through a set of synthetic environments that our end-to-end approach surpasses ETO approaches at the CRO task while achieving comparable if not superior conditional coverage with its learned contextual set
    \item We show empirically how our end-to-end learning approach outperforms other state-of-the-art methods on a portfolio optimization problem using real world data from the US stock market    
\end{itemize}

\begin{remark}
It is worth noting that when the estimated uncertainty set $\mathcal{U}(\psi)$ reduces to a singleton $\{\network_{\theta}(\psi)\}$, i.e., a point prediction, the CRO problem simplifies to the deterministic contextual optimization problem: $x^*(\psi) := \arg \min_{x \in \mathcal{X}} c(x,\network_{\theta}(\psi))$. For this special case, the training of $\network_{\theta}(\psi)$ using an end-to-end paradigm has been more heavily studied, see for instance \cite{amos_optnet_2017,berthet2020learning,elmachtoub2022smart}. End-to-end CRO therefore constitutes a more general and unexplored framework that can potentially address the need to provide more robust decisions in situations where parameters cannot be perfectly estimated. This is particularly noticeable in a portfolio optimization problem where a point estimate of the return of assets will necessarily motivate investing all available wealth in the one single asset with the highest predicted return. In contrast, it is rather easy to formulate an uncertainty set $\mathcal{U}(\psi)$ such that the CRO problem encourages diversification of the investment.
\end{remark}

\section{Related work}

\textbf{Estimate Then Optimize} popularized by the initial work of \cite{hannah2010nonparametric} is a framework that integrates machine learning and optimization tasks. Several approaches are proposed to learn the conditional distribution from data. \cite{kannan2023residuals, sen2018learning} propose using residuals from the trained regression model to learn conditional distributions. \cite{bertsimas2020predictive} assign weights to the historical observations of the parameters and solve the weighted SAA problem. We refer the readers to the \cite{mivsic2020data} survey for various applications of the ETO framework. Besides the mentioned risk neutral applications, there is a growing interest in integrating machine learning techniques to Robust Optimization to handle risk-averse scenarios. \cite{chenreddy2022data} identify clusters of the uncertain parameters based on the covariate data and calibrate the sets for these clusters. \cite{patel2023conformal} propose using non-convex prediction regions to construct uncertainty sets. \cite{blanquero2023contextual} construct contextual ellipsoidal uncertainty sets by making normality assumptions.
\cite{ohmori2021predictive} use a non-parametric K-nearest neighbors model to identify the minimum volume ellipsoid to be used as an uncertainty set. \cite{sun2024predict} solve a robust contextual LP problem where a prediction model is first learned, and then uncertainty is calibrated to match robust objectives.
It is to be noted that all these CRO approaches follow the ETO paradigm. 

\textbf{End-to-end learning} is a more recent stream of work that integrates the Estimation and Optimization tasks and trains using the downstream loss.
\cite{donti2017task} proposed using an end-to-end approach for learning probabilistic machine learning models using task loss. 
\cite{elmachtoub2022smart} learn contextual point predictor by minimizing the regret associated with implementing prescribed action based on such a point predictor.
\cite{amos2017optnet} use implicit differentiation methods to train an end-to-end model. \cite{butler2023efficient} solve large-scale QPs using the ADMM algorithm that decouples the differentiation procedure for primal and dual variables. \cite{elmachtoub2022smart} and \cite{mandi2020smart} propose using a surrogate loss function to train integrated methods to address loss functions with non-informative gradients. \cite{wang2023learning} propose learning a non-contextual uncertainty set by maximizing the expected performance across a set of randomly drawn parameterized robust constrained problems while ensuring guarantees on the probability of constraint satisfaction with respect to the joint distribution over perturbance and robust problems. \cite{costa2023distributionally} propose a distributionally robust end-to-end system that integrates residual based distribution estimation and robustness tuning to the portfolio construction problem.
We refer the reader to \cite{Kotary_Fioretto_Van_Hentenryck_Wilder_2021}, \cite{inbook}, \cite{mandi2023decisionfocused}, and \cite{sadana2023survey} for broader discussions on both ETO and end-to-end approaches.

\textbf{Uncertainty quantification} methods are employed to estimate the confidence of deep neural networks over their predictions (\cite{kontolati2022survey}). Common uncertainty quantification approaches include using Bayesian methods like stochastic deep neural networks, ensembling over predictions from several models to suggest intervals, and models that directly predict uncertain intervals \cite{gawlikowski2021survey}. Beyond estimating predictive uncertainty, ensuring its statistical reliability is crucial for safe decision-making \cite{guo2017calibration}. Conformal prediction has become popular as a distribution-free calibration method \cite{shafer2008tutorial}. Although conformal prediction ensures marginal coverage, attaining conditional coverage in the most general case is desirable  \cite{vovk2012conditional}. Although the latter is considered infeasible in general, \cite{romano2020malice} offer group conditional guarantees for disjoint groups by independently calibrating each group. 

\section{Estimate then Robust Optimize}
\label{sec:ETO}
The concept of \quoteIt{Estimate Then Optimize} comes from the contextual optimization literature (see \cite{sadana2023survey}). In the context of CRO, the role of the \textbf{Estimation} process is to quantify the uncertainty about $\xi$ given the observed $\psi$. This is given as input to an \textbf{Optimization} problem that prescribes an optimal contextual decision $x^*(\psi)$. 

When the downstream optimization problem is a CRO problem, the estimation step is required to produce a region that adapts to the observed covariates $\psi$ and is expected to contain the response $\xi$ with high confidence. This can be executed in two steps: first, by learning a parametric conditional distributional model denoted as $F_{\theta}(\psi)$, and second, by calibrating an implied confidence region $\U_\theta(\psi)$ to ensure $\Prob_{F_\theta(\psi)}(\xi\in\U_\theta(\psi))=1-\epsilon$.
\removed{This can be done indirectly by first calibrating a conditional distribution model $F_\theta(\psi)$ to the data, followed by an implied confidence region $\U_\theta(\psi)$ that satisfies $\Prob_{F_\theta(\psi)}(\xi\in\U_\theta(\psi))=1-\epsilon$.} For example, when one assumes that $\xi |\psi \sim \mathcal{N}(\hat{\mu}(\psi), \hat{\Sigma}(\psi))$, one can learn $(\hat{\mu}(\psi), \hat{\Sigma}(\psi))$ by maximizing the log-likelihood function (see \cite{barratt2023covariance})
\[-\frac{m}{2} \log(2\pi) + \sum_{j=1}^m \log L(\psi)_{jj} - \frac{1}{2} \| L(\psi)^\top \xi - \hat{\nu}(\psi) \|_2^2,\]
where $L(\psi)$ and $\hat{\nu}(\psi)$ are the parametric mappings that can be used to compose  %that allows recovering $ = \textbf{chol}(\hat{\Sigma}(\psi)^{-1})$, for a given $\psi$, one can predict mean and covariance of $\xi$ as 
$\hat{\mu}(\psi) := ({L(\psi)^{-1}})^{\top}\hat{\nu}(\psi)$ and $\hat{\Sigma}(\psi) = ({L(\psi)^{-1}})^{\top}L(\psi)^{-1}$. Using the $(1-\epsilon)$-quantile of the chi-squared distribution with $m$ degrees of freedom, one can define $\U_\theta(\psi)$ that satisfies $\Prob(\xi\in\U_\theta(\psi))=1-\epsilon$ asymptotically.

Some recent works completely circumvent the need for the intermediary $F_\theta$ by calibrating some $\U_\theta(\psi)$ directly on the dataset. For example, \cite{chenreddy2022data} propose identifying a $K$-class classifier, $a: \Re^m \rightarrow [K]$ to reduce $\mathcal{U}_{\theta}(\psi):= \mathcal{U}_{\theta}(a(\psi)) $ such that $\Prob(\xi\in\U_\theta(k)|a(\psi) = k) \geq 1-\epsilon \,\ \forall\ k$. The literature on conformal prediction also belongs to the family of distribution-free approaches. It separates the calibration of the shape of $\U_\theta(\psi)$ from the calibration of its size, parameterized by a radius $r>0$, on a reserved validation set to provide out-of-sample marginal coverage guarantees of the form $\Prob(\xi\in\U_\theta(\psi))\geq 1-\epsilon,$ where the probability is taken over both the draw of the validation set and of the next sample. According to Lemma~4.2 in \cite{chenreddy2022data}, such a coverage guarantee is sufficient to ensure that the out-of-sample value-at-risk of the robust policy produced by CRO is bounded above by the worst-case value of the in-sample problem.

\section{End-to-End Conditional Robust Optimization} \label{sec:E2E}
While the ETO approach presented in Section~\ref{sec:ETO} presents an efficient way to quantify the uncertainty conditionally, it does not take into account the quality of the decisions $x^*(\psi)$ that are prescribed by the downstream CRO model. In practice, the quality of a robust decision is usually assessed by measuring the risk associated with the cost produced on a new data sample (a.k.a.\ out-of-sample). We assume that this risk is measured by a risk measure that reflects the amount of risk aversion experienced by the DM. For instance, one can use conditional value-at-risk represented by the function, $\rho_\alpha(X):=\inf_{t} t+ (1/(1-\alpha))\Expect[(X-t)^+]$, which computes the expected value in the right tail of the random cost $X$ for a certain risk aversion $\alpha$ and covers both the expected value and the worst-case cost as special cases (i.e., $\alpha=0$ and $\alpha=1$, respectively). 

In the ETO framework, once the optimal decision $x^*(\psi)$ is determined, the DM can assess the associated risk, also known as task loss,  $\rho_\alpha(c(x^*(\psi), \xi))$. This metric allows for comparison across models to select the suitable one. However, it is important to note that the model with the best performance in terms of task loss may differ from the optimal model based on prediction loss. Motivated by recent evidence (see \cite{elmachtoub2022smart}) indicating that performance improvement can be achieved by employing a decision-focused/ task-based learning paradigm, we propose end-to-end conditional robust optimization.

\subsection{The ECRO training problem}

Formally, we let $\Psi\subseteq\Re^m$ be an arbitrary support set for $\psi$ whereas $\Xi\subseteq\mathbb{R}^m$ is assumed for simplicity to be contained within a ball centered at 0 of radius $R_\xi$. We consider $c(x,\xi)$ to be convex in $x$ and concave in $\xi$ and let $\mathcal{X}(\psi):=\{x\in\Re^n|g(x,\psi)\leq 0,\,h(x,\psi)=0\}$ be a convex feasible set for $x$, possibly dependent on $\psi$, and defined through a set of convex inequalities, identified using $g:\Re^n\times\Re^m\rightarrow \Re^J$ and affine equalities, identified using an affine mapping $h:\Re^n\times\Re^m\rightarrow \Re^J$. The conditional optimal policy then becomes:
\begin{align}
    \label{eq:CRO_new}
    x^*(\psi, \U) := \arg\min_{x \in \mathcal{X}(\psi)} \max_{\xi \in \mathcal{U}(\psi)} c(x, \xi),
\end{align}
where we make explicit how the decision depends on both the contextual uncertainty set and the realized covariate. Given a parametric family of contextual uncertainty sets $\U_\theta$ with $\theta\in \Theta$ and a dataset $D_{\psi\xi}:=\{(\psi^i,\xi^i)\}_{i=1}^{M}$, the ECRO training problem consists in identifying 
\begin{align}\label{eq:E2Eloss}
% \min_\theta 
\min_{\theta\in\Theta} \mathcal{L}_{ECRO}(\theta) := \rho_{i\sim M}(c(x^*(\psi^i,\mathcal{U}_\theta),\xi^i)),
\end{align}
where $\rho_{i\sim M}$ refers to the risk when $i$ is drawn uniformly from $1$ to $M$, while, for simplicity, we assume $\rho(\cdot)$ to be a conditional value-at-risk measure, and $\U_\theta(\psi)$ to be ellipsoidal for all $\psi$. Namely, we can assume that 
\begin{align}\label{eq:ConSet}
    &\mathcal{U}_\theta(\psi) =\mathcal{E}(\mu_\theta(\psi),\Sigma_\theta(\psi))\\ \nonumber
    &\;:= 
    \{\ \xi\in\mathbb{R}^{\mxi} : (\xi - \mu_\theta(\psi))^T \Sigma_\theta(\psi)^{-1}(\xi - \mu_\theta(\psi)) \leq 1\}\,,
\end{align}
%where  \(\Sigma(\psi) \in \mathbb{R}^{m \times m}\) is a symmetric matrix, $\mu(\psi) \in \mathbb{R}^{\mxi} $ and \(r \in \mathbb{R}\). 
for some $\mu_\theta: \Re^m\rightarrow \Re^m$ and $\Sigma_\theta:\Re^m\rightarrow \mathcal{S}_+$, where $\mathcal{S}_+$ is the set of positive definite matrices, for all $\theta\in\Theta$. While the robust optimization literature suggests various uncertainty set structures that facilitate the resolution of the RO problem, the ellipsoidal set stands out as a natural one to employ as it retains numerical tractability (see \cite{ben1998robust}) and can easily be described to the DM.

\begin{figure}[!ht]
% \vskip -0.1in
    \centering
    \tikzset{
      basic/.style  = {draw, text width=5em, drop shadow,  rectangle},
      root/.style   = {basic, thin, align=center,
                   fill=gray!45 , text width=5em},
      level 2/.style = {basic, thin,align=center, fill=gray!30,
                   text width=5em},
      level 3/.style = {basic, thin, align=left, fill=gray!20, text
      width=5em, node distance = 40pt} 
    }
    \begin{tikzpicture}[node distance=2cm]
    \begin{scope}[scale=0.85, transform shape]
\node (b1) [end] {Estimation};
\node (b2) [startstop, below of=b1] {Optimization};
\node (b3) [startstop, below of=b2]{Task loss};
% \draw [arrow] (b1) -- node [midway,above] {} node [midway,below] {} (b2);
\draw [arrow] (0,1.5) -- (b1) 
    node [midway,right] {$\Dxipsi$} node [midway,below] {};
\draw [arrow] (b1) -- (b2) node [midway,right] {$\U_{\theta}$};
\draw [arrow] (b2) -- (b3) node [midway,right] {$x^*(\cdot,\U_\theta)$};
\draw [arrow] (b3) -- (0,-5.5) node [midway,right] {$\U_{\theta^*},\,x^*(\cdot,\U_{\theta^*})$};
\draw [thick] (b3) -- (-2, -4);
\draw [thick] (-2,-4) -- (-2, 0) node [midway,left] {$\nabla_\theta \mathcal{L}_{ECRO}(\theta)$}; 
\draw [arrow] (-2,0) -- (b1);
    \end{scope}
\end{tikzpicture}
    \caption{Training pipeline for task-based learning  }
    \label{fig:TSpipeline}
    \end{figure}

The training pipeline for the task-based learning approach is illustrated in Figure~\ref{fig:TSpipeline}. In this pipeline, one starts from an arbitrary $\theta^0$: the optimization problem \eqref{eq:CRO_new} is first solved for each data point, and the resulting optimal actions are then implemented in order to measure the empirical risk under $D_{\psi\xi}$, which we call the empirical ECRO loss of $\theta^0$. A gradient of $\mathcal{L}_{ECRO}(\theta)$ can then be used to update $\theta^0$ in a direction of improvement. Key steps in this pipeline consist of computing $x^*(\psi^i,\U_{\theta})$ efficiently and in a way that enables differentiation with respect to $\theta$.

\subsection{Reducing and solving the robust optimization task}

Given the convex-concave structure of $c(x,\xi)$ and the convexity and compactness of the ellipsoidal set, we can employ Fenchel duality (see \cite{ben2015deriving}) to reformulate the min-max problem as a simpler minimization form over an augmented decision space. Specifically, we first replace the original cost function with the equivalent cost
\[\bar{c}(x,\xi):=\left\{\begin{array}{cl}c(x,\xi) & \mbox{if $\|\xi\|_2\leq R_\xi$}\\ -\infty & \mbox{otherwise}\end{array}\right.,\]
which integrates information about the domain of $\xi$.
One can then employ Theorem~6.2 of \cite{ben2015deriving} to show that problem \eqref{eq:CRO_new} can be reformulated as:
\begin{align}
\min_{x \in \mathcal{X}(\psi),v} f(x,v,\psi):=\delta^*(v|\mathcal{U}_\theta(\psi)) - \bar{c}_*(x, v)\label{eq:CROmin}
\end{align}
where the support function
\begin{align}
\delta^*(v|\mathcal{U}_{\theta}(\psi)) := \sup_{\xi \in \mathcal{U}_\theta(\psi)} \xi^Tv =  \mu_\theta(\psi)^Tv+  \sqrt{v^T \Sigma_\theta(\psi) v},
\end{align}
while the partial concave conjugate function is defined as
\begin{align*}
\bar{c}_*(x, v) := \inf_{\xi}  v^T \xi - \bar{c}(x, \xi) =  \inf_{\xi : \|\xi\|_2\leq R_\xi}  v^T \xi - c(x, \xi).
\end{align*}
This leads to $x^*(\psi,\mathcal{U}(\psi))$ being the minimizer of the convex minimization problem: 
\begin{equation}\label{eq:xopt}
\min_{x \in \mathcal{X}(\psi),v} f(x,v,\psi)  
\end{equation}
with $f(x,v,\psi):=\mu_\theta(\psi)^Tv+ \sqrt{v^T \Sigma_\theta(\psi) v} - \bar{c}_*(x, v)$,
% \begin{equation}\label{eq:xoptF}
% f(x,v,\psi):=\mu^Tv+ \sqrt{v^T \Sigma^{-1} v} - \bar{c}_*(x, v),
% \end{equation}
a jointly convex function of $x$ and $v$ and finite valued over its domain, and with sub-derivatives:
\begin{align*}
&\nabla_{v} f(x,v,\psi)=\mu_\theta(\psi)+(1/\sqrt{v^T \Sigma_\theta(\psi) v})\Sigma_\theta(\psi)v
-\xi^*(x,v) \\
&\nabla_{x} f(x,v,\psi)=\nabla_xc(x,\xi^*(x,v)),
\end{align*}
where $\xi^*(x,v):=\argmin_{\xi : \|\xi\|_2\leq R_\xi}  v^T \xi - c(x, \xi)$. Revisiting the procedure outlined in Figure~\ref{fig:TSpipeline}, one can observe that the training process requires a forward pass to find the optimal solutions and a backward pass to update the parameter vector $\theta$. This requires the computation of the gradients of the solution to the problem \eqref{eq:E2Eloss} with respect to the input parameters that are passed through the reformulated CRO problem.  Furthermore, the minimization procedure in problem \eqref{eq:E2Eloss} entails navigating through the risk measure $\rho$. These aspects will be further explored in the next section.


\subsection{ Gradient for problem \eqref{eq:E2Eloss}}

In training problem \eqref{eq:E2Eloss}, the gradient of $\mathcal{L}_{ECRO}(\theta)$ with respect to $\theta$ can be obtained using the chain rule:
\begin{align*}
\allowdisplaybreaks
\nabla_\theta \mathcal{L}_{ECRO}&(\theta)
=\sum_i \frac{\partial\rho_{i\sim M}(y_i)}{\partial y_i}\big|_{y_i=c(x^*(\psi^i,\mathcal{U}_\theta),\xi^i) } \cdot \\
&\nabla_{x} c(x,\xi^i)\big|_{x=x^*(\psi^i,\mathcal{U}_\theta)} \cdot  \\
&\bigg(\nabla_\mu x^*(\psi^i,\mathcal{E}(\mu,\Sigma_\theta(\psi^i)))\big|_{\mu=\mu_\theta(\psi^i)}\nabla_\theta\mu_{\theta}(\psi^i) \\
&+ \nabla_\Sigma x^*(\psi^i,\mathcal{E}(\mu_\theta(\psi^i),\Sigma))\big|_{\Sigma=\Sigma_\theta(\psi^i)}\nabla_\theta\Sigma_{\theta}(\psi^i) \bigg)\\
\end{align*}

Based on \cite{doi:10.1137/1.9781611976595.ch6}, %https://epubs.siam.org/doi/epdf/10.1137/1.9781611976595.ch6
when $\rho(Y):= \mbox{CVaR}_\alpha(Y)$, one can employ the sub-differential:
\[\nabla_{\V{y}}\rho_{i\sim M}(y_i)=\V{\upsilon}(\V{y})\]
with $\V{\upsilon}(\V{y})\in\argmax_{\V{\upsilon}\in\Re^M_+:\1^T \V{\upsilon}=1, \V{\upsilon}\leq ((1-\alpha)M)^{-1}} \V{\upsilon}^T \V{y}$.

Given that $\nabla_{\x} c(\x)$, $\nabla_\theta\mu_{\theta}(\psi)$, and $\nabla_\theta\Sigma_{\theta}(\psi)$ can be readily obtained using auto-differentiation \cite{seeger2017auto} when $c(\x)$, $\mu_{\theta}(\psi)$, and $\Sigma_{\theta}(\psi)$ are differentiable, we focus the rest of this subsection on the process of identifying $\nabla_{(\mu,
\Sigma)}x^*(\psi,\mathcal{E}(\mu,\Sigma))$. Following the decision-focus learning literature (see \cite{blondel2022efficient}), one can identify such derivatives by exploiting the fact that any optimal primal-dual pair $(x^*, v^*, \lambda^*, \nu^*)$ of problem \eqref{eq:xopt} must satisfy the Karush-Kuhn-Tucker (KKT) conditions, which take the form:
\begin{align*}
&G(x^*, v^*, \lambda^*, \nu^*, \mu, \Sigma,\psi) =0,\qquad g(x^*,\psi) \leq 0, \lambda^* \geq 0 .
\end{align*}
where
\begin{align*}
&G(x^*, v^*, \lambda^*, \nu^*,\mu,\Sigma,\psi) := \\
&\begin{bmatrix} \nabla_x f(x^*,v^*,\psi ) + \nabla_x g(x^*, \psi)^T \lambda^* + \nabla_x h(x^*, \psi)^T \nu^* \\ \lambda^* \circ g(x^*,\psi) \\ h(x^*,\psi) \end{bmatrix}
\end{align*}
and \(\circ\) denotes the Hadamard product of two vectors.

One can therefore apply implicit differentiation to the constraints $G(x^*, v^*, \lambda^*, \nu^*,\mu,\Sigma,\psi)=0$ to identify $\nabla_{(\mu,\Sigma)} x^*(\psi,\mathcal{E}(\mu,\Sigma))$ simultaneously with the derivatives of $v^*$, $\lambda^*$, and $\nu^*$ with respect to the pair $(\mu,\Sigma)$. 
Specifically, one is required to solve the system of equations:
\begin{align*}
&\frac{\partial}{\partial (x, v, \lambda, \nu)} G(x^*,v^*, \lambda^*, \nu^*, \mu,\Sigma,\psi) \cdot \\
&\frac{\partial}{\partial (\mu,\Sigma)} (x^*, v^*, \lambda^*, \nu^*)(\mu,\Sigma) = \\
&- \frac{\partial}{\partial (\mu,\Sigma)} G(x^*, v^*,\lambda^*, \nu^*, \mu,\Sigma,\psi),
\end{align*}
where \(\frac{\partial}{\partial (x, v, \lambda, \nu)} G\) denotes the Jacobian of the mapping \(G\) with respect to \((x, v, \lambda, \nu)\). We refer to \cite{blondel2022efficient} and \cite{duvenaud2020deep} for further details on the computations related to implicit differentiation.

\subsection{Task-based Set (\ECRO{}) Algorithm}
In this section, we delve into the implementation details of the ECRO training pipeline. Regarding the contextual ellipsoidal set $\mathcal{E}(\mu_\theta(\psi),\Sigma_\theta(\psi))$, we follow the ideas proposed in \cite{barratt2023covariance} and employ a neural network $\network_\theta:\Re^m \rightarrow \Re^m\times \Re^{m(m+1)/2}\times\Re$. The first set of outputs is used to define $\mu_\theta(\psi)$, while the second and third sets form a lower triangular matrix $L_\theta(\psi)$ and a scalar $r_\theta(\psi)$ (which is made independent of $\psi$ w.l.o.g.) that are used to produce $\Sigma_\theta(\psi):=r_\theta(\psi)L_\theta(\psi)L_\theta(\psi)^T$. The positive definiteness of $\Sigma_\theta(\psi)$ is ensured by taking an exponential in the last layer of the network for the outputs that appear on the diagonal of $L_\theta(\psi)$.
The architecture of the neural network can be found in appendix  \ref{architecture}. 

The second set of notable details has to do with solving for $x^*(\psi^i, \mathcal{E}(\mu^i_{\theta},\Sigma^i_{\theta},r_{\theta}))\ \forall i$. In our implementation of end-to-end learning for conditional robust optimization, we found that a trust region optimization (TRO) method (see \cite{byrd2000trust}) could efficiently solve the reformulated robust optimization problem \eqref{eq:xopt} and provide primal-dual solution pairs for this problem. Given that each epoch of the training would pass through the same set of data points, we further observed that the training accelerated significantly (see figure \ref{results:modelcomp} in appendix \ref{fig:covcomp}) when the trust region method was interrupted early (after $K=5$ iterations) as long as it was warm-started at the solution found in the previous epoch. Algorithm \ref{alg:ecrotraining} presents our proposed training framework for the ECRO approach.

\begin{algorithm}
  \caption{ECRO Training with Trust Region Solver}\label{alg:ecrotraining}
  \begin{algorithmic}[1]
  \State \textbf{input}: dataset $\Dxipsi$, max epochs $T$, max TRO steps $K$, batch size $N$, protection level $\alpha$
  \State Initialize a warm start  buffer $\{\bar{x}_1,\dots,\bar{x}_M\}$ with each $\bar{x}_i\in\mathcal{X}(\psi_i)$
  \State Initialize network parameters $\theta$ and $t=1$
      \While{not converged and ($t \le T$)}
        \State Sample a batch of $N$ indices $\mathcal{B}\subset \{1,\dots,M\}$
         \For{$i\in \mathcal{B}$}
         \State //Run TRO for up to $K$ steps
        \State $x_i^t, \lambda_i^t, \nu_i^t \gets$ \textsc{TRO}($\bar{x}_{i}$, $\mu_{\theta}(\psi_{i}),\Sigma_\theta(\psi_{i}), K)$
        \State $\bar{x}_i\gets x_i^t$ \Comment{Update warm start}
               
      \EndFor
%        \State \textbf{Store} $x^t(\psi_i)_{i = 1}^{N} $ in $R$
        \State \textbf{compute} $\mathcal{L}_{ECRO}(\theta)$ and $\nabla_\theta\mathcal{L}_{ECRO}(\theta)$ for $i\sim\mathcal{B}$
        \State $\theta \gets \theta - \mbox{step size}\cdot\nabla_\theta\mathcal{L}_{ECRO}(\theta)$ 
        %CVaR loss  $\mathcal{L}_{ECRO}(x^t,\xi^t) $
        \State $t \gets t + 1$
      \EndWhile\label{euclidendwhile}
      
      \State \textbf{return} $\theta$
%    \EndProcedure
  \end{algorithmic}
\end{algorithm}

\section{End-to-End CRO with Conditional Coverage}

Recall that the ETO framework summarized in section \ref{sec:ETO} focused on producing contextual uncertainty sets with appropriate marginal coverage (of $1-\epsilon$) of the realization of $\xi$. The training pipeline in section \ref{sec:E2E} was at the other end of the spectrum, disregarding entirely the objective of coverage to increase task performance. In practice, sacrificing coverage can be a heavy price to pay to obtain performance, as it implies a loss in the explainability of the prescribed robust decision. It is becoming apparent that many DMs suffer from algorithm aversion \citep[see][]{burton2020systematic} and could be reluctant to implement a robust decision produced from an ill-covering uncertainty set. 

We further argue that traditional ETO might already face resistance to adoption given the type of coverage property attributed to the ETO sets, i.e. $\Prob(\xi\in\U(\psi))=1-\epsilon$. Indeed, marginal coverage guarantees only hold in terms of the joint sampling of $\psi$ and $\xi$. This implies that they offer no guarantees regarding the coverage of $\xi$ given the observed $\psi$ for which the decision is made. In fact, a $1-\epsilon$ marginal coverage can trivially be achieved if $\U(\psi)$ returns $\Xi$ when $\psi\in\Psi$, for some arbitrary set $\Psi$, and otherwise returns $\emptyset$, as long as $\Prob(\psi\in\Psi)=1-\epsilon$. This is clearly an issue for applications with critical safety considerations and motivates seeking conditional coverage in addition to the marginal coverage when designing $\mathcal{U}(\psi)$. In this section, we outline a training procedure that integrates a sub-procedure that enhances the conditional coverage performance. 

\subsection{The conditional coverage training problem}

We start by briefly formalizing the difference between the two types of coverage in the definition below.  
\begin{definition}
    Given a confidence level $1-\epsilon$, a contextual uncertainty set mapping $\mathcal{U}(\cdot)$ is said to satisfy \textbf{marginal coverage} if $\Prob(\xi\in\mathcal{U}(\psi))=1-\epsilon$, and to satisfy \textbf{conditional coverage} if $\Prob(\xi\in\mathcal{U}(\psi)|\psi)=1-\epsilon$ almost surely.
\end{definition}
The following lemma identifies a necessary and sufficient condition for a contextual set to satisfy conditional coverage.
\begin{lemma}\label{thm:lossEquiv}
    A contextual uncertainty set $\U_\theta(\psi)$ satisfies conditional coverage, at confidence $1-\epsilon$, if and only if
   \[\mathcal{L}_{\mymbox{CC}}(\theta):=\Expect[\,\left(\Prob(\xi \in \U_\theta(\psi)|\psi)-(1-\epsilon)\right)^2\,]=0.\]
\end{lemma}

\begin{proof}
For any random variable $X$, one can show that :
\begin{align*}
X&=1-\epsilon \mbox{ a.s.} \\
&\;\Rightarrow\;\Expect[(X-(1-\epsilon))^2]=1\cdot(1-\epsilon-(1-\epsilon))^2=0
\end{align*}    
\qquad \quad and that, since a non-negative random variable with zero expectation must be equal to zero almost surely,
\begin{align*}
\Expect&[(X-(1-\epsilon))^2]=0\\
&\;\Rightarrow\; (X-(1-\epsilon))^2 = 0 \mbox{ a.s.} \;\Rightarrow\;X = 1-\epsilon \mbox{ a.s.}
\end{align*}    
\qquad \quad By letting $X:=\Prob(\xi\in\U_\theta(\psi)|\psi)$, we obtain our result.
\vskip -0.2in
\end{proof}
Equipped with lemma \ref{thm:lossEquiv}, we formulate the \quoteIt{theoretical} conditional coverage training problem as $ \min_{\theta\in\Theta}\;\;\mathcal{L}_{CC}(\theta)$. Since the true conditional distribution $\Prob(\xi\in\U_\theta(\psi)|\psi)$ is typically inaccessible to the DM, we propose an approximation that will make  $\mathcal{L}_{CC}(\theta)$ practical.

\subsection{Regression-based Conditional Coverage Loss}

Given a set $\U$, one can define a  binary random variable $y(\psi,\xi,\U) := \1\{\xi \in \U(\psi)\}$, and rewrite the conditional probability distribution $\Prob(\xi\in\U(\psi)|\psi)$ as $\Prob(y(\psi,\xi,\U) = 1|\psi)$. Using the i.i.d sample data in $\Dxipsi$, one can approximate this conditional probability using a parametric model, i.e.  $\Prob(y(\psi,\xi,\U) = 1 |\psi)\approx g_{\phi}(\psi)$ for some $\phi\in\Phi$. The parameters $\phi$ can be calibrated by minimizing the negative conditional log-likelihood of  $\{y(\psi^i,\xi^i,\U)\}_{i=1}^M$:
\begin{align} \label{eq:NLL}
    \phi^*(\U):=\arg\min_{\phi\in\Phi} -\frac{1}{M} \sum_{i=1}^{M} \log\left( g_{\phi}(\psi^i)^{y^i}(1-g_{\phi}(\psi^i))^{1-y^i}\right),
\end{align}

where $y^i:=y(\psi^i,\xi^i,\U)$.
Using the parametric approximation $g_{\phi^*(\U)}(\psi) \approx \Prob(\xi \in \U(\psi)|\psi)$ and replacing the unknown true distribution of $(\psi,\xi)$ with the empirical one, we obtain our regression-based conditional coverage loss function 
\begin{align*}\label{eq:coverageLoss}
\hat{\mathcal{L}}_{CC}(\theta) := \mathbb{E}^{\mathcal{\Dxipsi}}[(g_{\phi^*(\U_\theta)}(\psi) - (1-\epsilon))^{2}].
\end{align*}

The gradient of $\hat{\mathcal{L}}_{CC}(\theta)$ can be obtained using similar decision-focused training methods as employed for $\mathcal{L}_{ECRO}(\theta)$ given that:
\begin{align*}
\nabla_\theta&\hat{\mathcal{L}}_{CC}= \\
&\sum_{i=1}^M 2(g_{\phi^*(\U_\theta)}(\psi^i) - (1-\epsilon))\nabla_\phi g_{\phi^*(\U_\theta)}(\psi^i)\cdot\\ 
&\sum_{j=1}^M \partial \phi^*(\mathcal{E}(\mu,\Sigma_\theta(\psi^i))) / \partial y^j\cdot \\
&\bigg(\nabla_\mu y^j(\psi^j,\xi^j,\mathcal{E}(\mu,\Sigma_\theta(\psi^j)))\big|_{\mu=\mu_\theta(\psi^j)}\nabla_\theta\mu_{\theta}(\psi^j) \\
&+ \nabla_\Sigma y^j(\psi^j,\xi^j,\mathcal{E}(\mu_\theta(\psi^j),\Sigma))\big|_{\Sigma=\Sigma_\theta(\psi^j)}\nabla_\theta\Sigma_{\theta}(\psi^j)\bigg),
\end{align*}
where the main challenges reside again in the step of differentiating through the minimizer of problem \eqref{eq:NLL}.

\subsection{Dual Task based Set (\EECRO{}) algorithm}
\begin{figure}[!ht]
    \centering
    \tikzset{
      basic/.style  = {draw, text width=5em, drop shadow,  rectangle},
      root/.style   = {basic, thin, align=center,
                   fill=gray!45 , text width=5em},
      level 2/.style = {basic, thin,align=center, fill=gray!30,
                   text width=5em},
      level 3/.style = {basic, thin, align=left, fill=gray!20, text
      width=5em, node distance = 40pt} 
    }
    \begin{tikzpicture}[node distance=2cm]
    \begin{scope}[scale=0.85, transform shape]
\node (b1) [end] {Estimation};
\node (b2) [startstop, below of=b1, xshift= -1.5cm] {Optimization};
\node (b3) [startstop, below of=b2]{\begin{tabular}{c} CRO \\ Task Loss \end{tabular}};
% \node (b4) [startstop, right of=b1, yshift = -1cm] {};
\node (b5) [startstop, right of=b2, xshift= 1cm] {Regression};
\node (b6) [startstop, right of=b3, xshift= 1cm]{\begin{tabular}{c} Coverage \\ Task Loss \end{tabular}};
\node (b7) [startstop, below of=b3, xshift= 1.5cm]{\begin{tabular}{c} Dual \\ Task Loss \end{tabular}};
% \draw [arrow] (b1) -- node [midway,above] {} node [midway,below] {} (b2);
\draw [arrow] (0,1.5) -- (b1) 
    node [midway,right] {$\Dxipsi$} node [midway,below] {};
\draw [arrow] (b1) -- (b2) node [midway,right, xshift= 0.4cm] {$\U_{\theta}$};
\draw [arrow] (b1) -- (b5);
\draw [arrow] (b5) -- (b6) node [midway,right]  {$g_{\phi^*(\U_\theta)}(\cdot)$};

\draw [arrow] (b2) -- (b3) node [midway,right] {$x^*(\cdot,\U_\theta)$};
\draw [arrow] (b3) -- (b7) node [midway,left] {};
\draw [arrow] (b6) -- (b7);
% \draw [thick] (2,1.5) -- (2, -4);
% \draw [arrow] (2, -4) -- (b3);
\draw [thick] (b7) -- (-3, -6);
\draw [thick] (-3,-6) -- (-3, 0) node [midway, left] {$\nabla_\theta \mathcal{L}_{DT}(\theta)$}; 
\draw [arrow] (-3,0) -- (b1);
\draw [arrow] (b7) -- (0,-7.5) node [midway, right] {$\U_{\theta^*}, x^*(\cdot,\U_{\theta^*})$};
   \end{scope}
\end{tikzpicture}\\
    \caption{Training pipeline for dual task based learning  }
    \label{fig:TSCovpipeline}\vskip -0.1in
    \end{figure}
We conclude this section with the presentation of our novel integrated algorithm that learns the contextual uncertainty set network $F_\theta$ by incorporating both the risk mitigation and conditional coverage tasks in the training. Indeed our \EECRO{} training algorithm minimizes the following double task loss function that trades off between the two task objectives: 

\begin{equation}
\mathcal{L}_{DT}(\theta) = \gamma \mathcal{L}_{ECRO}(\theta)+(1-\gamma)\hat{\mathcal{L}}_{CC}(\theta).\label{eq:jointLoss}
\end{equation}

The training pipeline for this algorithm can be seen in figure \ref{fig:TSCovpipeline}. It closely mirrors the structure of the \ECRO{} algorithm, with additional crucial steps to compute the necessary components of the loss presented in \eqref{eq:jointLoss}. Within each epoch, the predicted uncertainty set $\U_\theta$ serves two purposes: i) solving the CRO problem to find the optimal policy $x^*(\cdot,\U_\theta)$ and assessing its associated risk; and ii) producing the binary variable $y(\psi,\xi,\U_\theta)$, whose regression on $\psi$ yields $g_{\phi^*(\U_\theta)}(\cdot)$, which serves to quantify the quality of the conditional coverage. 
The sum of task losses produces $\mathcal{L}_{DT}(\theta)$, which can be differentiated using decision-focused learning methods. The regression model $g_\phi(\psi)$ takes the form of a feed-forward neural network with a sigmoid activation in the final layer and is optimized using stochastic gradient descent. Algorithm \ref{alg:dualecrotraining} in appendix \ref{algos} presents the details of this \EECRO{} algorithm.

\begin{remark}
It is to be noted that achieving distribution-free finite sample conditional coverage guarantees is known to be impossible in the conformal prediction literature (see \cite{barber:limitsCondConf}). Recently, some progress has been made towards partial forms of conditional coverage guarantees (see \cite{gibbs2023conformal}) yet it is unclear what are the implications of exploiting such partial coverage properties for the downstream CRO decisions. It is also unclear how such conditional conformal prediction procedures could be integrated within an end-to-end CRO approach.
\end{remark}
\begin{figure*}  [htbp]
\centering
\subfloat[]{
  \includegraphics[width=0.34\textwidth]{results/cov_155.png}
}\hspace{-1.5em}%
% \vspace{0.1mm}
\subfloat[]{
  \includegraphics[width=0.34\textwidth]{results/cov_192.png}
}\hspace{-1.5em}%
% \vspace{0.1mm}
\subfloat[]{
  \includegraphics[width=0.34\textwidth]{results/cov_816.png}
}\\
\legendsquare{lgreen}~ \begin{tiny} \CROCS{} \end{tiny} \quad 
 \legendsquare{dgreen}~ \begin{tiny} \CROCCS{} \end{tiny} \quad  
\legendsquare{lblue}~\begin{tiny} \CROES{} \end{tiny}\quad
\legendsquare{lred}~ \begin{tiny} \ECRO{} \end{tiny} \quad
\legendsquare{rred}~ \begin{tiny} \EECRO{} \end{tiny}
\caption{Comparison of uncertainty set ($\alpha$ = 0.9) coverage for different $\psi$ realizations: (a) $[2.5, -0.2]^T$, (b) $[-2.6, 0.5]^T$, (c) $[2.7, 1.9]^T$. The shading indicates the true conditional distribution.\label{plot:covcomp}}\vskip -0.2in
\end{figure*}
\section{Experiments}
This section outlines our experimental framework devised to demonstrate the advantages of the ECRO method in learning the uncertainty sets tailored to covariate information. Our focus lies in assessing the utility of the model in i) improving the CRO performance; and ii) achieving conditional coverage. We conduct a comparative analysis between our two end-to-end approaches, \ECRO{} and \EECRO{}, and three state-of-the-art ETO approaches to formulate contextual ellipsoidal sets. We first consider a Distribution-based contextual ellipsoidal uncertainty Set (\CROES{}) recently introduced in \cite{blanquero2023contextual}, where the conditional distribution of $\xi$ given $\psi$ is presumed to follow a multivariate normal distribution. Additionally, we explore two distribution-free approaches. A vanilla Conformal Prediction Set (\CROCS{}) uses conformal prediction on the output of a point predictor for $\xi$ given $\psi$, after shaping the ellipsoid (through an invariant $\Sigma$) using the residual errors (see \cite{johnstone2021conformal}). An Adapted version of Conformal Prediction Set (\CROCCS{}) (proposed in \cite{messoudi2022ellipsoidal}) adapts the shape $\Sigma$ using local averaging around the observed $\psi$. The code can be found on the \href{https://github.com/Achenred/End-to-end-CRO}{GitHub}\footnote{\url{https://github.com/Achenred/End-to-end-CRO}} repository.

\subsection{The portfolio optimization application}

We explore the effectiveness of the proposed methodologies in addressing a classic robust portfolio optimization problem. In this context, we define the cost function $c(x,\xi)$ as $-\xi^Tx$, where $x$ represents a portfolio comprising investments in $m$ different assets, with their respective returns denoted in the random vector $\xi$. Additionally, we impose constraints on $x$, encapsulated within $\mathcal{X}$, defined as $\mathcal{X} := \{ x \in \Re^m | \sum_{i = 1}^{m}x_i = 1, x \geq 0\}$. For this cost function, we obtain the partial concave conjugate function:
\begin{align}
\bar{c}_*(x, v) = \inf_{\xi : \|\xi\|_2\leq R_\xi}  v^T \xi + \xi^T x= -R_\xi\|v+x\|_2,
\end{align}
thus leading to problem \eqref{eq:xopt} becoming

\begin{equation}
\begin{aligned}
\label{eq:OptPortfolioreformed2}
&\min_{x \in \mathcal{X}} f(x,\psi):=-x^T\mu_\theta(\psi)+  \sqrt{x^T \Sigma_\theta(\psi) x}
\end{aligned}
\end{equation}
when $R_\xi\rightarrow \infty$, thus capturing $\Xi:=\Re^m$.

\subsection{CRO performance using synthetic data} \label{section:sytheticExp}

We first consider a simple synthetic experiment environment where $m=2$ and where the pair $(\psi,\xi)$ is drawn from a mixture of three 4-d multivariate normal distributions. We sample $N=2000$ observations and use 600 observations for training, 400 for validation, and 1000 for testing. All our results present statistics that are based on 10 simulations, each of which employed a slightly modified mixture model (see section \ref{sec:synth_exp_data}\removed{\href{https://anonymous.4open.science/r/End-to-end-CRO-513E/README.md}{github repository}} for details). The \ECRO{} and \EECRO{} algorithms leverage deep neural networks with the corresponding task losses to learn the necessary components ($\mu_\theta(\psi),\Sigma_\theta(\psi)$) of $\mathcal{U}_\theta(\psi)$. 
All sets are calibrated for a probability coverage of 90\% and the risk of decisions is measured using CVaR at risk level $\alpha=0.9$. 
We also consider an \quoteIt{oracle} method that leverages the exact knowledge of the underlying distribution as an additional benchmark. The method is based on formulating a scenario tree approximation of the joint distribution of $\psi$ and $\xi$ in order to obtain an investment policy that minimizes the CVaR objective \eqref{eq:E2Eloss} directly. More details can be found in the Appendix section \ref{sec:optimalOracle}. The average CVaR objective values and marginal coverages of the uncertainty sets can be found in the table \ref{tab:cvarsample}. 

One can notice that the end-to-end based methods, \ECRO{} and \EECRO{} significantly outperform the ETO methods on the CVaR performance. It appears that in order to maintain the required marginal coverage, the ETO approaches learned sets that resulted in overly conservative RO solutions. We also observe that the \ECRO{} and \EECRO{} models achieve a CVaR performance that is very close to our estimate of the best achievable performance, i.e. the oracle method's performance.

\begin{figure}[!t]
\centering
{
  \includegraphics[width=80mm]{results/cumdistplot.pdf}\\
}%\\
\legendsquare{lgreen}~ \begin{tiny} \CROCS{} \end{tiny} \quad 
 \legendsquare{dgreen}~ \begin{tiny} \CROCCS{} \end{tiny} \quad  
\legendsquare{lblue}~\begin{tiny} \CROES{} \end{tiny}\quad
\legendsquare{lred}~ \begin{tiny} \ECRO{} \end{tiny} \quad
\legendsquare{rred}~ \begin{tiny} \EECRO{} \end{tiny}
\caption{Average cumulative distribution of conditional coverage frequency when $\psi$ is sampled uniformly from the dataset over 10 simulated environments. Shaded regions represent the 90\% CI.}\label{plot:cumplot}\vskip -0.1in
\end{figure}

\begin{table}[!ht]
% \vskip 0.15in
\begin{center}
\scalebox{0.9}{
\begin{sc}
\begin{tabular}{lcc}
\toprule
        Method & CVaR  &  Marginal Coverage \\
        \midrule
        \CROCS{} & $1.59 \pm 0.03$ & $91 \pm 1.8\%$   \\
        
        \CROCCS{} & $1.68 \pm 0.04$ & $91 \pm  1.4\%$  \\
        
        \CROES{} & $1.66 \pm 0.06$  &$85 \pm  7.8\%$ \\
        
        \ECRO{} & $1.05 \pm 0.09$ & $23 \pm  6.1\%$ \\
        
        \EECRO{} & $1.07 \pm 0.09$ &$92\pm  1.5\% $\\
        \midrule
        Oracle  & $1.06 \pm 0.10$ &$-$\\
        % CVaR & & \\
\bottomrule
\end{tabular}
\end{sc}
}
\end{center}
\vskip -0.1in
\caption{Avg. CVaR and marginal coverage for $\alpha =1-\epsilon= 0.9$ over 10 simulated environments; errors represent the 90\% CI. Note that the oracle method exploits full information about the Gaussian mixture model.  \label{tab:cvarsample}}
\label{table:SyntheticResults}
\vskip -0.2in
\end{table}

Additionally, all the models except \ECRO{} appear to achieve a marginal coverage of approximately $90\%$, which corresponds to the target level $1-\epsilon$ they are calibrated for. By disregarding the aspect of coverage, \ECRO{} was able to improve on the CVaR task but performs poorly in terms of coverage. Comparatively, the dual task based approach \EECRO{} %using the regression based conditional coverage loss, it 
was able to improve on the CVaR performance over the ETO approaches while still maintaining the necessary coverage. 


As pointed out earlier, conditional coverage is a highly desirable property. Given that a synthetic environment gives us access to exact measurements of conditional coverage, figure \ref{plot:cumplot} presents the cumulative distribution of the observed conditional coverage frequencies when $\psi$ is sampled uniformly from the data set. One can notice from the plot that \CROES{}, despite being closer to the required marginal coverage, failed to provide accurate conditional coverage. Among the methods that use conformality score to calibrate the radius, \CROCCS{} method which uses localized covariance matrices has better conditional coverage. However, this comes at the price of CVaR performance. The advantages of the dual task-based approach, \EECRO{}, over the single task one are obvious. While \EECRO{} appears to have overshot the coverage compared to \CROCCS, which aligns closer to 90\%,  we argue that this is not an issue as it ends up providing more coverage than needed while generating nearly the best average CVaR value.
In figure \ref{plot:covcomp} which overlays the various sets learned on the conditional distribution of $\xi$, one can notice that the sets adapt to the covariate information $\psi$ to provide the necessary conditional coverage. 

\begin{figure*}[!ht]
\centering
\subfloat[\begin{tiny}2017\end{tiny}]{
  \includegraphics[width=0.34\textwidth]{results/2017.pdf}
}\hspace{-1.5em}%
% \vspace{0.1mm}
\subfloat[\begin{tiny}2018\end{tiny}]{
  \includegraphics[width=0.34\textwidth]{results/2018.pdf}
}\hspace{-1.5em}%
% \vspace{0.1mm}
\subfloat[\begin{tiny}2019\end{tiny}]{
  \includegraphics[width=0.34\textwidth]{results/2019.pdf}
}\\
\legendsquare{lgreen}~ \begin{tiny} \CROCS{} \end{tiny} \quad 
 \legendsquare{dgreen}~ \begin{tiny} \CROCCS{} \end{tiny} \quad  
\legendsquare{lblue}~\begin{tiny} \CROES{} \end{tiny}\quad
\legendsquare{lred}~ \begin{tiny} \ECRO{} \end{tiny} \quad
\legendsquare{rred}~ \begin{tiny} \EECRO{} \end{tiny}
\caption{Avg. CVaR of returns across 10 portfolio trajectory simulations. Error bars report 95\% CI.}\label{results:portfolio}
\end{figure*}

\subsection{CRO using US stock data}\label{portfolioexp}

We follow the experimental design methodology proposed in \cite{chenreddy2022data}. Our experiments utilize historical US stock market data, comprising adjusted daily closing prices for 70 stocks across 8 economic sectors from January 1, 2012, to December 31, 2020, obtained via Yahoo! Finance's API. Each year contains 252 data points, and we calculate percentage gain/loss relative to the previous day to construct our dataset, denoted as $\xi$. We incorporate the trading volume of individual stocks and other market indices as covariates. We test the robustness of all the models' performance by solving the portfolio optimization problem on randomly selected stock subsets across different periods. Utilizing 15 stocks in each window, we ran the experiment ten times over three moving time frames. We maintain consistent parameters (learning rate $lr$, number of epochs $T$, maximum number of TRO steps $K$, $\gamma$). Further implementation and parameter tuning details can be found in appendix \ref{sec:portfolio}. Figure \ref{results:portfolio} compares the avg. CVaR of returns and table \ref{tab:marginal_coverage} presents the marginal coverage across different confidence levels for models.

It is evident from the CVaR comparison that the task based methods \ECRO{} and \EECRO{} consistently perform better than the ETO models. Among ECRO approaches, we can clearly observe an advantage for \EECRO{}  over  \ECRO{}, which has on par CVaR performance while having out of sample marginal coverage closer to the expected target level. Conformal-based ETO methods have good marginal coverage as they are designed to have the desired coverage. %Evidently, E2E models perform well on the avg. 
In particular, \CROCCS{} and \CROCS{}, being calibrated using conformal prediction, which produces statistically valid prediction regions, have near-target coverage levels. 

\begin{table}[!ht]
% \vskip 0.1in
\begin{center}
\scalebox{0.9}{
\begin{sc}
\begin{tabular}{lclll}
\hline
Model & \multicolumn{1}{l}{Year} & \multicolumn{3}{c}{Marginal cov. (\%)} \\ \cline{3-5} 
 &  & \multicolumn{3}{c}{Target $1-\epsilon$} \\ 
 & \multicolumn{1}{l}{} & 70\% & 80\% & 90\% \\ \hline
\CROCS{} & \multirow{5}{*}{2017} & 68 & 78 & 87 \\
\CROCCS{} &  & 68 & 77 & 89 \\
\CROES{} &  & 54 & 72 & 85 \\
\ECRO{} &  & 22 & 26 & 28 \\
\textbf{\EECRO{}} &  & \textbf{72} & \textbf{79} & \textbf{88} \\ \hline
\CROCS{} & \multirow{5}{*}{2018} & 67 & 79 & 88 \\
\CROCCS{} &  & 68 & 78 & 87 \\
\CROES{} &  & 59 & 75 & 87 \\
\ECRO{} &  & 23 & 24 & 29 \\
\textbf{\EECRO{}} &  & \textbf{71} & \textbf{80} & \textbf{93} \\ \hline
\CROCS{} & \multirow{5}{*}{2019} & 69 & 78 & 88 \\
\CROCCS{} &  & 71 & 78 & 89 \\
\CROES{} &  & 61 & 76 & 86 \\
\ECRO{} &  & 26 & 30 & 32 \\
\textbf{\EECRO{}} &  & \textbf{69} & \textbf{78}& \textbf{92} \\ \hline
\end{tabular}
\end{sc}
}
\end{center}
% \vskip 0.15in
\caption{Marginal Coverage}
% \vskip 0.1in
\label{tab:marginal_coverage}
\end{table}

\section{Conclusion} % and Future Work}
In summary, the paper introduces a novel framework for conditional robust optimization by combining machine learning and optimization techniques in an end-to-end approach. The study focuses on enhancing the conditional coverage of uncertainty sets and improving CRO performance. Through comparative analysis and simulated experiments, the proposed methodologies show superior results in robust portfolio optimization. The findings point to the importance of uncertainty quantification and highlight the effectiveness of an end-to-end approach in risk averse decision-making under uncertainty.

\subsubsection*{Acknowledgments}
The authors gratefully acknowledge support from the Institut de Valorisation des Données (IVADO), 
the Canadian Natural Sciences and Engineering Research Council [RGPIN-2022-05261], and the Canada Research Chair program [CRC-2018-00105].

\newpage
\bibliography{references}
% \bibliographystyle{icml2024}
\clearpage
\appendix
\section{Algorithms} \label{algos}
\subsection{\EECRO{} algorithm}
\begin{algorithm}[!ht]
  \caption{Dual ECRO Training with Trust Region Solver}\label{alg:dualecrotraining}
  \begin{algorithmic}[1]
  \State \textbf{input}: dataset $\Dxipsi$, max epochs $T$, max TRO steps $K$, batch size $N$, protection level $\alpha$
  \State Initialize a warm start  buffer $\{\bar{x}_1,\dots,\bar{x}_M\}$ with each $\bar{x}_i\in\mathcal{X}(\psi_i)$
  \State Initialize network parameters $\theta$ and $t=1$
%    \Procedure{train}{$\Dxipsi$}\Comment{train}
%      \State \textbf{init} $\mu_{\theta_0}(\psi), \Sigma_{\theta_0}(\psi), R_{\theta_0} \sim   f_{\theta_0}(\psi)$ 
      \While{not converged and ($t \le T$)}
        \State Sample a batch of $N$ indices $\mathcal{B}\subset \{1,\dots,M\}$
         \For{$i\in \mathcal{B}$}
%         \State\textbf{CRO Task Loss}
         \State //Run TRO for up to $K$ steps
        \State $x_i^t, \lambda_i^t, \nu_i^t \gets$ \textsc{TRO}($\bar{x}_{i}$, $\mu_{\theta}(\psi_{i}),\Sigma_\theta(\psi_{i}), K)$
        \State $\bar{x}_i\gets x_i^t$ \Comment{Update warm start}
%        \State\textbf{Coverage Task Loss}
        \State $y_i^t \gets \1\{\xi_i \in \mathcal{E}(\mu_\theta(\psi_i),\Sigma_\theta(\psi_i))\}$
%        \State $g_{\phi^*(\U)}(\psi,\xi;\theta) \approx \Prob(\xi \in \U(\psi)|\psi)$ 
      \EndFor
      \State $\phi^t \gets$ \textbf{solve} prob \eqref{eq:NLL} for $\{(\psi_i,y_i^t)\}_{i\in\mathcal{B}}$
%        \State \textbf{Store} $x^t(\psi_i)_{i = 1}^{N} $ in $R$
        \State \textbf{compute} $\mathcal{L}_{DT}(\theta)$ and $\nabla_\theta\mathcal{L}_{DT}(\theta)$ for $i\sim\mathcal{B}$
        \State $\theta \gets \theta - \mbox{step size}\cdot\nabla_\theta\mathcal{L}_{DT}(\theta)$ 
        %CVaR loss  $\mathcal{L}_{ECRO}(x^t,\xi^t) $
        \State $t \gets t + 1$
      \EndWhile
      
      \State \textbf{return} $\theta$
%    \EndProcedure
  \end{algorithmic}
\end{algorithm}

\section{Supplementary for Experiments}

\subsection{Synthetic data generation process}
\label{sec:synth_exp_data}

Our synthetic experiments rely on a set of  mixtures of three multivariate normal distributions created in a way that produces a bimodal mixture of a normal distribution with a possibly non-normal one with similar covariance matrix. Specifically, each mixture model is constructed using the same three mean vectors 
\(\mu_a = \begin{bmatrix} 0 & 0 & 0 & 0 \end{bmatrix}^T\), \(\mu_b = \begin{bmatrix} 0 & 5 & 5 & 0 \end{bmatrix}^T\), and \(\mu_c = \mu_b\) while the covariance matrices take the form 
\[
\Sigma_a = \begin{bmatrix}
1 & 0 & 0.37 & 0 \\
0 & 1.5 & 0 & 0 \\
0.37 & 0 & 2 & 0.73 \\
0 & 0 & 0.73 & 3
\end{bmatrix},
\]
\(\Sigma_b = \alpha \Sigma_a\) and \(\Sigma_c = \frac{\Sigma_a}{\alpha}\) for some $\alpha\in(0,1]$, which controls the non-normality of the second mode.
We also introduce asymmetry in the mixture model by using the mixing proportions \(p_a = \phi\), \(p_b = \frac{1 - \phi}{\alpha + 1}\), and \(p_c = \frac{\alpha (1 - \phi)}{\alpha + 1}\) for some $\phi\in[0,1]$, which controls the dominance of the first mode over the second. The proportions $p_b$ and $p_c$ are chosen such that the covariance matrix of the non-normal mode (the mixture of components $b$ and $c$) is equal to the covariance of the normal one, $\Sigma_a$.

\newpage
\subsection{Synthetic conditional data generation}\label{sec:synth_data_gen}

To generate conditional samples for the synthetic data generated in Section~\ref{sec:synth_exp_data}, we first compute the conditional mean \(\mu_{\xi|\psi}\) and covariance \(\Sigma_{\xi|\psi}\) of \(\xi\) given the observed variables \(\psi\) for each mixture component. Specifically, for each mean vector \(\mu\) and covariance matrix \(\Sigma\) associated with the mixture components (denoted as \(a\), \(b\), and \(c\) in Section~\ref{sec:synth_exp_data}), we calculate the conditional parameters as
\begin{align*}
\mu_{\xi|\psi} &= \mu_\xi + \Sigma_{\xi\psi} \Sigma_{\psi\psi}^{-1} (\psi - \mu_\psi), \\
\Sigma_{\xi|\psi} &= \Sigma_{\xi\xi} - \Sigma_{\xi\psi} \Sigma_{\psi\psi}^{-1} \Sigma_{\psi\xi}.
\end{align*}
Next, we determine the conditional probability of each mixture component given the observation $\psi$ using Bayes' theorem as $\Prob(\text{mixture}=i|\psi)\propto \Prob(\psi|\text{mixture}=i)\Prob(\text{mixture}=i)$.
Finally, we can use these conditional probabilities to sample new data points from the respective conditional distributions of \(\xi\) given $\psi$. 

\subsection{Parameter tuning procedure}\label{sec:portfolio}
In this section, we describe the parameter tuning methodology used to train the network introduced in Section~\ref{portfolioexp}. Given the time series nature of the data, we employ a rolling window technique for network training. Our architecture depends on a set of hyperparameters, defined as follows: \(\mathit{lr}\) for the learning rate, \(T\) for the maximum number of epochs, \(K\) for the maximum number of TRO steps, \(B\) for the batch size, and \(\alpha\) for the target level. We partition the data into training and validation periods and search for the best hyperparameter combination through grid search. For each combination, we train the network and derive the optimal policy using the training data, then apply it to the unseen validation data. The best combination is selected based on the lowest CVaR on the validation dataset, viewing this as a worst-case return minimization problem.

Regarding the \EECRO{} algorithm, which balances between two losses—the CRO objective and the conditional coverage loss—we follow a specific strategy to identify the best-performing model. At each epoch, we save the model and initiate model selection only after achieving the required training coverage. Subsequently, we retain the best models meeting the coverage criteria until convergence conditions are met. Among all saved models meeting the coverage requirement, we choose the one with the best CVaR objective.

\subsection{Sensitivity analysis}
We conducted a sensitivity analysis of the validation performance as a function of $\gamma$, which balances the CVaR loss and the conditional coverage loss. The table below presents the model performances on the validation data for different values of $\gamma$. It illustrates how varying $\gamma$ enables a trade-off between the two loss objectives.

\[
\begin{array}{c|c|c|c|c|c}
\gamma & 0.01 & 0.1 & 0.5 & 0.9 & 0.99 \\
\hline
\text{avg. } \mathcal{L}_{\text{ECRO}} & 1.30 & 1.05 & 1.04 & 1.06 & 1.05 \\
\text{avg. } \mathcal{L}_{\text{CC}} & 5.49 & 6.25 & 8.15 & 8.98 & 8.81 \\
\end{array}
\]

\subsection{Convergence comparison} \label{fig:covcomp}
\begin{figure}[ht]
\centering
%\subfloat[]{
  \includegraphics[width=0.5\textwidth]{results/modelcomparision.pdf}
%}
\\
\legendsquare{grey}~ \begin{tiny} $5$-steps TRO\end{tiny} \quad 
 \legendsquare{salmon}~ \begin{tiny} TRO \end{tiny} 
\caption{Convergence comparison between $5$-steps TRO (46 min) and full TRO (129 min).\label{results:modelcomp}}% TR took 129 minutes, while K step TR took 46 minutes for $K = 5$}
% \vskip -0.3in
\end{figure}

\subsection{Architecture} \label{architecture}

\begin{figure}[h]
  \centering
  \includegraphics[width=70mm]{NN/NN.pdf}
  % \caption{Example Neural Network.}
  \label{fig:nn}
\end{figure}
We construct a parametric model for \( \mu \) and \( \Sigma \) using a Cholesky decomposition, with Cholesky factor \( L \), to ensure positive definiteness of \( \Sigma \). We employ a shallow neural network architecture with \( m \) input units, one hidden layer of size \( h \), and \( 2m + \frac{m(m-1)}{2} + 1 \) units in the output layer. We use tanh as the activation function and apply softplus to the diagonal elements of \( L \) to ensure that they are strictly positive.

\section{Oracle method for Synthetic Experiments}
\label{sec:optimalOracle}
\removed{
\begin{subequations}
\begin{eqnarray}
\min_{\{x^i\}_{i=1}^N,\lambda,\{s_{ij}\}_{i=1,j=1}^{N,M}} && \lambda + \frac{1}{NM(1-\alpha)}\sum_{i=1}^N \sum_{j=1}^M s_{ij}\\
\subto && s_{ij}\geq 0 \,,\;\forall i=1,\dots,N,\,j=1,\dots,M\\
&& s_{ij}\geq -(\xi^{ij})^T x^i - \lambda \,,\;\forall i=1,\dots,N,\,j=1,\dots,M\\
&& x^i \geq 0\,,\;\forall i=1,\dots,N\\
&& \1^T x^i = 1\,,\;\forall i=1,\dots,N.
\end{eqnarray}
\end{subequations}
}

Given that experiments in section \ref{section:sytheticExp} are based on a synthetic model, we can evaluate the level of sub-optimality of the portfolio policies proposed by the different models. To do so, we developed an \quoteIt{oracle}-based method that has access to the true underlying joint distribution of $\psi$ and $\xi$ and attempts to identify the \quoteIt{true} optimal value of the CVaR objective, namely
%This scenario tree is used in the following  Conditional Value-at-Risk (CVaR) optimization problem:
\[
\min_{\polx:\Psi \rightarrow \mathcal{X}} \text{CVaR}(-\xi^T x(\psi)).
\]
We utilize a scenario tree \(\{\psi^i, \{\xi^{ij}\}_{j=1}^M\}_{i=1}^N\) to approximate the joint distribution of \((\psi, \xi)\), where \(\psi^i \sim F_\psi\) and \(\xi^{ij} \sim F_{\xi|\psi^i}\).
Under such a scenario tree, the CVaR optimization problem reduces to a linear program:
\begin{subequations} \label{app:scenTreeSP}
\begin{align}
\min_{\{x^i\}_{i=1}^N,\lambda,\{s_{ij}\}_{i=1,j=1}^{N,M}} & \lambda + \frac{1}{NM(1-\alpha)}\sum_{i=1}^N \sum_{j=1}^M s_{ij}\\
\subto \qquad
&  s_{ij}\geq 0 \,,\;\notag\\
&\quad\forall i=1,\dots,N,\,j=1,\dots,M\\
& s_{ij}\geq -(\xi^{ij})^T x^i - \lambda \,,\;\notag\\
& \quad \forall i=1,\dots,N,\,j=1,\dots,M\\
& x^i \geq 0\,,\;\forall i=1,\dots,N\\
& \1^T x^i = 1\,,\;\forall i=1,\dots,N.
\end{align}
\end{subequations}

To be consistent with the test environment, we consider the $\{\psi^i\}_{i=1}^N$, with $N=1000$, to take on the values of the test set, while $\{\xi^{ij}\}_{j=1}^M$, for each $i$ with $M=1000$, are randomly sampled from $F_{\xi|\psi^i}$.
This is repeated for the 10 problem instances. The average CVaR optimal value of problem \eqref{app:scenTreeSP} is reported in Table~\ref{tab:cvarsample} as the performance of the oracle method.
\end{document}


