% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{mathtools} % amsmath with fixes and additions
% % \usepackage{siunitx} % for proper typesetting of numbers and units
% \usepackage{booktabs} % commands to create good-looking tables
% \usepackage{tikz} % nice language for creating drawings and diagrams

% \input{style}
\usepackage{graphicx}

\usepackage{float,epstopdf}
\usepackage{bbm}

\usepackage{microtype}

% \usepackage{subfig}
% \usepackage{subfigure}
\usepackage{subcaption}
\usepackage{booktabs}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{multicol}
\usepackage{multirow} 
\usepackage{amsfonts} 
\usepackage{mathrsfs}
\usepackage{fancyhdr}
\usepackage[amssymb, thickqspace]{SIunits}
\usepackage{enumitem}
\usepackage{pgfplotstable}
\usepackage{arydshln}

\usepackage{cases}


\usepackage{algorithm}
\usepackage{algorithmic}
% Relabel the keywords of the algorithmic package: the "require" line is
% printed as "Input:" and the "ensure" line as "Output:", both in bold.
\renewcommand{\algorithmicrequire}{\textbf{Input:}} 
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\usepackage{url}
\def\UrlBreaks{\do\/\do-}
\usepackage{xr}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; the optional separator
% defaults to "-". Template example macro.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \input{comments}

% for underbar
% \munderbar{<math>}: underline a math expression while ignoring its depth.
% The argument is typeset in math mode into box register 2 (\tw@), the depth
% of that box is set to zero (\dp\tw@\z@), and the box is then placed inside
% \underline, so the rule is drawn as if the argument had no descender.
% \makeatletter/\makeatother are needed because \tw@ and \z@ contain "@".
\makeatletter
\def\munderbar#1{\underline{\sbox\tw@{$#1$}\dp\tw@\z@\box\tw@}}
\makeatother

% Theorem-like environments (amsthm). "definition" owns the counter, which is
% reset at every \section; all other environments reuse that same counter, so
% definitions, theorems, lemmas, etc. are numbered in one shared sequence
% (e.g. Definition 2.1, Theorem 2.2, ...).
\newtheorem{definition}{Definition}[section]
\newtheorem{theorem}[definition]{Theorem}
\newtheorem{lemma}[definition]{Lemma}
\newtheorem{remark}[definition]{Remark}
\newtheorem{corollary}[definition]{Corollary}
\newtheorem{proposition}[definition]{Proposition}
\newtheorem{observation}[definition]{Observation}
\newtheorem{claim}[definition]{Claim}
\newtheorem{example}[definition]{Example}
\newtheorem{assumption}[definition]{Assumption}
%==================================
% Inline draft annotations, one macro per author: each prints its argument in
% bold, colored text wrapped in parentheses. These are drafting aids and
% should not appear in the camera-ready text.
% NOTE(review): \color needs color/xcolor, which is not loaded explicitly in
% this preamble -- presumably it comes in via todonotes/pgfplotstable; confirm.
\newcommand{\tduy}[1]{\textbf{{\color{red}(Tuan-Duy: #1)}}}
\newcommand{\viet}[1]{\textbf{{\color{magenta}(Viet says: #1)}}}
\newcommand{\note}[1]{\textbf{{\color{red}(#1)}}}
\newcommand{\MC}[1]{\textbf{{\color{blue}(MC says: #1)}}}

%======================================
% Shorthands for display environments:
%   \be ... \ee    numbered equation
%   \bea ... \eea  unnumbered equation containing an aligned block
\newcommand{\be}{\begin{equation}}
\newcommand{\ee}{\end{equation}}
\newcommand{\bea}{\begin{equation*}\begin{aligned}}
\newcommand{\eea}{\end{aligned}\end{equation*}}
% \ds forces display style inside inline or fractional material.
\newcommand{\ds}{\displaystyle}
% \independent: statistical-independence relation (a doubled \perp).
% \mathpalette calls the helper with the current math style as #1 and \perp
% as #2; the helper overprints two copies of the symbol shifted by 2mu and
% wraps the result in \mathrel so it is spaced as a relation.
% The helper was previously undefined, which made \independent unusable;
% \providecommand avoids a clash if another file already supplies it.
\newcommand{\independent}{\protect\mathpalette{\protect\independenT}{\perp}}
\providecommand{\independenT}[2]{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
% \R: the set of real numbers.
\newcommand{\R}{\mathbb{R}}

% Optimization operators with the subscript forced underneath (\limits).
% Each macro ends with "_", so the braced group that follows becomes the
% subscript: \Min{x \in A} expands to \min\limits_{x \in A}.
\newcommand{\Max}{\max\limits_}
\newcommand{\Min}{\min\limits_}
\newcommand{\Sup}{\sup\limits_}
\newcommand{\Inf}{\inf\limits_}
% \Tr{A}: trace of A with \big brackets. \Trace is declared with
% \DeclareMathOperator further down in this preamble; this is fine because
% the macro is only expanded when used in the document body.
\newcommand{\Tr}[1]{\Trace \big[ #1 \big]}

% Accent and font shorthands; each takes the next token/group as argument.
\newcommand{\wh}{\widehat}
\newcommand{\mc}{\mathcal}
\newcommand{\mbb}{\mathbb}
% \inner{a}{b}: inner product with \big angle brackets.
\newcommand{\inner}[2]{\big \langle #1, #2 \big \rangle }

% Blackboard-bold symbols for probability measures; \Pnom is the hatted
% (nominal) measure. \dd is an upright differential "d".
\newcommand{\PP}{\mbb P}
\newcommand{\Pnom}{\wh{\mbb P}}
\newcommand{\QQ}{\mbb Q}
\newcommand{\DD}{\mbb D}
\newcommand{\dd}{\mathrm{d}}




%\newcommand{\est}{X\opt}
%\newcommand{\estL}{\lambda\opt}
%\newcommand{\estx}{x\opt}

% Named math operators, typeset upright with operator spacing (amsmath).
% The unstarred form puts sub/superscripts to the side; the starred form
% (used for \Argmin only) puts them underneath in display style.
\DeclareMathOperator{\Trace}{Tr}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\Diag}{Diag}
\DeclareMathOperator{\sign}{sign}

\DeclareMathOperator{\st}{s.t.}
\DeclareMathOperator{\nullspace}{null}
\DeclareMathOperator{\vect}{vec}
\DeclareMathOperator{\KL}{KL}
\DeclareMathOperator{\FR}{FR}
\DeclareMathOperator*{\Argmin}{Argmin}
\DeclareMathOperator{\Minval}{Min}
\DeclareMathOperator{\grad}{grad}
\DeclareMathOperator{\Hess}{Hess}
\DeclareMathOperator{\Proj}{Proj}

% \defeq: "defined as" relation, built by stacking a script-script-size
% triangle on top of an equals sign. The two symbols are rows of a centered
% one-column \ialign with \offinterlineskip (no interline glue), and the
% resulting \vbox is wrapped in \mathrel so it is spaced as a relation.
% "##" is the alignment-template "#" doubled because it sits inside a macro.
\newcommand{\defeq}{%
	\mathrel{\vbox{\offinterlineskip\ialign{%
				\hfil##\hfil\cr
				$\scriptscriptstyle\triangle$\cr
				%\noalign{\kern0ex}
				$=$\cr
}}}}

% Peyman's comments
\newcommand{\PSD}{\mathbb{S}_{+}} % the set of positive semi-definite matrices of dimension p
\newcommand{\PD}{\mathbb{S}_{++}} % the set of positive semi-definite matrices of dimension p
\newcommand{\PDsigma}{\mathbb{S}_{\ge \sigma}}
\newcommand{\Let}{\triangleq}%https://www.overleaf.com/project/5ea9d406e0687e00012e28ab
\newcommand{\opt}{^\star}
\newcommand{\eps}{\varepsilon}
\newcommand{\ra}{\rightarrow}
\newcommand{\M}{\mc M}
\newcommand{\BB}{\mbb B}
\newcommand{\B}{\mc B}
\newcommand{\X}{X}
\newcommand{\Y}{Y}
\newcommand{\Wass}{\mathds{W}}
\newcommand{\V}{\mathds{W}_S}
\newcommand{\Q}{\mbb{Q}}
\newcommand{\EE}{\mathds{E}}
\newcommand{\x}{x}


\newcommand{\half}{\frac{1}{2}}
\newcommand{\dualvar}{\gamma}


\newcommand{\ie}{{\em i.e.}}
\newcommand{\da}{\downarrow}
\newcommand{\J}{\mc J}
\newcommand{\Dataset}{\wh{\mc D}}
\newcommand{\m}{\mu}
\newcommand{\cov}{\Sigma}
\newcommand{\msa}{\wh \m}
\newcommand{\covsa}{\wh \cov}
\newcommand{\bayes}{\mathrm{Bayes}}

%% xr preamble
% \addFileDependency{<file name with extension>}
% Writes the file name in parentheses to the log/terminal (\typeout), appends
% it to LaTeX's file list (\@addtofilelist), and, if the file does not exist,
% writes "No file <name>." to the log. Nothing is typeset.
% NOTE(review): presumably the parenthesised \typeout is there so build tools
% that scan the log pick up the dependency -- confirm against the build setup.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<basename>}
% Calls xr's \externaldocument on <basename> (no extension), then registers
% both <basename>.tex and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% \setlength{\bibsep}{1.3ex}

\myexternaldocument{nguyen_12-supp}

\title{Robust Bayesian Recourse}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Tuan-Duy H. Nguyen}
\author[1]{Ngoc Bui}
\author[1]{Duy Nguyen}
\author[2]{Man-Chung Yue}
\author[1]{Viet Anh Nguyen}
% Add affiliations after the authors
\affil[1]{%
    VinAI Research, Vietnam
}
\affil[2]{%
    The University of Hong Kong
}
  
  \begin{document}
\maketitle

\begin{abstract}
    Algorithmic recourse aims to recommend informative feedback to overturn an unfavorable machine learning decision. We introduce in this paper the Bayesian recourse, a model-agnostic recourse that minimizes the posterior probability odds ratio. Further, we present its min-max robust counterpart with the goal of hedging against future changes in the machine learning model parameters. The robust counterpart explicitly takes into account possible perturbations of the data in a Gaussian mixture ambiguity set prescribed using the optimal transport (Wasserstein) distance. We show that the resulting worst-case objective function can be decomposed into solving a series of two-dimensional optimization subproblems, and the min-max recourse finding problem is thus amenable to a gradient descent algorithm. Contrary to existing methods for generating robust recourses, the robust Bayesian recourse does not require a linear approximation step. The numerical experiment demonstrates the effectiveness of our proposed robust Bayesian recourse facing model shifts. Our code is available at \url{https://github.com/VinAIResearch/robust-bayesian-recourse}.
\end{abstract}

\section{Introduction}
% \note{cite this: https://financialservicesblog.accenture.com/model-lifecycle-transformation-how-banks-are-unlocking-efficiencies, https://arxiv.org/abs/2202.11266}

Humans constantly embark on multiple temporally-extended planning problems throughout the course of their lifespan, and we have several layers of means-end planning in order to achieve the desired goals. For example, to have a successful career as a machine learning researcher, an individual needs to put in persistent effort from their early education to their post-graduate studies, which may span more than twenty years with numerous significant milestones to achieve. Two of these important milestones are the PhD admission and the job application, and arguably, a favorable outcome at these two milestones may propel an individual's career on a more auspicious trajectory than a negative outcome. To help committees make better decisions, machine learning models are increasingly used in both university admission~\citep{ref:waters2014grade} and job hiring~\citep{ref:sami2019applicant}. A similar trend takes place in credit loan applications~\citep{ref:siddiqi2012credit}, healthcare~\citep{ref:mertes2021ganterfactual} and many others.

The increasing reliance on and the long impact of algorithmic decisions raise significant requirements on the trustworthiness and explainability of the machine learning models. These requirements become more urgent as black-box, complex models are also gaining attention due to their superior performance~\citep{ref:garisch2019model}. Post-hoc explanation methods, which extract human-understandable explanations, may help individuals understand machine-produced decisions~\citep{ref:kenny2021explaning}. A post-hoc method must demonstrate why unfavorable predictions are made, and possibly how an input would have to change to obtain a favorable predicted outcome. If the inputs encode the characteristics of human individuals, then a possible post-hoc explanation may come in the form of a recourse. A recourse recommends the actions that an individual should take in order to receive an alternate algorithmic outcome~\citep{ref:ustun2019actionable}. Consider an applicant who is rejected for a particular job: a recourse may come in the form of personalized recommendations such as ``complete a 6-month full-stack engineer internship'' or ``score 20 more points in the ability test'', along with the promise that if the applicant successfully implements the necessary actions then the algorithm will return a favorable outcome.

Several approaches have been proposed to provide recourses for machine learning models~\citep{ref:karimi2021survey, ref:steplin2021survey, ref:mishra2021aso, ref:andre2019computation, ref:pawelczyk2021carla}. \citet{ref:wachter2018counterfactual} used a gradient-based approach to find the nearest counterfactual to the original instance. \citet{ref:ustun2019actionable} proposed an integer programming approach to generate actionable recourses for linear classifiers. \citet{ref:karimi2020modelagnostic} proposed a model-agnostic approach to generate nearest counterfactual explanations while \citet{ref:poyiadzi2020face} generated counterfactuals that are actionable and supported by the ``feasible paths'' of actions. \citet{ref:pawelczyk2020once} found a counterfactual explanation with an upper bound for the costs of counterfactual explanations under predictive multiplicity. \citet{ref:mothilal2020explaining} proposed a framework for generating
and evaluating a diverse set of counterfactual explanations based
on determinantal point processes. \citet{ref:bui2022counterfactual} proposed an uncertainty quantification tool to compute the bounds of the probability of validity of a set of counterfactual explanations and enhanced the validity of this set via a correction tool.

These aforementioned approaches all assume that the underlying machine learning models do not change over time. In practice, this assumption is easily violated as experts update the machine learning system frequently due to data distribution shifts~\citep{ref:quionero2009dataset, ref:geeta2019covariate}. As such, an individual may have accomplished all the recommended actions, but by the next time they apply for the job, the parameters of the model may have already changed and the updated model may still return a negative outcome. In that case, the recourse becomes useless: it is ineffective in overturning a negative prediction, it incurs cost to the applicant, and at the same time it raises substantial doubts about the recourse~\citep{ref:rawal2021algorithmic}. Following this line, a recourse is considered to be robust if it is effective at reversing the algorithmic outcome even under model shifts.

To construct a robust recourse, \citet{ref:upadhyay2021towards} proposed ROAR, a framework that leverages adversarial training to hedge against the perturbation of the model parameter. ROAR considers only linear classifiers; for \textit{non}linear classifiers, ROAR first generates a locally linear approximation of the underlying model (e.g., by using LIME~\citep{ref:ribeiro2016why}), then applies the adversarial training procedure with respect to this locally linear surrogate. However, there are multiple downsides when a locally linear model is used to approximate the nonlinear classifier. Recent works have shown that the locally linear model of LIME has some limitations with both its fidelity and robustness. LIME may not be faithful to the underlying model since it might be influenced by input features at a global scale rather than a local scale~\citep{ref:white2019measurable, ref:laugel2018defining}. At the same time, several works~\citep{ref:alvarez2018on, ref:slack2020fooling, ref:agarwal2021towards} point out that the explanations generated by LIME and other explanation methods may change significantly for nearby original inputs. Moreover, these explanations are even sensitive to the sampling distribution, and they can deliver different explanations of the same input in different simulation runs. Finally, a recourse which is robust for the linear approximation model may not necessarily be robust with respect to the original nonlinear model.


% \tduy{Modally robustness, robust goods vs. robust ills. "Resilient enough to survive situational shifts, actionability should be decoupled from the optimization landscape to prevent instituionalized algorithmic oppression e.g., ethnic marginalization etc.,}

% \tduy{Algorithmic hiring}



% Some literature
% \begin{itemize}
%     \item \cite{ref:kim2012robust}
% \end{itemize}


\textbf{Contributions. } The goal of this paper is to formulate a model-agnostic recourse, which is also valid subject to potential future shifts of the machine learning models. Compared to existing methods such as ROAR~\citep{ref:upadhyay2021towards}, our method does not depend on the linear surrogate of the \textit{non}linear predictive model. Instead, our method looks directly into the sampled data points, and employs a Bayesian approach to generate recourses. Potential shifts of the predictive models are engendered by ``perturbing'' these data samples in an adversarial manner. Concretely, our contributions are as follows.
\begin{itemize}[leftmargin = 5mm]
    \item In Section~\ref{sec:recourse}, we propose the notion of a Bayesian recourse, which minimizes the odds ratio between the posterior probability of negative and positive predicted outcomes. In a non-parametric setting, the likelihood can be approximated using a kernel density estimator built around the data sampled in the neighborhood of the boundary point. This results in the KDE-Bayesian recourse, which can be found by (projected) gradient descent. 
    \item In Section~\ref{sec:robust}, we propose the robust counterpart of the Bayesian recourse problem. This robustification involves smoothing the samples by an isotropic Gaussian convolution, then solving a min-max optimization problem over a Wasserstein-Gaussian mixture conditional ambiguity set. Section~\ref{sec:Wass} details our method of using the optimal transport to form the ambiguity sets on the space of Gaussian mixtures.
    \item In Section~\ref{sec:compute}, we show that the robust Bayesian recourse problem is amenable to separability and dimensionality reduction, thus the recourse can be constructed efficiently even in high dimensions. Section~\ref{sec:numerical} demonstrates that our recourse also performs competitively on both synthetic and real datasets.
\end{itemize}

\textbf{Notations. } We use $\delta_s$ to denote a Dirac measure supported on point $s$. The space of $p$-by-$p$ symmetric, positive semidefinite matrices is denoted by $\PSD^p$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Bayesian Recourse}
\label{sec:recourse}



We consider a generic covariate $X \in \mc X = \R^p$ and a binary predicted label $\hat Y \in \mc Y = \{0, 1\}$, where class 0 denotes an \textit{un}favorable outcome while class 1 denotes a favorable one. Given a pre-specified black-box classifier $\mathcal C$ and an input $x_0$ with unfavorable prediction, i.e., $\mathcal C(x_0) = 0$, the goal of algorithmic recourse is to devise an alternative $x'$ in the vicinity of $x_0$ that satisfies $\mathcal C(x') = 1$. The Bayesian recourse imposes a probabilistic viewpoint into this problem: the goal of Bayesian recourse is to devise an alternative in the vicinity of $x_0$ that has high \textit{favorable posterior probability}. In technical terms, consider the joint random vector of covariate-label $(X, \hat Y) \in \mc X \times \mc Y$, then the class posterior probability of any input $x$ can be represented by the conditional random variable $\hat Y | X = x$.\footnote{In algorithmic recourse, the random variable of interest is the predicted label $\hat Y$ induced by the classifier $\mathcal C$, not the true label $Y$ of the data-generating process. It is important to keep in mind that the (robust) Bayesian recourse is formulated with respect to the predicted label $\hat Y$.} 

\begin{definition}[Bayesian recourse] \label{def:bayes-recourse}
    Given an input $x_0$, let $\mbb X$ be a neighborhood around $x_0$. A Bayesian recourse  $x_{\bayes} \in \mbb X$ is an alternative that minimizes the Bayesian posterior odds ratio, i.e.,
    \[
         x_{\bayes} \Let \arg \Min{x \in \mbb X}~ \displaystyle \frac{\PP( \hat Y = 0 | X = \x)}{ \PP( \hat Y = 1 | X = \x)},
    \]
    for some joint distribution $\PP$ of $(X, \hat Y)$ induced by the sampling of the synthetic covariate $X$ and the synthetic predicted label $\hat Y = \mathcal C(X)$.
\end{definition}

The ratio $\PP( \hat Y = 0 | X = \x)/ \PP( \hat Y = 1 | X = \x)$ is a well-known quantity in Bayesian classification. The posterior probability odds is also a popular ratio in Bayesian statistics, and it has been applied for comparing regression hypotheses~\citep{ref:zellner1981posterior}, econometric models~\citep{ref:geweke1994bayesian}, asset pricing theories~\citep{ref:mcculloch1991bayesian} and collaborative evaluations~\citep{ref:hicks2018bayesian}.

As $x_{\bayes}$ minimizes the Bayesian posterior odds ratio, we can argue that $\PP( \hat Y = 0 | X = x_{\bayes})$ tends to be low, while $\PP( \hat Y = 1 | X = x_{\bayes})$ tends to be high. We next describe how we can solve the optimization problem to get $x_{\bayes}$. Note that the posterior probability can be calculated using the Bayes' theorem~\citep[Theorem~1.31]{ref:schervish1995theory}, and we can instead solve the fractional optimization problem
\[
    \Min{x \in \mbb X}~ \frac{\PP(\hat Y=0) \PP( X = \x | \hat Y = 0) }{ \PP(\hat Y=1) \PP( X = \x | \hat Y = 1) }. 
\]
It is now clear that to find $x_{\bayes}$, we need the marginal probability of $\hat Y$ and the likelihood of $X|\hat Y$. Suppose that we can use a sampling mechanism to sample $n$ covariates $\wh x_i$, then query the given classifier to obtain the predicted labels $\wh y_i = \mathcal C(\wh x_i)$ to form $n$ pairs $(\wh x_i, \wh y_i)$, $i = 1, \ldots, n$. Using these synthetic, labelled samples, we now can formulate the empirical version of Bayesian recourse problem. Let $\mc I_y = \{ i \in [n]: \wh y_i = y\}$ be the indices of samples in class $y \in \mc Y$. Let $N_y = | \mc I_y |$ be the number of training samples with class $y$, then we can use $\gamma_y = N_y/n$, the empirical proportion of data for class $y$, as an estimate of $\PP(\hat Y = y)$.

Next, we take the nonparametric approach to estimate the likelihood $\PP(X = x | \hat Y = y)$ using a kernel density estimator~\citep[Section~1]{ref:tsybakov2008introduction}. As a concrete example, we choose the Gaussian kernel with bandwidth $h > 0$, thus the kernel density estimate of the quantity $\PP(X = x | \hat Y = y)$ is
\[
    L_{\mathrm{KDE}}(\x | \hat Y = y) = \frac{1}{N_y } \sum_{i \in \mc I_y}  \exp\left( - \frac{1}{2h^2} \| \x - \wh x_i \|_2^2 \right).
\]
Thus, the empirical version of the Bayesian recourse, termed the KDE-Bayesian recourse, can be found by solving
\be \label{eq:KDE}
    \Min{x \in \mbb X}~ \displaystyle \frac{\gamma_0 \times L_{\mathrm{KDE}}(  \x | \hat Y = 0) }{ \gamma_1 \times L_{\mathrm{KDE}}( \x | \hat Y = 1)}.
\ee
This problem further simplifies to
\[
    \Min{x \in \mbb X}~\frac{\sum_{i \in \mc I_0}  \exp\left( - \frac{1}{2h^2} \| \x - \wh x_i \|_2^2 \right)}{\sum_{i \in \mc I_1}  \exp\left( - \frac{1}{2h^2} \| \x - \wh x_i \|_2^2 \right)}
\]
by exploiting the definition of $L_{\mathrm{KDE}}$ and $\gamma_y$. In this form, a (projected) gradient descent algorithm can be employed to find the KDE-Bayesian recourse.

There remain two elements to be specified about the formulation of the Bayesian recourse: the sampling scheme to generate covariates $\wh x_i$ and the feasible set $\mbb X$. We discuss these components in the remainder of this section.

\textbf{Sampling scheme. } The goal of the sampling scheme is to synthesize covariate data $\wh x_i$ around the boundary to obtain \textit{local} information from the black-box classifier. Toward this goal, we use a local sampling method, similar to~\citet{ref:vlassopoulos2020explaining} and~\citet{ref:laugel2018defining} as follows.
\begin{itemize}[leftmargin=5mm]
    \item Given an instance $x_0$, we choose $K$ nearest counterfactuals $x_{1}, \ldots, x_K$ from the training data that have favorable predicted outcome, that is, $\mathcal C(x_k) = 1$ for $k = 1, \ldots, K$.
    \item For each counterfactual $x_{k}$, we perform a line search to find a point $x^b_k$ that is on the decision boundary and on the line segment joining $x_{0}$ and $x_{k}$. 
    % To do the line search, we find minimum $\lambda \in [0, 1]$ that has $\mc C(\lambda x_k + (1 - \lambda) x_0) \ge 0.5$ and $x^b_k = \lambda x_k + (1 - \lambda) x_0$.
    \item Among these points $x^b_k$, we choose the nearest point to $x_{0}$ by setting $x^{b} \Let \arg \min_{x^b_k} \{c(x^b_k, x_0)\}$, where $c(\cdot, \cdot)$ is the cost function. We then sample $\wh x_i$ uniformly in a neighborhood determined by an $\ell_2$-ball with radius $r_p$ centered on $x^b$.
\end{itemize} 

\textbf{Feasible set $\mbb X$. } It is desirable to constrain the recourse in a \textit{strict} neighborhood of distance $\delta$ from the input~\citep{ref:venkatasubramanian2020philosophical}. Thus, we can impose a feasible set of the form
\[\mbb X = \{ x \in \mc X ~:~ \varphi(x, x_0) \le \delta \},\] 
where $\varphi$ is a measure of dissimilarity on the covariate space $\mc X$. Alternatively, if we use a boundary sampler as previously discussed, we may also opt for the constraint $\varphi(x, x^b) \le \delta'$ around the boundary point $x^b$. A good choice of $\varphi$ is the $\ell_1$ distance, which promotes sparse modifications to the input.

In order to construct plausible and meaningful recourses, we could additionally consider the actionability constraints that forbid unrealistic recourses. For example, the gender or race of a person should be considered immutable. Likewise, recourse should not suggest an individual reduce their age to achieve a favorable outcome. These constraints could be easily injected into the definition of the feasible set $\mbb X$, similar to~\citet{ref:upadhyay2021towards}. Finding the optimal actionable recourse restricted to this feasible set could be addressed effectively by a projected gradient descent algorithm~\citep{ref:mothilal2020explaining, ref:upadhyay2021towards}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Robust Bayesian Recourse} 
\label{sec:robust} 

The Bayesian recourse in Definition~\ref{def:bayes-recourse} depends on the classifier $\mc C$ as we query $\mc C$ to label the samples $\wh x_i$ via $\wh y_i = \mc C(\wh x_i)$. Thus, inherently, the recourse would possess high posterior probability of favorable outcome with respect to the \textit{present} classifier $\mc C$. Because the parameters defining $\mc C$ may be updated, the Bayesian recourse does not guarantee a high probability of favorable outcome with respect to the \textit{future} classifier $\tilde{\mc C}$. Devising a recourse that has a high probability of future favorable outcome encounters two critical difficulties: first, the classifiers $\mc C$ and $\tilde{\mc C}$ are possibly nonlinear, and second, it is nontrivial to predict the shifts in the parameters of $\tilde{\mc C}$ from the present model $\mc C$. Existing robust recourse methods such as ROAR~\citep{ref:upadhyay2021towards} need to approximate a nonlinear model by a linear model using LIME~\citep{ref:ribeiro2016why}, then robustness is represented by perturbations of the parameters of the linear surrogate.

The robust Bayesian recourse takes a completely different path to ensure robustness by removing the need for an intermediate linear surrogate model. The robust Bayesian recourse aims to perturb directly the empirical conditional distributions of $X | \hat Y = y$, which then reshapes the decision boundary in the covariate space in an adversarial manner. Holistically, our approach can be decomposed into the following steps:
\begin{enumerate}[leftmargin = 5mm]
    \item Forming the empirical conditional distributions of $X | \hat Y = y$, then smoothing them by convolving each data point with isotropic Gaussian noise.
    \item Formulating the ambiguity set for each conditional distribution of $X | \hat Y = y$.
    \item Solving a min-max problem to find the recourse that minimizes the worst-case Bayesian posterior odds ratio.
\end{enumerate}

We now dive into the technical specifications of the robust Bayesian recourse. Recall that the sampling procedure equips us with the samples $(\wh x_i, \wh y_i)_{i=1, \ldots, n}$, and $\mc I_y$ are indices of samples with predicted label $y$.
Let $\Pnom_y^\sigma  = N_y^{-1} \sum_{i \in \mc I_y} \delta_{\wh x_i} * \mc N(0, \sigma^2 I)$ be the \textit{smoothed} empirical conditional distribution of $X | \hat Y = y$, in which $*$ denotes the convolution. Notice that $\Pnom_y^\sigma$ is a mixture of Gaussians with $N_y$ components located at the covariates $\wh x_i$, $i \in \mc I_y$, each with isotropic covariance $\sigma^2 I$. Smoothing the empirical distribution by convolving each sample with noise has also attracted attention recently thanks to its ability to quantify and enhance the robustness of machine learning models~\citep{ref:cohen2020certified}.

We assume now that the conditional distribution can be perturbed in an ambiguity set $\mbb B_{\eps_y}(\Pnom_y^\sigma)$. This set $\mbb B_{\eps_y}(\Pnom_y^\sigma)$ is defined as a neighborhood of radius $\eps_y \ge 0$ centered at the nominal distribution $\Pnom_y^\sigma$. The robust Bayesian recourse is defined as the optimal solution of the following problem
\be \label{eq:dro}
    \Min{x \in \mbb X}~ \displaystyle \Max{\QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma), \QQ_1 \in \mbb B_{\eps_1}(\Pnom_1^\sigma)}~\frac{ \gamma_0 \QQ_0( X = \x) }{ \gamma_1 \QQ_1( X = \x ) }.
\ee
Notice that $\QQ_y$ is a \textit{conditional} probability measure of $X$ given $\hat Y = y$, and thus it is a measure supported on $\R^p$. The value $\QQ_y(X = x)$ is also the likelihood of $x$ under the conditional measure $\QQ_y$, thus problem~\eqref{eq:dro} can be viewed as a robust likelihood ratio minimization problem. Here, robustness is defined with respect to the conditional sets $\mbb B_{\eps_y}(\Pnom_y^\sigma)$ in the following specific sense: the optimal value of problem~\eqref{eq:dro} constitutes a uniform upper bound of the likelihood ratio over all possible choices of conditional distributions in the sets $\mbb B_{\eps_y}(\Pnom_y^\sigma)$. Further, we have explicitly used $\gamma_y$ as an estimator of the marginal distribution of $\hat Y$ in problem~\eqref{eq:dro}.

There is an intimate relationship between the KDE-Bayesian recourse problem~\eqref{eq:KDE} and the robust Bayesian recourse problem~\eqref{eq:dro}. This relationship is established thanks to the smoothing of the empirical conditional distributions, and is highlighted in the following remark.

\begin{remark}[Recovery of the KDE-Bayesian recourse] The smoothed conditional distribution $\Pnom_y^\sigma$ is a mixture of Gaussians, and the likelihood of $x$ under $\Pnom_y^\sigma$ is
 \[
 \frac{1}{N_y (2\pi)^{\frac{p}{2}} \sigma^p} \sum_{i \in \mc I_y}  \exp\left( - \frac{1}{2\sigma^2} \| \x - \wh x_i \|_2^2 \right).
 \] 
 As a consequence, if the ambiguity sets $\mbb B_{\eps_y}(\Pnom_y^\sigma)$ collapse into singletons, that is, $\mbb B_{\eps_y}(\Pnom_y^\sigma) = \{\Pnom_y^\sigma\}$, then problem~\eqref{eq:dro} coincides with the KDE-Bayesian recourse problem~\eqref{eq:KDE} with bandwidth $h = \sigma$, because the normalizing constant $(2\pi)^{\frac{p}{2}} \sigma^p$ appears in both the numerator and the denominator of the ratio and cancels. Thus, problem~\eqref{eq:dro} can be considered as a robustification of the KDE-Bayesian recourse formulation.
\end{remark}

%thus the quantity $\PP(X = x | Y = y)$ can be replaced by the likelihood of a Gaussian mixture as
% \[
%     L(\x, \Pnom_y^\sigma) = \frac{1}{N_y (2\pi)^{\frac{p}{2}} \sigma^p} \sum_{i \in \mc I_y}  \exp\left( - \frac{1}{2\sigma^2} \| \x - \wh x_i \|_2^2 \right).
% \]



\section{Wasserstein-Gaussian Mixture Ambiguity Sets} \label{sec:Wass}

The central notion underlying the robust Bayesian recourse problem~\eqref{eq:dro} is the set of probability measures for the covariate $X$ conditional on $\hat Y = y$. A suitable design of the ambiguity set $\mbb B_{\eps_y}(\Pnom_y^\sigma)$ is critical to enable an efficient resolution of problem~\eqref{eq:dro}. We here propose a novel design of the ambiguity set by merging ideas from the theory of optimal transport and Gaussian mixtures.

Note that any Gaussian distribution is fully characterized by its mean vector and its covariance matrix. As the smoothed measure $\Pnom_y^\sigma$ is a Gaussian mixture, it is associated with the discrete distribution $\wh \nu_y = N_y^{-1} \sum_{i \in \mc I_y} \delta_{(\wh x_i, \sigma^2 I)}$ on the space of mean vector and covariance matrix $\R^p \times \PSD^p$.\footnote{Associated with any mixture of Gaussians $\QQ_y$ on $\R^p$ is a probability measure $\nu_y$ on the mean-covariance space of $\R^p \times \PSD^p$ such that for any measurable set $\mc S \subseteq \R^p$
    \[
        \QQ_y(X \in \mc S) = \int_{\R^p \times \PSD^p} \int_{\mc S} f(\tilde x | \m, \cov) ~\mathrm{d} \tilde x~\nu_y(\mathrm{d} \m, \mathrm{d} \cov),
    \]
    where $f(\cdot | \m, \cov)$ is the density function of the Gaussian distribution $\mc N(\m, \cov)$.
    }
    Moreover, define the set
\[
    \PDsigma^p \Let \{ \cov \in \PSD^p: \cov \succeq \sigma^2 I\} \subset \PSD^p
\]
of covariance matrices whose eigenvalues are lower bounded by $\sigma^2 > 0$, where $\sigma^2$ is the isotropic variance of the smoothing convolution. Notice that we explicitly constrain the covariance matrices to be invertible so that the likelihood function of each Gaussian component is well-defined. For any $y \in \{0, 1\}$, we formally define the ambiguity set as
\begin{align*}
    &\mbb B_{\eps_y}(\Pnom_y^\sigma) \Let \\
    &\left\{ \QQ_y: \begin{array}{l}
    \nu_y \in \mc P(\R^p \times \PDsigma^p),~\Wass_c(\nu_y, \wh \nu_y) \le \eps_y \\
    \QQ_y \text{ is a Gaussian mixture associated with } \nu_y
    %\QQ_y(X \in \mathbb X) = \int_{\R^p \times \PDsigma^p} \int_{\mathbb X} f(x | \m, \cov) ~\mathrm{d} x~\nu_y(\mathrm{d} \m, \mathrm{d} \cov) \qquad \forall \mbb X \subseteq \R^p ~\text{measurable}
    \end{array}
    \right\}.
\end{align*}
Here, $\mc P(\R^p \times \PDsigma^p)$ denotes the set of all possible distributions supported on $\R^p \times \PDsigma^p$. Intuitively, $\mbb B_{\eps_y}(\Pnom_y^\sigma)$ contains all Gaussian mixtures $\QQ_y$ associated with some $\nu_y$ having a distance less than or equal to $\eps_y$ from the nominal measure $\wh \nu_y$. Thus each measure $\QQ_y$ of the random vector $X | Y = y$ is a Gaussian mixture. Each distribution $\nu_y$ is a measure on the space of mean vector-covariance matrix $\R^p \times \PDsigma^p$, and the distance between $\nu_y$ and $\wh \nu_y$ is measured by an optimal transport distance $\Wass_c$. We will use in this paper the type-$\infty$ Wasserstein distance, which is defined as follows.
\begin{definition}[Type-$\infty$ Wasserstein distance] \label{def:wass}
Let $c$ be a nonnegative, symmetric and continuous ground transport cost on $\Xi \Let \R^p \times \PDsigma^p$. The type-$\infty$ Wasserstein distance between two distributions $\nu_1,~\nu_2 \in \mc P(\Xi)$ amounts to
\begin{align*}
&\Wass_{c}(\nu_1, \nu_2) \\
\Let& \Inf{\lambda \in \Lambda(\nu_1, \nu_2)} \left\{ \mathrm{ess} \Sup{\lambda} \big\{ c(\xi_1, \xi_2) : (\xi_1, \xi_2)  \in \Xi \times \Xi \big\} 
\right\},
\end{align*}
where $\Lambda(\nu_1, \nu_2)$ is the set of all couplings of $\nu_1$ and $\nu_2$. 
\end{definition}
	
It remains to specify the ground metric $c$ on the space $\R^p \times \PDsigma^p$. Because the space $\R^p \times \PDsigma^p$ aims to model the mean vectors and the covariance matrices of Gaussian distributions, it is also natural to use a ground metric $c$ that is inspired by the Wasserstein distance between Gaussian distributions. Fortunately, the Wasserstein type-2 distance between Gaussian measures is known in closed form~\citep{ref:olkin1982distance, ref:givens1984class}.
\begin{proposition}[Wasserstein type-2 distance between Gaussian distributions]
The Wasserstein type-2 distance between two $p$-dimensional Gaussian distributions $\mc N(\m, \cov)$ and $\mc N(\msa, \covsa)$ under the Euclidean ground metric amounts to $ \mathds G (\mc N(\m, \cov), \mc N(\msa, \covsa)) = \sqrt{ \| \m - \msa \|_2^2 + \Tr{\cov + \covsa - 2 \big( \covsa^{\half} \cov \covsa^{\half} \big)^{\frac{1}{2}}}}$.
    \end{proposition}

Motivated by the above result, we endow the space $\R^p \times \PDsigma^p$ with the cost function $c$ defined as
    \begin{align*} %\label{eq:c-Wass}
        &c((\m, \cov), (\msa, \covsa))  \\
        &\hspace{1cm} \Let\sqrt{ \| \m - \msa \|_2^2 + \Tr{\cov + \covsa - 2 \big( \covsa^{\half} \cov \covsa^{\half} \big)^{\frac{1}{2}}} }.
    \end{align*}
It is easy to see that $c$ is non-negative, symmetric and continuous on $\R^p \times \PDsigma^p$ and thus $c$ is a valid ground cost for the Wasserstein distance $\Wass_c$ on $\R^p \times \PDsigma^p$.
We should point out that the Wasserstein distance has also been heavily used to construct ambiguity sets in the context of distributionally robust machine learning~\citep{ref:nguyen2019optimistic, taskesen2021sequential, vu2022distributionally}.
Our formulation of $\Wass_c$ is related to the family of optimal transport distances for Gaussian mixtures, which we discuss in the next remark. 


\begin{remark}[OT between Gaussian mixtures]
    Our construction relies on representing a Gaussian mixture distribution as a discrete distribution on the mean vector and covariance matrix space. This construction is motivated by recent work on optimal transport between Gaussian mixtures in~\citet{ref:chen2019optimal} and \citet{ref:delon2020wasserstein}. A clear distinction is that we use $\Wass_c$ as the type-$\infty$ distance in Definition~\ref{def:wass}, while the existing literature focuses on type-1 and type-2 distance. As we later demonstrate in Lemma~\ref{lemma:separability}, the type-$\infty$ construction is critical for the separability of the resulting problem.
\end{remark}
    

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\newpage
\section{Computation}
\label{sec:compute}

In this section, we delineate the solution procedure to find a robust Bayesian recourse with the Wasserstein-Gaussian mixture ambiguity sets formalized in Section~\ref{sec:Wass}. Fix any measure $\QQ_y \in \mbb B_{\eps_y}(\Pnom_y^\sigma)$, then $X | Y = y$ follows a mixture of Gaussian under $\QQ_y$, and we let $L(x, \QQ_y)$ be the Gaussian mixture likelihood of a point $x$ under $\QQ_y$. By internalizing the maximization term inside the fraction and replacing $\QQ_y(X = x)$ by the likelihood $L(x, \QQ_y)$, problem~\eqref{eq:dro} is equivalent to
\[
    \Min{x \in \mbb X}~ F(x), \quad F(x) \Let \displaystyle \frac{\gamma_0 \times \Max{\QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)}~L(\x, \QQ_0) }{ \gamma_1 \times \Min{\QQ_1 \in \mbb B_{\eps_1}(\Pnom_1^\sigma)} L(\x, \QQ_1) }.
\]
In the sequel, we discuss how to evaluate the objective value $F(x)$, sketch the necessary proofs, and provide further insights into the likelihood evaluation problems.

\subsection{Reformulations of the Likelihood Evaluation Problems and Routines} \label{sec:refor}


For any $x \in \mbb X$, evaluating its objective value $F(x)$ requires solving the maximization of the likelihood in the numerator
\be \label{eq:likelihood-max}
    \max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\}
\ee
and the minimization of the likelihood in the denominator
\be \label{eq:likelihood-min}
    \min~\{ L(\x, \QQ_1) : \QQ_1 \in \mbb B_{\eps_1}(\Pnom_1^\sigma)\}.
\ee

At this stage, it is important to relate problems~\eqref{eq:likelihood-max} and~\eqref{eq:likelihood-min} to the existing literature on (Bayesian) likelihood estimation/approximation. Problem~\eqref{eq:likelihood-max} searches for a distribution that \textit{maximizes} the likelihood of $x$ over the set $\mbb B_{\eps_0}(\Pnom_0^\sigma)$, and it is also known in the machine learning literature as an \textit{optimistic} likelihood~\cite{ref:nguyen2019calculating, ref:nguyen2020robust}. There is, however, a clear distinction between the existing results and the results of this paper: \citet{ref:nguyen2019calculating} use a Gaussian feasible set prescribed using the Fisher-Rao distance and \citet{ref:nguyen2020robust} use a moment-based feasible set using the Kullback-Leibler type divergence; in contrast, our set~$\mbb B_{\eps_0}(\Pnom_0^\sigma)$ is a mixture of Gaussian feasible set prescribed using a hierarchical Wasserstein distance. The attractiveness of the existing optimistic likelihood methods lies in their computational tractability. Next, we show that our optimistic likelihood under the Wasserstein-Gaussian mixture ambiguity set also possesses this tractability.

\begin{theorem}[Optimistic likelihood] \label{thm:max}
    For each $i \in \mc I_0$, let $\alpha_i$ be the optimal value of the following two-dimensional optimization problem
     \[
    %\alpha_i = 
    \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
    \]
    Then, we have
    \[
    \max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\} = \frac{\sum_{i \in \mc I_0} \exp(-\alpha_i)}{N_0 (2\pi)^{p/2}} .
    \]
\end{theorem}

Theorem~\ref{thm:max} asserts that we can solve problem~\eqref{eq:likelihood-max} by solving $N_0$ individual subproblems, each subproblem is a two-dimensional minimization problem. Notice that the feasible set of each subproblem is relatively simple: it contains an ellipsoidal constraint and lower bounds on the variables. Hence, it is easy to devise a projection operator for this feasible set. Note that the objective function of the subproblem is non-convex.

Let us now focus our attention on problem~\eqref{eq:likelihood-min}: it searches for a distribution that \textit{minimizes} the likelihood of $x$ over all candidate distributions in $\mbb B_{\eps_1}(\Pnom_1^\sigma)$, and it is termed the \textit{pessimistic} likelihood. It has been previously noticed that the pessimistic likelihood is not easy to solve due to non-convexity~\cite[Appendix~A]{ref:nguyen2020robust}. Surprisingly, for our Wasserstein-Gaussian mixture set, we still can obtain the reformulation below.

\begin{theorem}[Pessimistic likelihood] \label{thm:min}
    For each $i \in \mc I_1$, let $\alpha_i$ be the optimal value of the following two-dimensional optimization problem
     \begin{align*}
    &\min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{(\|\x - \wh x_i\|_2 + a)^2}{2d_1^2} \right. \\
    & \hspace{1.2cm} \left. - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
    \end{align*}
    Then, we have
    \[
    \min~\{ L(\x, \QQ_1) : \QQ_1 \in \mbb B_{\eps_1}(\Pnom_1^\sigma)\} = \frac{\sum_{i \in \mc I_1} \exp(\alpha_i)}{N_1 (2\pi)^{p/2}} .
    \]
\end{theorem}

Theorem~\ref{thm:min} asserts that the pessimistic likelihood problem~\eqref{eq:likelihood-min} admits a similar decomposable structure: solving~\eqref{eq:likelihood-min} is equivalent to solving $N_1$ individual subproblems, each subproblem is a two-dimensional minimization problem with a non-convex objective function. Further, the feasible set of the subproblem is also of tractable form for projection.

\textbf{Numerical routines.} Equipped with Theorems~\ref{thm:max} and~\ref{thm:min}, we can design an iterative scheme to solve the robust Bayesian recourse problem. For any value $x \in \mbb X$, we can use a projected gradient descent to solve a series of two-dimensional subproblems to evaluate the objective value $F(x)$. In Appendix~\ref{sec:app:foa}, we elaborate on the construction of the projection operator as well as the algorithm to evaluate $F(x)$. To optimize $F(x)$ to find the robust recourse, we can also apply a similar algorithm, provided that the projection onto the feasible region $\mbb X$ is easy to solve. 


\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{image/illus_2.png}
    % \vspace{-4mm}
    \caption{An example of the robust Bayesian recourse on a toy 2-dimensional instance. The star denotes the input $x_0$, and the black circle denotes the boundary point $x^b$. Green and blue circles are locally sampled data with favorable and unfavorable predicted values, respectively. The red circle denotes the robust Bayesian recourse, and the curved line denotes the continuum of intermediate solutions of the gradient descent algorithm. The robust Bayesian recourse moves to the interior of the favorable region (green), and thus is more likely to be valid subject to model shifts.}
    \label{fig:illustration}
    \vspace{-5mm}
\end{figure}

\subsection{Sketch of Proofs}	
	
We sketch here the main steps leading to the results in Section~\ref{sec:refor}. Because $\Pnom_y^\sigma$ is a Gaussian mixture and we are using a type-$\infty$ Wasserstein distance to prescribe the neighborhood around the representable distribution, the likelihood evaluation problems admit a decomposable structure. This decomposability has also been exploited previously in the literature of operations management~\citep{ref:bertsimas2021two}, chance constrained programming~\citep{ref:xie2020tractable} and fair classification~\citep{ref:wang2021wasserstein}. In the sequel, we denote $f(x | \m_i, \cov_i)$ the likelihood of $x$ under the $p$-dimensional Gaussian distribution with a mean vector $\m_i$ and a covariance matrix $\cov_i$:
\[
    f(x | \m_i, \cov_i) = \frac{\exp \big( - \half (x - \m_i)^\top \cov_i^{-1} (x - \m_i) \big)}{(2\pi)^{\frac{p}{2}} \sqrt{\det(\cov_i)}} .
\]
The next lemma asserts that the likelihood evaluation problem can be decomposed into solving smaller subproblems, each subproblem is an optimization problem over the mean vector - covariance matrix space $\R^p \times \PDsigma^p$.

\begin{lemma}[Separability] \label{lemma:separability}
    There exists a distribution~$\QQ_0\opt$ that solves~\eqref{eq:likelihood-max} and is a mixture of at most $N_0$ Gaussian components. Moreover, problem~\eqref{eq:likelihood-max} is equivalent to a separable problem of the form
    \begin{align*}
        &\max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\} \\
        =& \left\{\begin{array}{cll}
            \max &  \frac{1}{N_0} \sum_{i \in \mc I_0} f(\x | \m_i, \cov_i) \\ 
            \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
            & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0 & \forall i \in \mc I_0.
        \end{array} \right.
    \end{align*}
    An analogous result holds for problem~\eqref{eq:likelihood-min} with the corresponding subscript $y=1$.
\end{lemma}

Lemma~\ref{lemma:separability} leverages the essential supremum in the definition of the type-$\infty$ Wasserstein distance in Definition~\ref{def:wass} to separate the problem into one subproblem for each component. This separability is \textit{not} obtainable under other types of the Wasserstein distance. 
It is important to bear in mind that each subproblem is still not easy: the objective function is neither convex nor concave in $\cov_i$. Further, we also need to evaluate both the maximization and the minimization counterparts, and tractability is difficult to be established simultaneously in both directions. Despite these difficulties, we can show that each subproblem, which is originally on the $\R^p \times \PDsigma^p$ space, can be reduced to a $2$-dimensional subproblem. This is in fact a significant reduction of dimensionality, and this reduction does not depend on the dimension $p$. First, we provide the reformulation for the maximization counterpart.

\begin{proposition}[Maximization subproblem] \label{prop:max-Wass} 
    Fix any index $i \in \mc I_0$. For any $\wh x_i \in \R^p$, $\x \in \R^p$ and $\eps_0 \in \R_+$, we have
    \[
         \frac{\exp(-\alpha_i)}{(2\pi)^{p/2}} = \left\{
            \begin{array}{cl}
            \max & f(\x| \m_i, \cov_i) \\
            \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
             &c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0, 
            \end{array}
         \right.
    \]
    where $\alpha_i$ is the optimal value of the two-dimensional optimization problem
     \[
        \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
    \]
\end{proposition}

The two auxiliary variables $a$ and $d_p$ have a specific meaning which can be explained as follows. Let $(\m_i\opt, \cov_i\opt)$ be the optimal solution of the original maximization problem over $(\m_i, \cov_i)$, and let $(a\opt, d_p\opt)$ be the optimal solution of the reduced problem over $(a, d_p)$. We then have $\| \m_i\opt - \wh x_i \|_2 = a\opt$ and $d_p\opt$ coincides with the square root of the \textit{largest} eigenvalue of $\cov_i\opt$. Next, we expose the reformulation for the minimization problem.

\begin{proposition}[Minimization subproblem] \label{prop:min-Wass}
    %Suppose that $c$ is prescribed as in~\eqref{eq:c-Wass}. 
    Fix any index $i \in \mc I_1$. For any $\wh x_i \in \R^p$, $\x \in \R^p$ and $\eps_1 \in \R_+$, we have
    \[
        \frac{\exp(\alpha_i)}{(2\pi)^{p/2}} = \left\{ 
        \begin{array}{cl}
        \min & f(\x| \m_i, \cov_i) \\
        \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
        & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_1,
        \end{array}
        \right.
    \]
    where $\alpha_i$ is the optimal value of the two-dimensional optimization problem
\begin{align*}
    &\min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{(\|\x - \wh x_i\|_2 + a)^2}{2d_1^2} \right. \\
    & \hspace{1.5cm} \left. - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
\end{align*}
\end{proposition}

There is a similar relationship between $(\m_i\opt, \cov_i\opt)$ which solves the original minimization problem and $(a\opt, d_1\opt)$ which solves the reduced problem: we have $\| \m_i\opt - \wh x_i \|_2 = a\opt$ and $d_1\opt$ coincides with the square root of the \textit{smallest} eigenvalue of $\cov_i\opt$.

The above discussion reveals that we can fully reconstruct the distribution~$\QQ_0\opt$ that solves~\eqref{eq:likelihood-max} and $\QQ_1\opt$ that solves~\eqref{eq:likelihood-min} from the solutions of the reduced subproblems; we provide this reconstruction in Appendix~\ref{sec:recovery}.







% \newpage

\section{Numerical Experiment} 
\label{sec:numerical}


\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{image/mlp_sygesbgm.pdf}
    % \vspace{-5mm}
    \caption{Pareto frontiers of the cost-validity trade-off with the MLP classifier, on synthetic, German Credit, Small Business Administration, and Give Me Some Credit datasets.}
    \label{fig:pareto}
     \vspace{-3mm}
\end{figure*}

We evaluate in this section the robustness to model shifts of different recourses, together with the trade-off against the cost of adopting the recourse's recommendation. We compare our proposed robust Bayesian recourse method, namely RBR, against the counterfactual explanation of Wachter~\citep{ref:wachter2017counterfactual} and against the robust recourse generated by ROAR~\citep{ref:upadhyay2021towards} using either LIME~\citep{ref:ribeiro2016why} or LIMELS~\citep{ref:laugel2018defining} as a surrogate model\footnote{While LIME samples synthetic data \textit{globally} and trains a weighted ridge regression, LIMELS generates the local surrogate model by training an (unweighted) ridge regression on the data sampled \textit{locally} near the closest counterfactual of the input instance (similar to the sampling procedure described in Section~\ref{sec:recourse}).}.

\subsection{Experimental Setup}

\textbf{Datasets.} We examine the recourse generators on both a synthetic dataset and the real-world datasets: \textit{German Credit} \citep{ref:dua2017uci, ref:groemping2019south}, \textit{Small Business Administration (SBA)} \citep{ref:li2018should}, and \textit{Give Me Some Credit (GMC)}. Each dataset contains two sets of data: $D_1$ and $D_2$. The former is the current data, which is used to train the current classifier to generate recourses. The latter represents the possible data arriving in the future. 

For each dataset, we use 80\% of the instances in the current data $D_1$ to train the underlying predictive model and fix this classifier to construct recourses for the remaining 20\% of the instances. The future data $D_2$ will be used to train future classifiers, which are for evaluation only. 

\textbf{Classifier.} We use a three-layer MLP with 20, 50 and 20 nodes, respectively, with a ReLU activation in each consecutive layer. The sigmoid function is used in the last layer to produce predictive probabilities. The performance of the MLP classifier is reported in Table~\ref{tab:clf_prfm}.

\begin{table*}
    \centering
    \begin{tabular}{lcccc}
        \toprule
         \multirow{2}{*}{Dataset} & \multicolumn{2}{c}{Present data $D_1$} & \multicolumn{2}{c}{Shift data $D_2$} \\
         \cmidrule(r){2-3} \cmidrule(r){4-5}
         & \textit{Accuracy} & \textit{AUC} & \textit{Accuracy} & \textit{AUC} \\
         \midrule
         Synthetic data & 0.99 $\pm$ 0.00 & 1.00 $\pm$ 0.00 & 0.94 $\pm$ 0.01 & 0.99 $\pm$ 0.01 \\
         German Credit & 0.67 $\pm$ 0.02 & 0.60 $\pm$ 0.03 & 0.66 $\pm$ 0.23 & 0.60 $\pm$ 0.04 \\
         SBA & 0.96 $\pm$ 0.00 & 0.99 $\pm$ 0.00 & 0.98 $\pm$ 0.01 & 0.96 $\pm$ 0.01 \\
        %  Student & 0.86 $\pm$ 0.02 & 0.93 $\pm$ 0.01 & 0.91 $\pm$ 0.04 & 0.97 $\pm$ 0.02 \\
         GMC & 0.94 $\pm$ 0.00 & 0.84 $\pm$ 0.00 & 0.94 $\pm$ 0.00 & 0.84 $\pm$ 0.00 \\
         \bottomrule
    \end{tabular}
    \caption{Accuracy and AUC results of the MLP classifier on the synthetic and real-world datasets.}
    \label{tab:clf_prfm}
    \vspace{-4mm}
\end{table*}

\textbf{Sampling procedure.} We employ the sampling scheme described in Section~\ref{sec:recourse}. We choose the number of counterfactuals $K = 1000$ and sample $200$ synthetic samples uniformly with a sampling radius $r_p = 0.2$.

\textbf{Metrics.} To measure the ease of adopting a recourse, we use the $\ell_1$-distance as the cost function $\varphi$ on the covariate space $\mc X$, this choice is similar to \citet{ref:ustun2019actionable} and \citet{ref:upadhyay2021towards}. We define the \textit{current validity} as the validity of the recourses with respect to the current classifier $\mc C$. To evaluate the robustness of recourses to the changes in model's parameters, we sample $20\%$ of the instances in the data set $D_2$ as the arrival data. We then re-train the classifier with the old data (80\% of $D_1$) coupled with this arrival data to simulate the future classifiers $\tilde{\mc C}$. We repeat this procedure 100 times to obtain 100 future classifiers and report the \textit{future validity} of a recourse as the fraction of the future classifiers with respect to which the recourse is valid.

% Appendix~\ref{sec:app-exp} provides further details about the experiments.
\subsection{Experimental Details} \label{sec:app-exp}

We use both synthetic and real-world datasets. 

\textbf{Synthetic dataset. } We synthesize the 2-dimensional data by sampling 1000 instances uniformly in a rectangle $ [-2, 4] \times [-2, 7]$. For each sample, we label using the function $f(x) = 1$ if $x_2 \ge 1 + x_1 + 2 x_1^2 + x_1^3 - x_1^4 + \varepsilon$, and $f(x) = 0$ otherwise,
% \[
%     f(x) = \left\{
%             \begin{array}{cl}
%                 1 & \mathrm{if} \quad x_2 \ge 1 + x_1 + 2 x_1^2 + x_1^3 - x_1^4 + \varepsilon, \\
%                 0 & \mathrm{otherwise},
%             \end{array}
%         \right.
% \]
where $\varepsilon$ is a random noise. We set $\varepsilon = 0$ when generating the present set $D_1$ and $\varepsilon \sim \mc N(0, 1)$ for the future set $D_2$. 

% For real-world dataset, German credit \citep{ref:dua2017uci} contains the information (e.g. age, gender, financial status,...) of 1000 customers who took a loan from a bank. The classification task is to determine the risk (good or bad) of an individual. There is another version of this dataset regarding to corrections of coding error \citep{ref:groemping2019south}. We use the corrected version of this dataset as a shifted data to capture correction shift. The features we used in this dataset include `duration', `amount', `personal\_status\_sex', and `age'.

\textbf{Real-world datasets. } Three real-world datasets are used.

\begin{enumerate}[label=-]
    \item \textit{German Credit} \citep{ref:dua2017uci}. The dataset contains the information (e.g., age, gender, financial status, \dots) of 1000 customers who took a loan from a bank. The classification task is to determine the risk (good or bad) of an individual. There is another version of this dataset with corrections of coding errors \citep{ref:groemping2019south}. We use the corrected version of this dataset as the shifted data to capture correction shift. The features we used in this dataset include `duration', `amount', `personal\_status\_sex', and `age'.
    \item \textit{Small Business Administration (SBA)} \citep{ref:li2018should}. This dataset includes 2102 observations of small business loan approvals from 1987 to 2014. We divide it into two datasets (one is instances from 1989--2006 and one is instances from 2006--2014) to capture temporal shift. We use the following features: `Term', `NoEmp', `CreateJob', `RetainedJob', `UrbanRural', `ChgOffPrinGr', `GrAppv', `SBA\_Appv', `New', `RealEstate', `Portion', `Recession'.
    % \item \textit{Student performance} \citep{ref:cortez2008using}. This data includes the performance records of 649 students in two schools: Gabriel Pereira (GP) and Mousinho da Silveira (MS). The classification task is to determine if their final score is above average or not. We split this dataset into two sets in two schools to capture geospatial shift. The features we used are: `age', `Medu', `Fedu', `studytime', `famsup', `higher', `internet', `romantic', `freetime', `goout', `health', `absences', `G1', `G2'.
    % \item \textit{Breast Cancer} dataset, each instance has 10 features computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. Each sample is labeled either malignant or benign. The features used in the experiment are `radius', `texture', `perimeter', `area', `smoothness', `compactness', 'concavity', `points', `symmetry', `dimension'.
    \item \textit{Give Me Some Credit (GMC)}\footnote{https://www.kaggle.com/competitions/GiveMeSomeCredit/data}. This dataset is used to predict if a person would experience financial distress in the next two years. Given 150000 entries from the available dataset, we randomly shuffle and partition the data equally into the current set $D_1$ and the shifted set $D_2$. Each entry contains 10 features: `RevolvingUtilizationOfUnsecuredLines', `age', `NumberOfTime30-59DaysPastDueNotWorse', `DebtRatio', `MonthlyIncome', `NumberOfOpenCreditLinesAndLoans', `NumberOfTimes90DaysLate', `NumberRealEstateLoansOrLines', `NumberOfTime60-89DaysPastDueNotWorse', `NumberOfDependents'.
\end{enumerate}

\subsection{Cost-validity trade-off}
We obtain the Pareto front for the trade-off between the cost of adopting recourses produced by RBR and their validity by varying the ambiguity sizes $\eps_1$ and $\eps_0$, along with the maximum recourse cost $\delta$, with $\delta = \|x_0 - x_b\|_1 + \delta_+$. Particularly, we consider $\sigma = 1.0$, $\eps_0, \eps_1 \in \{ 0.5k \; | \; k = 0, \ldots, 2\}$, and $\delta_+ \in \{0.2l \; | \; l = 0, \ldots, 5\}$. The frontiers for ROAR-based methods are obtained by varying $\delta_{\max} \in \{ 0.02m \; | \; m = 0, \ldots, 10 \}$, where $\delta_{\max}$ is the tuning parameter of ROAR. As shown in Figure~\ref{fig:pareto}, increasing $\eps_1$ and $\delta_+$ generally increases the future validity of recourses yielded by RBR at the sacrifice of the cost, while sustaining the current validity. Yet, the frontiers obtained by RBR either dominate or are comparable to the frontiers of Wachter, LIME-ROAR, and LIMELS-ROAR.

\textbf{Conclusions.} 
In this work, we proposed the robust Bayesian recourse which aims to be effective at reversing algorithmic outcome under potential model shifts. It is a model-agnostic approach that does not require approximating the nonlinear classifier by a linear surrogate. Instead, the robust Bayesian recourse minimizes directly the worst-case posterior probability odds ratio subject to the cost constraint bound. The robustness is designed with respect to the Wasserstein-Gaussian mixture ambiguity sets of the conditional distributions, in which the neighborhood is prescribed using an optimal transport (type-$\infty$ Wasserstein) distance.  We showed that the min-max recourse problem can be optimized using a gradient descent algorithm, which exploits separability and dimensionality reduction when evaluating the objective value. Our experiments on synthetic and real-world datasets demonstrate that the robust Bayesian recourse is more robust at a lower cost than other baselines.

While this paper focuses on algorithmic transparency, we note that transparency may lead to a tension between transparency and gaming-the-system behaviors: the more transparent the decision process is, the more opportunities there are for exploitative manipulations~\citep{ref:yan2022margin}. We envision that robustness techniques may alleviate these gaming behaviors and may lead to more trustworthy guarantees of (machine learning) algorithms.

% \vspace{-3mm}
\textbf{Acknowledgments.}
    Man-Chung Yue is supported by the HKRGC under the General Research Fund project 15305321.
%\end{acknowledgements}

% \newpage
% \vspace{-2mm}
\bibliography{bibliography}


% \newpage
% \clearpage 
% \onecolumn

% \appendix

% \section{Proofs of Section~\ref{sec:compute}}

% \newtheorem*{lemma:separability}{Lemma~\ref{lemma:separability} (re-stated)}
% \begin{lemma:separability}
%     There exists a distribution~$\QQ_0\opt$ that solves~\eqref{eq:likelihood-max} and is a mixture of at most $N_0$ Gaussian components. Moreover, problem~\eqref{eq:likelihood-max} is equivalent to a separable problem of the form
%     \begin{align*}
%         \max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\} = \left\{\begin{array}{cll}
%             \max &  \frac{1}{N_0} \sum_{i \in \mc I_0} f(\x | \m_i, \cov_i) \\ 
%             \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
%             & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0 & \forall i \in \mc I_0.
%         \end{array} \right.
%     \end{align*}
%     An analogous result holds for problem~\eqref{eq:likelihood-min} with the corresponding subscript $y=1$.
% \end{lemma:separability}
% \begin{proof}[Proof of Lemma~\ref{lemma:separability}]
%     There exists a distribution~$\QQ_0\opt$ that solves~\eqref{eq:likelihood-max} and is a mixture of at most $N_0$ Gaussian components. Moreover, problem~\eqref{eq:likelihood-max} is equivalent to a separable problem of the form
%     \begin{align*}
%     &\max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\} \\
%     =& \left\{\begin{array}{cll}
%         \max &  \frac{1}{N_0} \sum_{i \in \mc I_0} f(\x | \m_i, \cov_i) \\ 
%         \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
%         & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0 & \forall i \in \mc I_0.
%     \end{array} \right.
%     \end{align*}
    
%     We use $\forall i$ implies $\forall i \in \mc I_0$, and $\sum_{i}$ is also taken over the same set. Given any $\x$, the likelihood of $\x$ under any Gaussian mixture $\QQ_0$ can be written using the corresponding measure $\nu_0$ as
%     \[
%         L(\x, \QQ_0) = \int_{\R^p \times \PSD^p} f(\x | \m, \cov) \nu_0(\mathrm{d} \m, \mathrm{d} \cov).
%     \]
%     Recall that $\Xi = \R^p \times \PDsigma^p$. Using the definition of the type-$\infty$ Wasserstein, we find
%     \begin{align*}
%         &\ \Wass_c(\nu_0, \wh \nu_0) \le \eps_0 \\
%         \Leftrightarrow &\  \exists\lambda\in \Lambda(\nu_0, \wh \nu_0) \text{ such that} \\
%         &\  \mathrm{ess} \Sup{\lambda} \big\{ c((\m, \cov), (\m', \cov')) : (\m, \cov, \m', \cov')  \in \Xi \times \Xi \big\} \le \eps_0\\
%         \Leftrightarrow &\  \forall i\  \exists \lambda_i \in \mc P(\Xi) \text{ such that} \\
%         &\ \mathrm{ess} \Sup{\lambda_i} \big\{ c((\m, \cov), (\wh x_i, \sigma I)) : (\m, \cov)  \in \Xi  \big\} \le \eps_0\\
%         \Leftrightarrow &\  \forall i\  \exists \lambda_i \in \mc P(\Xi) \text{ such that}\\
%         &\  c((\m,\cov), (\wh x_i, \sigma I)) \le \eps_0 \quad (\m, \cov)\in \mathrm{supp}(\lambda_i),
%     \end{align*}
%     where the second equivalence follows from that $\wh \nu_0 = \frac{1}{N_0} \sum_{i} \delta_{(\wh x_i, \sigma^2 I)}$ and hence any $\lambda \in \Lambda(\nu_0, \wh \nu_0)$ takes the form $\frac{1}{N_0} \sum_{i} \lambda_i \otimes \delta_{(\wh x_i, \sigma^2 I)}$ for some probability measures $\lambda_i \in \mc P(\Xi)$, and the third equivalence follows from Lemma~\ref{lem:sup_esssup}.
%     % \begin{align*}
%     %     &\{ \nu_0 \in \mc P(\Xi): \Wass_c(\nu_0, \wh \nu_0) \le \eps_0 \} \\
%     %     =& \left\{ \nu_0 \in \mc P(\Xi): 
%     %     \begin{array}{l}
%     %     \exists \lambda \in \Lambda(\nu_0, \wh \nu_0) \text{ such that}: \\
%     %     \mathrm{ess} \Sup{\lambda} \big\{ c((\m, \cov), (\m', \cov')) : (\m, \cov, \m', \cov')  \in \Xi \times \Xi \big\} \le \eps_0
%     %     \end{array}
%     %     \right\} \\
%     %     =& \left\{ \nu_0 \in \mc P(\Xi): 
%     %     \begin{array}{l}
%     %     \exists \lambda_i \in \mc P(\Xi) \quad \forall i \text{ such that}: \nu_0 = \frac{1}{N_0} \sum_{i} \lambda_i\\
%     %     \mathrm{ess} \Sup{\frac{1}{N_0} \sum_{i} \lambda_i \otimes \delta_{(\wh x_i, \sigma^2 I)}} \big\{ c((\m, \cov), (\m', \cov')) : (\m, \cov, \m', \cov')  \in \Xi \times \Xi \big\} \le \eps_0
%     %     \end{array}
%     %     \right\},
%     % \end{align*}
%     % where in the second equality, we exploited the fact that $\wh \nu_0 = \frac{1}{N_0} \sum_{i} \delta_{(\wh x_i, \sigma^2 I)}$, and that any $\lambda \in \Lambda(\nu_0, \wh \nu_0)$ can be written using law of conditional distribution as $\frac{1}{N_0} \sum_{i} \lambda_i \otimes \delta_{(\wh x_i, \sigma^2 I)}$ for some collection of probability measures $\lambda_i \in \mc P(\Xi)$ satisfying $\nu_0 = \frac{1}{N_0} \sum_{i} \lambda_i$. Notice that the essential supremum constraint can now be written as
%     % \[
%     %     c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0  \quad \forall (\m_i, \cov_i) \in \mathrm{supp}(\lambda_i) \qquad \forall i,
%     % \]
%     % where $\mathrm{supp}(\lambda_i)$ denotes the support of the probability measure $\lambda_i$~\citep[pp.~441]{ref:aliprantis06hitchhiker}. 
%     Hence, problem~\eqref{eq:likelihood-max} is equivalent to
%     \begin{align*}
%     &\left\{
%     \begin{array}{cl}
%         \max &  \int_{\R^p \times \PDsigma^p} f(\x | \m, \cov) \nu_0(\mathrm{d} \m, \mathrm{d} \cov) \\
%         \st & \nu_0 \in \mc P(\R^p \times \PDsigma^p) \\
%         & \Wass_c(\nu_0, \wh \nu_0) \le \eps_0
%     \end{array}
%     \right. \\
%     =&\left\{
%     \begin{array}{cl}
%         \max &  \frac{1}{N_0} \sum_{i} \int_{\R^p \times \PDsigma^p} f(\x | \m_i, \cov_i) \lambda_i(\mathrm{d} \m_i, \mathrm{d} \cov_i) \\
%         \st & \lambda_i \in \mc P(\R^p \times \PDsigma^p) \qquad \forall i\\
%         & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0  \quad \forall (\m_i, \cov_i) \in \mathrm{supp}(\lambda_i) \qquad \forall i.
%     \end{array}
%     \right.
%     \end{align*}
%     It is easy now to employ a greedy argument to show that the optimal solution for $\lambda_i$ should be a Dirac delta distribution supported on one point in the space of $\R^p \times \PDsigma^p$. This leads to the conclusion regarding the maximization problem~\eqref{eq:likelihood-max}. 
    
%     A similar argument applies to the minimization problem~\eqref{eq:likelihood-min}; the detailed proof is omitted.
% \end{proof}

% \begin{lemma}
%     \label{lem:sup_esssup}
%     For any $\lambda\in \mc P (\Xi)$, $\wh x\in \R^p$, $\sigma ,\eps >0$ and any function $c:\Xi\times \Xi \to \R$ such that the map $(\m, \cov) \mapsto c( (\m,\cov), (\wh x, \sigma^2 I))$ is continuous, we have $\mathrm{ess}\sup_{\lambda} c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$ if and only if $c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$ for any $(\m, \cov)\in \mathrm{supp}(\lambda)$.
% \end{lemma}

% \begin{proof}[Proof of Lemma~\ref{lem:sup_esssup}]
% We first prove the ``only if'' direction. Suppose that there exists $(\m', \cov')\in \mathrm{supp}(\lambda)$ such that 
% \[c ((\m', \cov'), (\wh x, \sigma^2 I)) > \eps .\]
% By continuity of the map $(\m, \cov) \mapsto c( (\m,\cov), (\wh x, \sigma^2 I))$, there exists an open neighbourhood $U\subseteq \Xi$ containing $(\m', \cov')$ such that
% \[ c ((\m, \cov), (\wh x, \sigma^2 I)) > \eps\quad\forall (\m,\cov) \in U . \]
% By the definition of support, $\lambda (U) > 0$. Therefore,
% \[ \Pr_{\lambda} ( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps ) = 1- \Pr_{\lambda} ( c((\m, \cov), (\wh x, \sigma^2 I)) > \eps ) \le 1 - \lambda (U) < 1, \]
% which contradicts the assumption that $\mathrm{ess}\sup_{\lambda} c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$.

% We next prove the ``if'' direction. By the law of total probability and the fact that $c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$ for any $(\m, \cov)\in \mathrm{supp}(\lambda)$,
% \begin{align*}
%     & \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps \right) \\
%     = & \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps | (\m,\cov)\in\mathrm{supp}(\lambda) \right)\lambda (\mathrm{supp}(\lambda)) \\
%     &\quad + \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps | (\m,\cov)\not\in\mathrm{supp}(\lambda) \right) (1- \lambda( \mathrm{supp}(\lambda)))\\
%     = & 1\cdot 1 + \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps | (\m,\cov)\not\in\mathrm{supp}(\lambda) \right)\cdot 0 = 1,
% \end{align*}
% which completes the proof.
% \end{proof}

% \newtheorem*{prop:max-Wass}{Proposition~\ref{prop:max-Wass} (re-stated)}
% \begin{prop:max-Wass}
%     Fix any index $i \in \mc I_0$. For any $\wh x_i \in \R^p$, $\x \in \R^p$ and $\eps_0 \in \R_+$, we have
%     \[
%          \frac{\exp(-\alpha_i)}{(2\pi)^{p/2}} = \left\{
%             \begin{array}{cl}
%             \max & f(\x| \m_i, \cov_i) \\
%             \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
%              &c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0, 
%             \end{array}
%          \right.
%     \]
%     where $\alpha_i$ is the optimal value of the two-dimensional optimization problem
%      \[
%         \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
%     \]
% \end{prop:max-Wass}
% \begin{proof}[Proof of Proposition~\ref{prop:max-Wass}] Let $\alpha_i$ be the optimal value of the negative log-likelihood minimization problem
% \[
%     \alpha_i = \left\{ \begin{array}{cl}
%         \min & \half \log \det \cov_i + \half (\x - \m_i)^\top \cov_i^{-1} (\x - \m_i)  \\
%         \st & \m_i \in \R^p,~\cov_i \in \PSD^p \\
%         & \| \m_i - \wh x_i \|_2^2 + \Tr{\cov_i + \sigma^2 I - 2 \big( (\sigma^2 I)^{\half} \cov_i (\sigma^2 I)^{\half} \big)^{\frac{1}{2}} } \leq \eps_0^2 \\
%         & \cov_i \succeq \sigma^2 I.
%     \end{array}
%     \right.
% \]
% It is easy to see that
% \[
%     \max\{ f(\x| \m_i, \cov_i) : (\m_i, \cov_i) \in \R^p \times \PDsigma^p,~c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0\} = \frac{1}{\sqrt{(2\pi)^p}} \exp(-\alpha_i ).
% \]
% It remains to provide a simpler formulation to determine $\alpha_i$. To simplify the notation, we omit the index $i$ on all variables and parameters.
% We reparameterize $\cov = V \diag(d^2) V^\top$ for a vector $d \in \R_+^p$ and an orthogonal matrix $V \in \mathrm{O}(p)$, where $\diag(d^2)$ denotes the $p \times p$ diagonal matrix whose $j$-th diagonal entry equals $d_j^2$, and $\mathrm{O}(p)$ is the set of $p$-dimensional orthogonal matrices
% \[
%     \mathrm{O}(p) = \{ V \in \R^{p \times p} : V^\top V = I_p\}.
% \] 
% The negative log-likelihood minimization problem is further equivalent to
% \[
%     \begin{array}{cl}
%         \min & \sum_{j=1}^p \log d_j + \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m))  \\
%         \st & ~ d \in \R_+^p, ~V \in \mathrm{O}(p),~\m \in \R^p  \\
%         & 
%         \| \m - \wh x \|_2^2 + \sum_{j=1}^p (d_j - \sigma)^2 \leq \eps_0^2 \\
%         & d \ge \sigma,
%     \end{array}
% \]
% where $d \ge \sigma$ implies the element-wise constraints $d_j \ge \sigma$ for any $j = 1, \ldots, p$.
% We introduce an auxiliary variable $a \in \R_+$ and rewrite the optimization problem in an equivalent way as
% \[
% \min_{\substack{a \in \R_+,~d \in \R_+^p, ~ d \ge \sigma  \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_0^2} } ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~ \min_{V \in \mathrm{O}(p)}~ \sum_{j=1}^p \log d_j + \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m)). 
% \]
% Notice that the above optimization problem is invariant to the ordering of the entries of $d$.
% As a consequence, without any loss of generality, we can assume that $d_p$ is the maximum value across all $d_j$. By Lemma~\ref{lemma:optimal_V}, the above optimization problem becomes
% \[
% \min_{\substack{a \in \R_+,~d \in \R_+^p, ~ d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_0^2\\ d_p = \max\{d\} }} ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~  \sum_{j=1}^p \log d_j + \frac{1}{2d_p^2} \|\x - \m\|_2^2. 
% \]
% Using Lemma~\ref{lemma:quadratic}, we obtain the equivalent optimization problem
% \[
% \min_{\substack{a \in \R_+,~d \in \R_+^p, ~ d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_0^2\\ d_p = \max\{d\} }} ~  \sum_{j=1}^p \log d_j + \frac{1}{2d_p^2} (\|\x - \wh x\|_2 - a)^2.
% \]
% We rewrite the above problem as the two-layer optimization problem
% \begin{equation}
% \label{eq:main_eq_max_ot}
% \min_{\substack{a \in \R_+,~d_p \in \R_+, ~ d_p \ge \sigma \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \left\{\log d_p + \frac{1}{2d_p^2} (\|\x - \wh x\|_2 - a)^2 + \Min{\substack{d_j \in \R_+, ~ d_j \ge \sigma ~ \forall j=1, \ldots, p-1\\ \sum_{j=1}^{p-1} (d_j - \sigma)^2 \le \eps_0^2 - a^2 - (d_p - \sigma)^2\\
% d_j \le d_p ~ \forall j=1, \ldots, p-1}} \sum_{j=1}^{p-1} \log d_j \right\}.
% \end{equation}
% Notice that for any $d_p$ that is feasible for the outer minimization problem, the inner minimization problem over $d_j$, $\forall j = 1, \ldots, p-1$ admits a non-empty feasible set. Indeed, because $d_p \ge \sigma$, the value $d_j = \sigma$, $j = 1, \ldots, p-1$ is a feasible solution for the inner problem. We now focus on solving the inner minimization problem. As $\log (\cdot)$ is an increasing function, for any $s \ge 0$, we find
% \[
% \min_{\substack{d_p\ge d_j \ge \sigma ~ \forall j=1, \ldots, p-1 \\ \sum_{j=1}^{p-1} (d_j - \sigma)^2 \le s}}~\sum_{j=1}^{p-1} \log d_j = (p-1) \log \sigma,
% \]
%  which holds because the optimization problem on the left hand side admits the optimal solution $d\opt_j = \sigma$ for all $j = 1, \dots, p-1$. This completes the proof.
% \end{proof}

% \newtheorem*{prop:min-Wass}{Proposition~\ref{prop:min-Wass} (re-stated)}
% \begin{prop:min-Wass}
%     %Suppose that $c$ is prescribed as in~\eqref{eq:c-Wass}. 
%     Fix any index $i \in \mc I_1$. For any $\wh x_i \in \R^p$, $\x \in \R^p$ and $\eps_1 \in \R_+$, we have
%     \[
%         \frac{\exp(\alpha_i)}{(2\pi)^{p/2}} = \left\{ 
%         \begin{array}{cl}
%         \min & f(\x| \m_i, \cov_i) \\
%         \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
%         & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_1,
%         \end{array}
%         \right.
%     \]
%     where $\alpha_i$ is the optimal value of the two-dimensional optimization problem
%     \begin{align*}
%         \min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{(\|\x - \wh x_i\|_2 + a)^2}{2d_1^2} - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
%     \end{align*}
% \end{prop:min-Wass}
% \begin{proof}[Proof of Proposition~\ref{prop:min-Wass}]
% Let $\alpha_i$ be the optimal value of the log-likelihood \textit{minimization} problem
% \[
%     \alpha_i = \left\{ \begin{array}{cl}
%         \min & -\half \log \det \cov_i - \half (\x - \m_i)^\top \cov_i^{-1} (\x - \m_i)  \\
%         \st & \m_i \in \R^p,~\cov_i \in \PSD^p \\
%         & \| \m_i - \wh x_i \|_2^2 + \Tr{\cov_i + \sigma^2 I - 2 \big( (\sigma^2 I)^{\half} \cov_i (\sigma^2 I)^{\half} \big)^{\frac{1}{2}} } \leq \eps_1^2 \\
%         & \cov_i \succeq \sigma^2 I.
%     \end{array}
%     \right.
% \]
% It is easy to see that
% \[
%     \min\{ f(\x| \m_i, \cov_i) : (\m_i, \cov_i) \in \R^p \times \PDsigma^p,~c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_1\} =  \frac{1}{(2\pi)^{p/2}} \exp(\alpha_i).
% \]
% It remains to provide the computational routine to determine $\alpha_i$. To simplify the notation, we omit the index $i$ on all variables and parameters.
% We reparameterize $\cov = V \diag(d^2) V^\top$ for a vector $d \in \R_+^p$ and an orthogonal matrix $V \in \mathrm{O}(p)$, where $\diag(d^2)$ denotes the $p \times p$ diagonal matrix whose $j$-th diagonal entry equals $d_j^2$, and $\mathrm{O}(p)$ is the set of $p$-dimensional orthogonal matrices
% \[
%     \mathrm{O}(p) = \{ V \in \R^{p \times p} : V^\top V = I_p\}.
% \] 
% The log-likelihood minimization problem is further equivalent to
% \[
%     \begin{array}{cl}
%         \min & -\sum_{j=1}^p \log d_j - \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m))  \\
%         \st & d \in \R_+^p,~V \in \mathrm{O}(p),~\m \in \R^p  \\
%         & 
%         \| \m - \wh x \|_2^2 + \sum_{j=1}^p (d_j - \sigma)^2 \leq \eps_1^2 \\
%         & d \ge \sigma,
%     \end{array}
% \]
% where $d \ge \sigma$ implies the element-wise constraints $d_j \ge \sigma$ for any $j = 1, \ldots, p$.
% We introduce an auxiliary variable $a \in \R_+$ and rewrite the optimization problem in an equivalent way as
% \[
% \min_{\substack{a \in \R_+,~d \in \R_+^p, ~d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_1^2} } ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~ \min_{V \in \mathrm{O}(p)}~ -\sum_{j=1}^p \log d_j - \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m)). 
% \]
% Notice that the above optimization problem is invariant to the ordering of the entries of $d$.
% As a consequence, without any loss of generality, we can assume that $d_1$ is the minimum value across all $d_j$. By Lemma~\ref{lemma:optimal_V}, the above optimization problem becomes
% \[
% \min_{\substack{a \in \R_+,~d \in \R_+^p,~ d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_1^2\\ d_1 = \min\{d\} }} ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~  -\sum_{j=1}^p \log d_j - \frac{1}{2d_1^2} \|\x - \m\|_2^2. 
% \]
% Using Lemma~\ref{lemma:quadratic}, we obtain the equivalent optimization problem
% \[
% \min_{\substack{a \in \R_+,~d \in \R_+^p, ~d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_1^2\\ d_1 = \min\{d\} }} ~  -\sum_{j=1}^p \log d_j - \frac{1}{2d_1^2} (\|\x - \wh x\|_2 + a)^2.
% \]
% Notice that the constraint $\sigma \le d_1 = \min\{d\}$ implies that $p(d_1 - \sigma)^2 \le \sum_j (d_j - \sigma)^2$. As a consequence, any feasible value for $d_1$ should satisfy $a^2 + p (d_1 - \sigma)^2 \le \eps_1^2$. Separating the variable $d$ into two groups $d_1$ and $d_2,\dots, d_p$ leads to a two-layer optimization problem 
% \begin{equation}
% \label{eq:main_eq_min_ot}
% \min_{\substack{a \in \R_+,~d_1 \in \R_+,~ d_1 \ge \sigma \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2 }} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x\|_2 + a)^2 + \Min{\substack{d_j \in \R_+, ~ d_j \ge d_1 ~\forall j=2, \ldots, p\\ \sum_{j=2}^p (d_j - \sigma)^2 \le \eps_1^2 - a^2 - (d_1 - \sigma)^2 }} -\sum_{j=2}^{p} \log d_j \right\}.
% \end{equation}
% Consider momentarily the minimization problem 
% \begin{equation*}
% \Min{\substack{d_j \in \R_+ \quad \forall j = 2, \ldots, p \\ \sum_{j=2}^p (d_j - \sigma)^2 \le \eps_1^2 - a^2 - (d_1 - \sigma)^2 }} -\sum_{j=2}^{p} \log d_j,
% \end{equation*}
% where the constraints $d_j \ge d_1$ have been intentionally omitted. Proposition~\ref{prop:max-e} asserts that this optimization problem has the optimal value
% \[
% -(p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right)
% \]
% at the optimal solution $d\opt_j=\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p-1}}$ for all $j = 2, \dots, p$. The outer constraint $a^2 + p(d_1 - \sigma)^2 \le \eps_1^2$ implies $\eps_1^2 - a^2 - (d_1 - \sigma)^2 \ge (p-1)(d_1 - \sigma)^2$, and hence $d\opt_j \ge d_1 ~ \forall j=2,\dots,p$. Thus $d\opt$ also satisfies the omitted constraints and is indeed the optimal solution to the inner minimization problem in~\eqref{eq:main_eq_min_ot}. As a consequence, problem~\eqref{eq:main_eq_min_ot} is equivalent to 
% \[
% \min_{\substack{a \in \R_+,~d_1 \in \R_+, ~d_1 \ge \sigma\\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x\|_2 + a)^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
% \]
% This completes the proof.
% \end{proof}


% \section{Auxiliary Results} 
% \label{sec:app-aux}

% The following preparatory results are necessary to prove Propositions~\ref{prop:max-Wass} and~\ref{prop:min-Wass}. 

% \begin{lemma}[Eigenbasis solution] \label{lemma:optimal_V}
% Let $E\in\R^{p\times p}$ be a diagonal matrix satisfying $E_{11}\le \cdots\le E_{pp}$. Then, for any $w \in \R^p$, we have
% \begin{equation*}
%     \Max{V \in \mathrm{O}(p)}~w^\top V E V^\top w = E_{pp} \|w\|_2^2.
% \end{equation*}
% \end{lemma}
% \begin{proof}[Proof of Lemma~\ref{lemma:optimal_V}]
%     The claim holds trivially when $w = 0$. Consider now any $w \in \R^p \backslash \{0\}$. Since $V E V^\top \preceq E_{pp}\cdot I_p$, we find
%     \begin{align*}
%         \Max{V \in \mathrm{O}(p)}~w^\top V E V^\top w \le \Max{V \in \mathrm{O}(p)}~w^\top V (E_{pp}\cdot I_p) V^\top w = E_{pp} \|w\|_2^2.
%     \end{align*}
%     On the other hand, taking $V\opt = [v_1\opt, \ldots, v_p\opt] \in \mathrm{O}(p)$ with $v_p\opt = \frac{w}{\|w\|_2}$, and using the orthogonality of the columns of $V\opt$, we have
%     \[ w^\top V\opt E {V\opt}^\top w = E_{pp} \|w\|_2^2.  \]
%     This shows that $V\opt$ is an optimal solution and completes the proof.
% \end{proof}


% \begin{lemma}[Quadratic optimization] \label{lemma:quadratic}
% For any $\x \in \R^p$, $\wh x \in \R^p$ and $a \in \R_+$, the following assertions hold.
% \begin{itemize}
%     \item Convex quadratic minimization:
%     \[
%         \Min{\m \in \R^p: \| \m - \wh x \|_2^2 = a^2}~ \| \x - \m \|_2^2 =  (\| \x - \wh x \|_2 - a)^2,
%     \]
%     where the minimum is attained at $\m\opt = \frac{a}{\| \x - \wh x \|_2}\x + (1 - \frac{a}{\| \x - \wh x \|_2})\wh x$.
%     \item Convex quadratic maximization:
%     \[
%         \Max{\m \in \R^p: \| \m - \wh x \|_2^2 = a^2}~ \| \x - \m \|_2^2 =  (\| \x - \wh x \|_2 + a)^2,
%     \]
%     where the maximum is attained at $\m\opt = -\frac{a}{\|\x - \wh x\|_2}\x + (1 + \frac{a}{\|\x - \wh x\|_2})\wh x$.
% \end{itemize}
% \end{lemma}
% The results in Lemma~\ref{lemma:quadratic} are dispersed in the literature. An elementary proof is provided here for completeness.
% \begin{proof}[Proof of Lemma~\ref{lemma:quadratic}]
% By the triangle inequality, for any $\m$ such that $\| \m - \wh x \|_2 = a$, we have
% \[ \| \x - \m \|_2 \ge \left| \| \x - \wh x \|_2 - \| \m - \wh x \|_2 
% \right| = \left| \| \x - \wh x \|_2 - a \right|, \]
% where the lower bound can be attained by taking $\m = \frac{a}{\| \x - \wh x \|_2}\x + (1 - \frac{a}{\| \x - \wh x \|_2})\wh x$. Therefore, 
% \[\Min{\m \in \R^p: \| \m - \wh x \|_2^2 = a^2}~ \| \x - \m \|_2^2 =  (\| \x - \wh x \|_2 - a)^2.\]
% Similarly, by the triangle inequality we have
% \[ \| \x - \m \|_2 \le \| \x - \wh x \|_2 + \| \wh x - \m \|_2 = \| \x - \wh x \|_2 + a, \]
% and the upper bound can be attained by taking $\m = -\frac{a}{\|\x - \wh x \|_2}\x + (1 + \frac{a}{\|\x - \wh x \|_2})\wh x$. This completes the proof.
% \end{proof}

% \begin{proposition}[Logarithm maximization] \label{prop:max-e}
% For any $s ,\sigma\ge 0$ and positive integer $k$, we have
% \begin{equation}\label{opt:sum_log_max_subproblem}
%     k \log\left( \sqrt{\frac{s}{k}} + \sigma \right) = \left\{
%     \begin{array}{cl}
%         \displaystyle\max_{e\in\R_+^{k}} & \displaystyle\sum_{j=1}^{k} \log e_j  \\
%         \st & \displaystyle\sum_{j = 1}^{k} (\sigma - e_j)^2 \le s.
%     \end{array}
%     \right.
% \end{equation}
% Moreover, the optimal solution $e\opt$ satisfies $e\opt_j = \sqrt{\frac{s}{k}} + \sigma$ for any  $j = 1, \ldots, k$.
% \end{proposition}
% \begin{proof}[Proof of Proposition~\ref{prop:max-e}]
% Let $e\opt \in \R^{k}_+$ be an optimal solution to the maximization problem~\eqref{opt:sum_log_max_subproblem}. Suppose there exist two indices $m$ and $n$ such that $e\opt_m \neq e\opt_n$. Consider $e'$ defined by
% \[
% e'_j = 
% \begin{cases}
% \half(e\opt_m + e\opt_n), & \text{if } j \in \{m,n\},\\
% e\opt_j, &\text{otherwise}.
% \end{cases}
% \]
% By the convexity of the function $x\mapsto (x-\sigma)^2$,
% \[  \left(e'_m - \sigma\right)^2 + \left(e'_n - \sigma\right)^2= 2\left(\frac{e\opt_m + e\opt_n}{2} - \sigma \right)^2 \le (e\opt_m - \sigma)^2 + (e\opt_n - \sigma)^2 , \]
% which implies that $e'$ is a feasible solution to problem~\eqref{opt:sum_log_max_subproblem}.
% Furthermore, since $e\opt_m \neq e\opt_n$, by the concavity of the function $x\mapsto \log x$, we have that 
% \[\log e\opt_m + \log e\opt_n < 2\log \left(\frac{e\opt_m + e\opt_n}{2}\right) = \log e'_m + \log e'_n,\]
% which violates the optimality of $e\opt$. Therefore, any optimal solution $e\opt$ must have all entries identical. Using this, we get from the constraint that
% \[ |e\opt_j - \sigma| \le \sqrt{\frac{s}{k}}\quad\forall j = 1,\dots, k. \] 
% By continuity of the objective and constraint functions, we must have
% \[ |e\opt_j - \sigma| = \sqrt{\frac{s}{k}}\quad\forall j = 1,\dots, k. \] 
% Since the objective function is increasing in $e\opt_j$, the optimal solution is given by
% \[ e\opt_j = \sigma + \sqrt{\frac{s}{k}} \quad\forall j = 1,\dots, k.\]
% The optimal value can then be obtained by direct computation. This completes the proof.
% \end{proof}


% \section{First-Order Algorithms} \label{sec:app:foa}

% \subsection{Optimistic Likelihood Problem} \label{sec:app:opp}

% For the optimistic likelihood problem, Theorem~\ref{thm:max} reduces the task to solving the 2-dimensional problem
%  \[
%     \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
%     \]
% By letting 
% \[
%     d_p = v_2 + \sigma, \quad \text{and} \quad a = v_1,
% \]
% we can obtain the equivalent form
% \be \label{eq:newprob}
%     \min_{\substack{v_1, v_2 \ge 0 \\ v_1^2 + v_2^2 \le \eps_0^2 }} ~ F(v),
% \ee
% where the objective function is given by
% \[
% F(v) = \log (v_2 + \sigma) + \frac{(\|\x - \wh x_i\|_2 - v_1)^2}{2(v_2 + \sigma)^2}  + (p-1) \log \sigma.
% \]
% If we denote by $\mathcal V = \{ v \in \R^2: v_1, v_2 \ge 0, v_1^2 + v_2^2 \le \eps_0^2 \}$ the feasible region of the above minimization problem, then the projection $\Proj_{\mathcal{V}}(v)$ can be computed in closed-form via
% \begin{equation*}
%     \Proj_{\mathcal{V}}(v) = \begin{cases}
%     v, &\text{if } v_1, v_2\ge 0, v_1^2 + v_2^2 \le \eps_0^2,\\
%     \frac{\eps_0}{\|v\|_2}v, & \text{if } v_1, v_2\ge 0, v_1^2 + v_2^2 > \eps_0^2,\\
%     (0,\eps_0)^\top, & \text{if } v_1 < 0, v_2 > \eps_0,\\
%     (0,v_2)^\top, & \text{if } v_1 < 0, 0\le v_2 \le \eps_0,\\
%     (\eps_0, 0)^\top, & \text{if } v_1 > \eps_0, v_2 < 0,\\
%     (v_1,0)^\top, & \text{if } 0\le v_1 \le \eps_0, v_2 < 0,\\
%     (0,0)^\top, &\text{if } v_1, v_2<0.
%     \end{cases}
% \end{equation*}
% % \viet{The above projection operator can be re-expressed in two steps:
% % \[
% %     \Proj_{\mathcal{V}}(v) = \Proj_{\mc B_{\eps_0}}(\Proj_{\R_+^2}(v)),
% % \]
% % which first projects onto $\R_+^2$, then projects onto the ball of radius $\eps_0$ around the origin.
% % \[
% % \Proj_{\mc B_{\eps_0}}(v) = \min\{1, \frac{\eps_0}{\|v\|_2}\} v =\eps_0 \times \min\{\frac{1}{\eps_0}, \frac{1}{\| v\|_2}\} v = \eps_0 \times \frac{1}{\max\{\eps_0, \|v\|_2\}} v.
% % \]}
% Algorithm~\ref{alg:pgd} is a projected gradient descent routine to solve problem~\eqref{eq:newprob}. The convergence guarantee for Algorithm~\ref{alg:pgd} follows from \citet[Theorem~10.15]{beck2017first}.


% \begin{algorithm}[h]
% 	\caption{Projected Gradient Descent Algorithm with Backtracking Line-Search}
% 	\label{alg:pgd}
% 	\begin{algorithmic}
% 		%\STATE {\bfseries Input:} Sample $\x \in \R^p$, Radius $\eps \in \R_+$, mean value $\msa \in \R^p$, smoothing variance $\sigma \in \R_+$, $\theta\in (0,1)$, $\beta>0$ 
% 		\STATE {\bfseries Algorithm parameters:} Line search parameters $\theta\in (0,1)$, $\beta>0$ 
% 		\STATE {\bfseries Initialization:} Set $ v^0 \leftarrow 0$
%         \FOR{$t = 0, 1, \ldots$}
%             \STATE Find the smallest integer $k\ge 0$ such that 
%             \begin{align*}
%                 & F\left( \Proj_{\mathcal{V}} (v^t - \theta^k \beta \nabla F(v^t)) \right) \le F(v^t )  - \frac{1}{2 \theta^k \beta} \| v^t - \Proj_{\mathcal{V}} (v^t - \theta^k \beta \nabla F(v^t)) \|_2^2
%             \end{align*}
%             \STATE Set $s^t = \theta^k \beta$ and set $v^{t+1} = \Proj_{\mc V}(v^t - s^t \nabla F(v^t))$.
%         \ENDFOR
% 		%\STATE{\bfseries Output:} $u^T$.
% 	\end{algorithmic}
% \end{algorithm}





% \subsection{Pessimistic Likelihood Problem} \label{sec:app:pes}

% For the pessimistic likelihood problem, Theorem~\ref{thm:min} reduces the task to solving the 2-dimensional problem
% \[
% \min_{\substack{a \in \R_+,~d_1 \in [ \sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x_i\|_2 + a)^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
% \]
% Note that the gradient of the objective function is a non-Lipschitz function. Worse still, the gradient is even undefined at the feasible point $(d_1, a) = (\sigma, \eps_1)$. These properties induce numerical issues for the optimization algorithm. Therefore, we solve the following perturbed problem instead:
% \begin{equation}\label{opt:max-KL-perturbed}
%     \min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x_i\|_2 + a)^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\zeta + \eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\},
% \end{equation}
% for some small $\zeta >0$. By \citet[Proposition~4.4]{ref:bonnans2013perturbation}, the optimal value of problem~\eqref{opt:max-KL-perturbed} is continuous in $\zeta$ and the optimal solution set is upper semi-continuous in $\zeta$ as a set-valued mapping, see \citet[Section~4.1]{ref:bonnans2013perturbation}.

% We now derive a projected gradient descent algorithm with backtracking line search for solving problem~\eqref{opt:max-KL-perturbed}. First, by letting
% \[d_1 = u_2 + \sigma, \quad \text{and} \quad a = \sqrt{p}u_1,\] 
% we can equivalently transform problem~\eqref{opt:max-KL-perturbed} to the following one:
% \begin{equation}\label{opt:max-KL-u}
%     \min_{\substack{u_1,u_2\ge 0\\ u_1^2 + u_2^2 \le (\eps_1/\sqrt{p})^2}} ~ F(u), 
% \end{equation}
% where the objective function is given by
% \[
% F(u) = -\log (u_2 + \sigma) - \frac{1}{2(u_2 + \sigma)^2} (\|\x - \wh x_i\|_2 + \sqrt{p} u_1 )^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\zeta + \eps_1^2 - p u_1^2 - u_2^2}{p - 1}}\right) .
% \]
% The upshot of problem~\eqref{opt:max-KL-u} is that the feasible region is the intersection of the non-negative orthant with a circular disk of radius $\eps_1/\sqrt{p}$ centered at the origin. As we will see below, this enables easy computation of the projection and linear optimization oracle. 
% Indeed, denoting by $\mathcal{U} = \{ u\in\R^2: u_1, u_2\ge 0, u_1^2 + u_2^2 \le (\eps_1/\sqrt{p})^2 \}$ the feasible region of problem~\eqref{opt:max-KL-u}, the projection $\Proj_{\mathcal{U}}(u)$ can be computed in closed-form via
% \begin{equation*}
%     \Proj_{\mathcal{U}}(u) = \begin{cases}
%     u, &\text{if } u_1, u_2\ge 0, u_1^2 + u_2^2 \le (\eps_1/\sqrt{p})^2,\\
%     \frac{(\eps_1 / \sqrt{p})}{\|u\|_2}u, & \text{if } u_1, u_2\ge 0, u_1^2 + u_2^2 > (\eps_1/\sqrt{p})^2,\\
%     (0,\frac{\eps_1}{\sqrt{p}})^\top, & \text{if } u_1 < 0, u_2 > \frac{\eps_1}{\sqrt{p}},\\
%     (0,u_2)^\top, & \text{if } u_1 < 0, 0\le u_2 \le \frac{\eps_1}{\sqrt{p}},\\
%     (\frac{\eps_1}{\sqrt{p}}, 0)^\top, & \text{if } u_1 > \frac{\eps_1}{\sqrt{p}}, u_2 < 0,\\
%     (u_1,0)^\top, & \text{if } 0\le u_1 \le \frac{\eps_1}{\sqrt{p}}, u_2 < 0,\\
%     (0,0)^\top, &\text{if } u_1,u_2<0.
%     \end{cases}
% \end{equation*}
% A projected gradient descent algorithm can now be employed to solve problem~\eqref{opt:max-KL-u}.

% \section{Recovery of the Adversarial Distribution} \label{sec:recovery}

% It is often instructive to recover and analyze the optimal distribution that maximizes the posterior probability odds ratio, or more directly, the likelihood ratio in~\eqref{eq:dro}. Equivalently, it suffices to characterize the distribution $\QQ_0\opt$ that maximizes~\eqref{eq:likelihood-max}, and the distribution $\QQ_1\opt$ that minimizes~\eqref{eq:likelihood-min}.

% \begin{lemma}[Likelihood maximizer] \label{lemma:max-dist}
%     For each $i \in \mc I_0$, let $(a_i\opt, d_{pi}\opt)$ be the optimal solution of the following two-dimensional optimization problem
%      \[
%     \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
%     \]
%     Then, the maximizer $\QQ_0\opt$ of problem~\eqref{eq:likelihood-max} is a Gaussian mixture with $N_0$ components, and for $i \in \mc I_0$, the $i$-th component has mean
%     \[
%         \m_i\opt =  \frac{a_i\opt}{\|x - \wh x_i \|_2 } x + \left(1 - \frac{a_i\opt}{\|x - \wh x_i \|_2 }\right) \wh x_i, 
%     \]
%     and covariance matrix 
%     \[ \cov_i\opt = V_i\opt \diag( \sigma, \dots, \sigma, d_{pi}\opt )^2 (V_i\opt)^\top,\]
%     where $V_i\opt$ is any orthogonal matrix with the $p$-th column given by $\frac{x - \m_i\opt}{\|x - \m_i\opt\|_2}$.
% \end{lemma}

% \begin{proof}[Proof of Lemma~\ref{lemma:max-dist}]
% The result follows directly by inspecting the proofs of Proposition~\ref{prop:max-Wass}, Lemma~\ref{lemma:optimal_V} and Lemma~\ref{lemma:quadratic}.
% \end{proof}


% \begin{lemma}[Likelihood minimizer] \label{lemma:min-dist}
%     For each $i \in \mc I_1$, let $(a_i\opt, d_{1i}\opt)$ be the optimal solution of the following two-dimensional optimization problem
%     \begin{align*}
%     &\min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{(\|\x - \wh x_i\|_2 + a)^2}{2d_1^2} - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
%     \end{align*}
%     Then, the minimizer $\QQ_1\opt$ of problem~\eqref{eq:likelihood-min} is a Gaussian mixture with $N_1$ components, and for $i \in \mc I_1$, the $i$-th component has mean
%     \[
%         \m_i\opt =  - \frac{a_i\opt}{\|x - \wh x_i \|_2 } x + \left(1 + \frac{a_i\opt}{\|x - \wh x_i \|_2 }\right) \wh x_i, 
%     \]
%     and covariance matrix 
%     \[ \cov_i\opt = V_i\opt \diag\left( d_{1i}\opt, \sigma + \sqrt{\frac{\eps_1^2 - {a_i\opt}^2 - (d_{1i}\opt - \sigma)^2}{p-1}}, \dots, \sigma + \sqrt{\frac{\eps_1^2 - {a_i\opt}^2 - (d_{1i}\opt - \sigma)^2}{p-1}} \right)^2 (V_i\opt)^\top,\]
%     where $V_i\opt$ is any orthogonal matrix with the $1$st column given by $\frac{x - \m_i\opt}{\|x - \m_i\opt\|_2}$.
% \end{lemma}
% \begin{proof}[Proof of Lemma~\ref{lemma:min-dist}]
% The result follows directly by inspecting the proofs of Proposition~\ref{prop:min-Wass}, Lemma~\ref{lemma:optimal_V} and Lemma~\ref{lemma:quadratic}.
% \end{proof}


% \begin{figure}
%     \centering
%     \includegraphics[width=0.6\linewidth]{image/illus_3.png}
%     \caption{Visualization of the worst-case distributions on a toy dataset, color codes are similar to Figure~\ref{fig:illustration}. The dashed, opaque dots and circles represent the isotropic Gaussian around each data sample. The solid dots and circles represent the worst-case distributions corresponding to the boundary point $x^b$. For blue (unfavorably predicted) samples, the worst-case distribution is formed by perturbing the distribution towards $x^b$ -- which leads to maximizing the posterior probability of unfavorable prediction. For green (favorably predicted) samples, the worst-case distribution is formed by perturbing the distribution away from $x^b$ -- which leads to minimizing the posterior probability of favorable prediction. These worst-case distributions will maximize the posterior probability odds ratio.}
%     \label{fig:illus3}
% \end{figure}


% The following convergence guarantee for Algorithm~\ref{alg:pgd} follows from \citet[Theorem~10.15]{beck2017first}.
% \begin{theorem}
% Let $\{u^t\}_{t = 0,1,\dots, T}$ be the sequence generated by Algorithm~\ref{alg:pgd}. Then, all limit points of the sequence $\{u^t\}_{t = 0,1,\dots, T}$ are stationary points of problem~\eqref{opt:max-KL-u}. Furthermore, there exists some constant $C_1 >0$ such that for any $T \ge 1$,
% \[ \min_{t = 0,1,\dots,T} \frac{\left\| u^t - \Proj_{\mathcal{U}} \left(u^t - \beta \nabla F (u^t) \right)\right\|_2}{\beta} \le \frac{C_1}{\sqrt{T}}. \]
% \end{theorem}
% \begin{algorithm}[h]
% 	\caption{Projected Gradient Descent Algorithm with Backtracking Line-Search}
% 	\label{alg:pgd}
% 	\begin{algorithmic}
% 		%\STATE {\bfseries Input:} Sample $\x \in \R^p$, Radius $\eps \in \R_+$, mean value $\msa \in \R^p$, smoothing variance $\sigma \in \R_+$, $\theta\in (0,1)$, $\beta>0$ 
% 		\STATE {\bfseries Algorithm parameters:} Line search parameters $\theta\in (0,1)$, $\beta>0$ 
% 		\STATE {\bfseries Initialization:} Set $ u^0 \leftarrow 0$
%         \FOR{$t = 0, 1, \ldots$}
%             \STATE Find the smallest integer $k\ge 0$ such that 
%             \begin{align*}
%                 & F\left( \Proj_{\mathcal{U}} (u^t - \theta^k \beta \nabla F(u^t)) \right) \le F(u^t )  - \frac{1}{2 \theta^k \beta} \| u^t - \Proj_{\mathcal{U}} (u^t - \theta^k \beta \nabla F(u^t)) \|_2^2
%                 %\le&  F(u^t ) - \nabla F(u^t)^\top \left( u^t - \Proj_{\mathcal{U}} (u^t - \theta^k \beta \nabla F(u^t)) \right) + \frac{1}{2 \theta^k \beta} \| u^t - \Proj_{\mathcal{U}} (u^t - \theta^k \beta \nabla F(u^t)) \|_2^2 .
%             \end{align*}
%             \STATE Set $s^t = \theta^k \beta$. 
%             %\WHILE 
%             \STATE Set $u^{t+1} = \Proj_{\mc U}(u^t - s^t \nabla F(u^t))$.
%         \ENDFOR
% 		\STATE{\bfseries Output:} $u^T$.
% 	\end{algorithmic}
% \end{algorithm}


% \newpage
% \begin{algorithm}[h]
% 	\caption{Projected Gradient Descent Algorithm with Fixed Stepsize}
% 	\label{alg:pgd-fixed}
% 	\begin{algorithmic}
% 		%\STATE {\bfseries Input:} Sample $\x \in \R^p$, Radius $\eps \in \R_+$, mean value $\msa \in \R^p$, smoothing variance $\sigma \in \R_+$, $\theta\in (0,1)$, $\beta>0$ 
% 		\STATE {\bfseries Algorithm parameters:} Iteration $T$
% 		\STATE {\bfseries Initialization:} Set $ u^0 \leftarrow 0$
%         \FOR{$t = 0, 1, \ldots, T - 1$}
%             \STATE Set $u^{t+1} = \Proj_{\mc U}(u^t - \frac{1}{\sqrt{T}} \nabla F(u^t))$.
%         \ENDFOR
% 		\STATE{\bfseries Output:} $u^T$.
% 	\end{algorithmic}
% \end{algorithm}




%\viet{remember to use multiple retrial}

% For the Frank-Wolfe algorithm, at each iteration, we need to solve a subproblem of the form
% \begin{equation}
%     \label{opt:FW-oracle}
%     \min_{u\in \mathcal{U}} g^\top u,
% \end{equation}
% where $g$ is the gradient of the objective function $F$ at the current iterate. Problem~\eqref{opt:FW-oracle} is called the linear optimization oracle (also called the Frank-Wolfe oracle) of problem~\eqref{opt:max-KL-perturbed} and admits a closed-form solution given by
% \begin{equation}\label{eq:FW-subproblem-opt}
%     u\opt = \begin{cases}
%     -\frac{\eps}{\sqrt{p}}\frac{g}{\| g \|_2}, &\text{if } g_1,g_2 \le 0,\\
%     (0,0)^\top , &\text{if } g_1,g_2 >0,\\
%     (0,\frac{\eps}{\sqrt{p}})^\top  , &\text{if } g_1 >0, g_2\le 0,\\
%     (\frac{\eps}{\sqrt{p}}, 0)^\top  , &\text{if } g_1 \le 0, g_2> 0.
%     \end{cases}
% \end{equation}
% Note that for problem~\eqref{opt:max-KL-perturbed}, the curvature constant is finite, see~\citet[Equations~(4) and (5)]{ref:lacoste2016convergence} for details. Therefore, we have the following convergence guarantee from~\citet[Theorem~1]{ref:lacoste2016convergence}.
% \begin{theorem}\label{thm:FW}
% Let $\{u^t\}_{t = 0,1,\dots, T}$ be the sequence generated by Algorithm~\ref{alg:FW}. Then, there exists some constant $C_2 >0$ such that for any $T \ge 1$,
% \[ \min_{t = 0,1,\dots,T} c_t \le \frac{C_2}{\sqrt{T}}. \]
% \end{theorem}
% \noindent As pointed out by~\citet{ref:lacoste2016convergence}, the quantity $c_t$ serves as a proxy for measuring local optimality: if $u^t$ lies in a convex subset $\mathcal{U}'\subseteq \mathcal{U}$ on which $F$ is convex, then $c_t \ge F(u^t) - \min_{u \in \mathcal{U}'} F(u)$ by convexity.

% % Define \[e = \sqrt{\frac{\zeta + \eps^2 - pu_1^2 - u_2^2}{p-1}}. \] 
% % Then, the gradient is given by
% % \begin{align*}
% %     \frac{\partial F}{\partial u_1} & = - \frac{p u_1 + \sqrt{p} \| \x - \msa \|_2}{(u_2 + \sigma)^2} + \frac{p u_1 }{e\left(e +\sigma \right)},\\
% %     \frac{\partial F}{\partial u_2} & = \frac{\left(\sqrt{p} u_1 +  \| \x - \msa \|_2\right)^2}{(u_2 + \sigma)^3} - \frac{1}{u_2+\sigma} + \frac{u_2 }{e \left(  e +\sigma \right)}.
% % \end{align*}

% % \khiem{@MC: Can we sequentially project into the non-negative orthant then the circular disk? Also, is it necessary to consider $u > \eps/\sqrt{p}$?}

% % \viet{For Frank-Wolfe algorithm, we need to find the direction by solving
% % \[
% % u\opt = \arg\min_{\substack{u_1,u_2\ge 0\\ u_1^2 + u_2^2 \le (\eps/\sqrt{p})^2}} ~ u_1 \times \frac{\partial F(u^t)}{\partial u_1} + u_2 \times \frac{\partial F(u^t)}{\partial u_2}
% % \]
% % and the next solution is
% % \[
% %     u^{t+1} = u^t + \alpha^t (u\opt - u^t)
% % \]
% % for some $\alpha^t \in [0, 1]$.
% % The correct algorithm is Algorithm 1 of \cite{ref:lacoste2016convergence}. There's also some guarantee (to stationary point) in this paper.}


% \begin{algorithm}[th]
% 	\caption{Frank-Wolfe Algorithm}
% 	\label{alg:FW}
% 	\begin{algorithmic}
% 		\STATE {\bfseries Input:} Boundary sample $\x_b \in \R^p$, Radius $\eps_0, \eps_1 \in \R_+$, smoothing variance $\sigma \in \R_+$, a large constant $C>0$, accuracy $\epsilon >0$.
% 		\STATE {\bfseries Initialization:} Set $t \leftarrow 0$, $ x^t \leftarrow x^b$
%         \FOR{$t = 0, 1, \ldots$}
%             \STATE Compute 
%             \[s^t = \Argmin_{x: c(x, x_0) \le \delta} \nabla F(x^t)^\top x\]
%             \STATE Set $z^t = s^t - x^t$.
%             \STATE Compute $c_t = - \nabla F(x^t)^\top z^t$.
%             \IF{$c_t \le \epsilon$}
%             \STATE {\bfseries Return } $x^t$
%             \ENDIF
%             \STATE Set $\alpha^t = \min\{ \frac{c_t}{C} ,1\}$.
%             %\WHILE 
%             \STATE Set $x^{t+1} = x^t + \alpha^t z^t$.
%         \ENDFOR
% 		%\STATE{\bfseries Output:} $u^T$.
% 	\end{algorithmic}
% \end{algorithm}

\end{document}
