% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{mathtools} % amsmath with fixes and additions
% % \usepackage{siunitx} % for proper typesetting of numbers and units
% \usepackage{booktabs} % commands to create good-looking tables
% \usepackage{tikz} % nice language for creating drawings and diagrams

% \input{style}
\usepackage{graphicx}

\usepackage{float,epstopdf}
\usepackage{bbm}

\usepackage{microtype}

% \usepackage{subfig}
% \usepackage{subfigure}
\usepackage{subcaption}
\usepackage{booktabs}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{multicol}
\usepackage{multirow} 
\usepackage{amsfonts} 
\usepackage{mathrsfs}
\usepackage{fancyhdr}
\usepackage[amssymb, thickqspace]{SIunits}
\usepackage{enumitem}
\usepackage{pgfplotstable}
\usepackage{arydshln}

\usepackage{cases}


\usepackage{algorithm}
\usepackage{algorithmic}
\renewcommand{\algorithmicrequire}{\textbf{Input:}} 
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\usepackage{url}
\def\UrlBreaks{\do\/\do-}
\usepackage{xr}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments in
% reverse order, joined by the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \input{comments}

% for underbar
% \munderbar{<math>}: underlines <math> with the rule placed at the baseline.
% The argument is typeset in math mode into scratch box 2 (\tw@), the depth of
% that box is forced to zero (\dp\tw@\z@), and the box is then handed to
% \underline, so descenders do not push the rule down.
% \makeatletter/\makeatother are needed because \tw@ and \z@ contain "@".
\makeatletter
\def\munderbar#1{\underline{\sbox\tw@{$#1$}\dp\tw@\z@\box\tw@}}
\makeatother

% Theorem-like environments.
% `definition` owns the counter and is numbered within each section; every
% other environment below shares that counter through the [definition]
% optional argument, so all statements are numbered in a single sequence.
\newtheorem{definition}{Definition}[section]
\newtheorem{theorem}[definition]{Theorem}
\newtheorem{lemma}[definition]{Lemma}
\newtheorem{remark}[definition]{Remark}
\newtheorem{corollary}[definition]{Corollary}
\newtheorem{proposition}[definition]{Proposition}
\newtheorem{observation}[definition]{Observation}
\newtheorem{claim}[definition]{Claim}
\newtheorem{example}[definition]{Example}
\newtheorem{assumption}[definition]{Assumption}
%==================================
% Inline reviewer/author comment macros: each prints its argument in bold,
% colored text, prefixed by the commenter's name (\note has no prefix).
% NOTE(review): \color is presumably provided by a color package loaded
% indirectly (e.g. through todonotes or pgfplotstable) -- confirm.
\newcommand{\tduy}[1]{\textbf{{\color{red}(Tuan-Duy: #1)}}}
\newcommand{\viet}[1]{\textbf{{\color{magenta}(Viet says: #1)}}}
\newcommand{\note}[1]{\textbf{{\color{red}(#1)}}}
\newcommand{\MC}[1]{\textbf{{\color{blue}(MC says: #1)}}}

%======================================
% Shorthands for display-math environments:
%   \be ... \ee    open/close a numbered equation;
%   \bea ... \eea  open/close an unnumbered equation* wrapping an aligned block.
% \ds abbreviates \displaystyle.
\newcommand{\be}{\begin{equation}}
\newcommand{\ee}{\end{equation}}
\newcommand{\bea}{\begin{equation*}\begin{aligned}}
\newcommand{\eea}{\end{aligned}\end{equation*}}
\newcommand{\ds}{\displaystyle}
% \independent: statistical-independence relation (a doubled \perp).
% \mathpalette passes the current math style (#1) and the symbol (#2) to the
% helper \independenT, which overprints two copies of the symbol offset by 2mu
% and wraps the result as a relation.
% The helper was referenced here but never defined in this preamble, so any use
% of \independent stopped with "Undefined control sequence". \providecommand
% supplies it without overriding a definition that may already exist elsewhere.
\providecommand{\independenT}[2]{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
\newcommand{\independent}{\protect\mathpalette{\protect\independenT}{\perp}}
% \R: blackboard-bold R.
\newcommand{\R}{\mathbb{R}}

% Optimization operators with the subscript forced underneath (\limits).
% Each macro ends in "_" so the next group becomes the subscript,
% e.g. \Max{V \in \mathrm{O}(p)}.
\newcommand{\Max}{\max\limits_}
\newcommand{\Min}{\min\limits_}
\newcommand{\Sup}{\sup\limits_}
\newcommand{\Inf}{\inf\limits_}
% \Tr{<A>}: the \Trace operator followed by <A> in \big square brackets.
\newcommand{\Tr}[1]{\Trace \big[ #1 \big]}

% Font/accent abbreviations; they take the following token or group as argument.
\newcommand{\wh}{\widehat}
\newcommand{\mc}{\mathcal}
\newcommand{\mbb}{\mathbb}
% \inner{<a>}{<b>}: <a>, <b> enclosed in \big angle brackets.
\newcommand{\inner}[2]{\big \langle #1, #2 \big \rangle }

% Blackboard-bold letters P, hatted P, Q, D, and an upright "d".
\newcommand{\PP}{\mbb P}
\newcommand{\Pnom}{\wh{\mbb P}}
\newcommand{\QQ}{\mbb Q}
\newcommand{\DD}{\mbb D}
\newcommand{\dd}{\mathrm{d}}




%\newcommand{\est}{X\opt}
%\newcommand{\estL}{\lambda\opt}
%\newcommand{\estx}{x\opt}

% Named math operators, typeset upright with operator spacing.
% \Trace is the operator used by the \Tr macro.
\DeclareMathOperator{\Trace}{Tr}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\Diag}{Diag}
\DeclareMathOperator{\sign}{sign}

% \st prints "s.t." as an operator; it labels constraint rows in array-style
% optimization problems.
\DeclareMathOperator{\st}{s.t.}
\DeclareMathOperator{\nullspace}{null}
\DeclareMathOperator{\vect}{vec}
\DeclareMathOperator{\KL}{KL}
\DeclareMathOperator{\FR}{FR}
% Starred form: subscripts of \Argmin are set underneath in display style.
\DeclareMathOperator*{\Argmin}{Argmin}
\DeclareMathOperator{\Minval}{Min}
\DeclareMathOperator{\grad}{grad}
\DeclareMathOperator{\Hess}{Hess}
\DeclareMathOperator{\Proj}{Proj}

% \defeq: relation symbol built by stacking a scriptscript-size triangle on top
% of an equals sign. The two rows are set in a one-column \ialign whose cell is
% centered (\hfil##\hfil) with interline glue switched off, and the whole stack
% is wrapped in \mathrel so it gets relation spacing.
\newcommand{\defeq}{%
	\mathrel{\vbox{\offinterlineskip\ialign{%
				\hfil##\hfil\cr
				$\scriptscriptstyle\triangle$\cr
				%\noalign{\kern0ex}
				$=$\cr
}}}}

% Peyman's comments
% Notation shorthands used in the statements and proofs below.
\newcommand{\PSD}{\mathbb{S}_{+}} % the set of positive semi-definite matrices of dimension p
\newcommand{\PD}{\mathbb{S}_{++}} % presumably the set of positive definite matrices of dimension p ("++" subscript); the earlier comment repeated "semi-definite" -- confirm
% \PDsigma: S with subscript ">= sigma"; the proofs pair it with the constraint \cov \succeq \sigma^2 I.
\newcommand{\PDsigma}{\mathbb{S}_{\ge \sigma}}
\newcommand{\Let}{\triangleq}%https://www.overleaf.com/project/5ea9d406e0687e00012e28ab
% \opt appends a star superscript (marks optimal solutions, e.g. d\opt_j).
\newcommand{\opt}{^\star}
\newcommand{\eps}{\varepsilon}
\newcommand{\ra}{\rightarrow}
\newcommand{\M}{\mc M}
\newcommand{\BB}{\mbb B}
\newcommand{\B}{\mc B}
% \X, \Y and \x expand to the plain letters; they exist only as a notation indirection.
\newcommand{\X}{X}
\newcommand{\Y}{Y}
% \Wass, \V and \EE use the double-struck \mathds alphabet (dsfont package).
\newcommand{\Wass}{\mathds{W}}
\newcommand{\V}{\mathds{W}_S}
% \Q produces the same symbol as \QQ (blackboard-bold Q).
\newcommand{\Q}{\mbb{Q}}
\newcommand{\EE}{\mathds{E}}
\newcommand{\x}{x}


\newcommand{\half}{\frac{1}{2}}
\newcommand{\dualvar}{\gamma}


% \ie prints an emphasized "i.e."; no trailing space or comma is added.
\newcommand{\ie}{{\em i.e.}}
\newcommand{\da}{\downarrow}
\newcommand{\J}{\mc J}
\newcommand{\Dataset}{\wh{\mc D}}
% \m and \cov are the mean (mu) and covariance (Sigma) symbols; \msa and \covsa
% are their hatted versions.
\newcommand{\m}{\mu}
\newcommand{\cov}{\Sigma}
\newcommand{\msa}{\wh \m}
\newcommand{\covsa}{\wh \cov}
\newcommand{\bayes}{\mathrm{Bayes}}

%% xr preamble
% Helpers for cross-referencing labels of another document with the xr package.
%
% \addFileDependency{<file>}: writes "(<file>)" to the terminal/log, appends
% <file> to LaTeX's file list via \@addtofilelist, and prints "No file <file>."
% when the file does not exist.
% NOTE(review): presumably this lets build tools that scan the log pick up the
% external file as a dependency -- confirm against the build setup.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<base>}: imports the labels of <base> with
% \externaldocument and registers both <base>.tex and <base>.aux as dependencies.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% Labels such as sec:compute or lemma:separability are not defined in the
% visible part of this file; they are presumably resolved through this
% external document.
\myexternaldocument{nguyen_12}

\title{Robust Bayesian Recourse (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Tuan-Duy H. Nguyen}
\author[1]{Ngoc Bui}
\author[1]{Duy Nguyen}
\author[2]{Man-Chung Yue}
\author[1]{Viet Anh Nguyen}
% Add affiliations after the authors
\affil[1]{%
    VinAI Research, Vietnam
}
\affil[2]{%
    The University of Hong Kong
}
  
\begin{document}
\onecolumn
\maketitle

\appendix

\section{Proofs of Section~\ref{sec:compute}}

\newtheorem*{lemma:separability}{Lemma~\ref{lemma:separability} (re-stated)}
\begin{lemma:separability}
    There exists a distribution~$\QQ_0\opt$ that solves~\eqref{eq:likelihood-max} and is a mixture of at most $N_0$ Gaussian components. Moreover, problem~\eqref{eq:likelihood-max} is equivalent to a separable problem of the form
    \begin{align*}
        \max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\} = \left\{\begin{array}{cll}
            \max &  \frac{1}{N_0} \sum_{i \in \mc I_0} f(\x | \m_i, \cov_i) \\ 
            \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
            & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0 & \forall i \in \mc I_0.
        \end{array} \right.
    \end{align*}
    An analogous result holds for problem~\eqref{eq:likelihood-min} with the corresponding subscript $y=1$.
\end{lemma:separability}
\begin{proof}[Proof of Lemma~\ref{lemma:separability}]
    There exists a distribution~$\QQ_0\opt$ that solves~\eqref{eq:likelihood-max} and is a mixture of at most $N_0$ Gaussian components. Moreover, problem~\eqref{eq:likelihood-max} is equivalent to a separable problem of the form
    \begin{align*}
    &\max~\{ L(\x, \QQ_0) : \QQ_0 \in \mbb B_{\eps_0}(\Pnom_0^\sigma)\} \\
    =& \left\{\begin{array}{cll}
        \max &  \frac{1}{N_0} \sum_{i \in \mc I_0} f(\x | \m_i, \cov_i) \\ 
        \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
        & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0 & \forall i \in \mc I_0.
    \end{array} \right.
    \end{align*}
    
    Throughout this proof, $\forall i$ is shorthand for $\forall i \in \mc I_0$, and $\sum_{i}$ denotes summation over the same index set. Given any $\x$, the likelihood of $\x$ under any Gaussian mixture $\QQ_0$ can be written using the corresponding measure $\nu_0$ as
    \[
        L(\x, \QQ_0) = \int_{\R^p \times \PSD^p} f(\x | \m, \cov) \nu_0(\mathrm{d} \m, \mathrm{d} \cov).
    \]
    Recall that $\Xi = \R^p \times \PDsigma^p$. Using the definition of the type-$\infty$ Wasserstein distance, we find
    \begin{align*}
        &\ \Wass_c(\nu_0, \wh \nu_0) \le \eps_0 \\
        \Leftrightarrow &\  \exists\lambda\in \Lambda(\nu_0, \wh \nu_0) \text{ such that} \\
        &\  \mathrm{ess} \Sup{\lambda} \big\{ c((\m, \cov), (\m', \cov')) : (\m, \cov, \m', \cov')  \in \Xi \times \Xi \big\} \le \eps_0\\
        \Leftrightarrow &\  \forall i\  \exists \lambda_i \in \mc P(\Xi) \text{ such that} \\
        &\ \mathrm{ess} \Sup{\lambda_i} \big\{ c((\m, \cov), (\wh x_i, \sigma^2 I)) : (\m, \cov)  \in \Xi  \big\} \le \eps_0\\
        \Leftrightarrow &\  \forall i\  \exists \lambda_i \in \mc P(\Xi) \text{ such that}\\
        &\  c((\m,\cov), (\wh x_i, \sigma^2 I)) \le \eps_0 \quad \forall (\m, \cov)\in \mathrm{supp}(\lambda_i),
    \end{align*}
    where the second equivalence follows from the fact that $\wh \nu_0 = \frac{1}{N_0} \sum_{i} \delta_{(\wh x_i, \sigma^2 I)}$ and hence any $\lambda \in \Lambda(\nu_0, \wh \nu_0)$ takes the form $\frac{1}{N_0} \sum_{i} \lambda_i \otimes \delta_{(\wh x_i, \sigma^2 I)}$ for some probability measures $\lambda_i \in \mc P(\Xi)$, and the third equivalence follows from Lemma~\ref{lem:sup_esssup}.
    % \begin{align*}
    %     &\{ \nu_0 \in \mc P(\Xi): \Wass_c(\nu_0, \wh \nu_0) \le \eps_0 \} \\
    %     =& \left\{ \nu_0 \in \mc P(\Xi): 
    %     \begin{array}{l}
    %     \exists \lambda \in \Lambda(\nu_0, \wh \nu_0) \text{ such that}: \\
    %     \mathrm{ess} \Sup{\lambda} \big\{ c((\m, \cov), (\m', \cov')) : (\m, \cov, \m', \cov')  \in \Xi \times \Xi \big\} \le \eps_0
    %     \end{array}
    %     \right\} \\
    %     =& \left\{ \nu_0 \in \mc P(\Xi): 
    %     \begin{array}{l}
    %     \exists \lambda_i \in \mc P(\Xi) \quad \forall i \text{ such that}: \nu_0 = \frac{1}{N_0} \sum_{i} \lambda_i\\
    %     \mathrm{ess} \Sup{\frac{1}{N_0} \sum_{i} \lambda_i \otimes \delta_{(\wh x_i, \sigma^2 I)}} \big\{ c((\m, \cov), (\m', \cov')) : (\m, \cov, \m', \cov')  \in \Xi \times \Xi \big\} \le \eps_0
    %     \end{array}
    %     \right\},
    % \end{align*}
    % where in the second equality, we exploited the fact that $\wh \nu_0 = \frac{1}{N_0} \sum_{i} \delta_{(\wh x_i, \sigma^2 I)}$, and that any $\lambda \in \Lambda(\nu_0, \wh \nu_0)$ can be written using law of conditional distribution as $\frac{1}{N_0} \sum_{i} \lambda_i \otimes \delta_{(\wh x_i, \sigma^2 I)}$ for some collection of probability measures $\lambda_i \in \mc P(\Xi)$ satisfying $\nu_0 = \frac{1}{N_0} \sum_{i} \lambda_i$. Notice that the essential supremum constraint can now be written as
    % \[
    %     c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0  \quad \forall (\m_i, \cov_i) \in \mathrm{supp}(\lambda_i) \qquad \forall i,
    % \]
    % where $\mathrm{supp}(\lambda_i)$ denotes the support of the probability measure $\lambda_i$~\citep[pp.~441]{ref:aliprantis06hitchhiker}. 
    Hence, problem~\eqref{eq:likelihood-max} is equivalent to
    \begin{align*}
    &\left\{
    \begin{array}{cl}
        \max &  \int_{\R^p \times \PDsigma^p} f(\x | \m, \cov) \nu_0(\mathrm{d} \m, \mathrm{d} \cov) \\
        \st & \nu_0 \in \mc P(\R^p \times \PDsigma^p) \\
        & \Wass_c(\nu_0, \wh \nu_0) \le \eps_0
    \end{array}
    \right. \\
    =&\left\{
    \begin{array}{cl}
        \max &  \frac{1}{N_0} \sum_{i} \int_{\R^p \times \PDsigma^p} f(\x | \m_i, \cov_i) \lambda_i(\mathrm{d} \m_i, \mathrm{d} \cov_i) \\
        \st & \lambda_i \in \mc P(\R^p \times \PDsigma^p) \qquad \forall i\\
        & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0  \quad \forall (\m_i, \cov_i) \in \mathrm{supp}(\lambda_i) \qquad \forall i.
    \end{array}
    \right.
    \end{align*}
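    Since the objective is separable in the measures $\lambda_i$ and each integral is bounded above by the supremum of $f(\x | \cdot, \cdot)$ over the feasible support of $\lambda_i$, a greedy argument shows that the optimal solution for each $\lambda_i$ should be a Dirac delta distribution supported on one point in the space of $\R^p \times \PDsigma^p$. This leads to the conclusion regarding the maximization problem~\eqref{eq:likelihood-max}. 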
    
    A similar argument can be applied to the minimization problem~\eqref{eq:likelihood-min}; the detailed proof is omitted.
\end{proof}

\begin{lemma}
    \label{lem:sup_esssup}
    For any $\lambda\in \mc P (\Xi)$, $\wh x\in \R^p$, $\sigma ,\eps >0$ and any function $c:\Xi\times \Xi \to \R$ such that the map $(\m, \cov) \mapsto c( (\m,\cov), (\wh x, \sigma^2 I))$ is continuous, we have $\mathrm{ess}\sup_{\lambda} c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$ if and only if $c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$ for any $(\m, \cov)\in \mathrm{supp}(\lambda)$.
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lem:sup_esssup}]
We first prove the ``only if'' direction. Suppose that there exists $(\m', \cov')\in \mathrm{supp}(\lambda)$ such that 
\[c ((\m', \cov'), (\wh x, \sigma^2 I)) > \eps .\]
By continuity of the map $(\m, \cov) \mapsto c( (\m,\cov), (\wh x, \sigma^2 I))$, there exists an open neighbourhood $U\subseteq \Xi$ containing $(\m', \cov')$ such that
\[ c ((\m, \cov), (\wh x, \sigma^2 I)) > \eps\quad\forall (\m,\cov) \in U . \]
By the definition of the support, $\lambda (U) > 0$. Therefore,
\[ \Pr_{\lambda} ( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps ) = 1- \Pr_{\lambda} ( c((\m, \cov), (\wh x, \sigma^2 I)) > \eps ) \le 1 - \lambda (U) < 1, \]
which contradicts the assumption that $\mathrm{ess}\sup_{\lambda} c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$.

We next prove the ``if'' direction. By the law of total probability and the fact that $c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps$ for any $(\m, \cov)\in \mathrm{supp}(\lambda)$,
\begin{align*}
    & \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps \right) \\
    = & \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps | (\m,\cov)\in\mathrm{supp}(\lambda) \right)\lambda (\mathrm{supp}(\lambda)) \\
    &\quad + \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps | (\m,\cov)\not\in\mathrm{supp}(\lambda) \right) (1- \lambda( \mathrm{supp}(\lambda)))\\
    = & 1\cdot 1 + \Pr_\lambda \left( c((\m, \cov), (\wh x, \sigma^2 I)) \le \eps | (\m,\cov)\not\in\mathrm{supp}(\lambda) \right)\cdot 0 = 1,
\end{align*}
which completes the proof.
\end{proof}

\newtheorem*{prop:max-Wass}{Proposition~\ref{prop:max-Wass} (re-stated)}
\begin{prop:max-Wass}
    Fix any index $i \in \mc I_0$. For any $\wh x_i \in \R^p$, $\x \in \R^p$ and $\eps_0 \in \R_+$, we have
    \[
         \frac{\exp(-\alpha_i)}{(2\pi)^{p/2}} = \left\{
            \begin{array}{cl}
            \max & f(\x| \m_i, \cov_i) \\
            \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
             &c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0, 
            \end{array}
         \right.
    \]
    where $\alpha_i$ is the optimal value of the two-dimensional optimization problem
     \[
        \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
    \]
\end{prop:max-Wass}
\begin{proof}[Proof of Proposition~\ref{prop:max-Wass}] Let $\alpha_i$ be the optimal value of the negative log-likelihood minimization problem
\[
    \alpha_i = \left\{ \begin{array}{cl}
        \min & \half \log \det \Sigma_i + \half (\x - \m_i)^\top \cov_i^{-1} (\x - \m_i)  \\
        \st & \m_i \in \R^p,~\cov_i \in \PSD^p \\
        & \| \m_i - \wh x_i \|_2^2 + \Tr{\cov_i + \sigma^2 I - 2 \big( (\sigma^2 I)^{\half} \cov_i (\sigma^2 I)^{\half} \big)^{\frac{1}{2}} } \leq \eps_0^2 \\
        & \cov_i \succeq \sigma^2 I.
    \end{array}
    \right.
\]
It is easy to see that
\[
    \max\{ f(\x| \m_i, \cov_i) : (\m_i, \cov_i) \in \R^p \times \PDsigma^p,~c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_0\} = \frac{1}{\sqrt{(2\pi)^p}} \exp(-\alpha_i ).
\]
It remains to provide a simpler formulation to determine $\alpha_i$. To simplify the notation, we omit the index $i$ on all variables and parameters.
We reparameterize $\cov = V \diag(d^2) V^\top$ for a vector $d \in \R_+^p$ and an orthogonal matrix $V \in \mathrm{O}(p)$, where $\diag(d^2)$ denotes the $p \times p$ diagonal matrix whose $j$-th diagonal entry equals $d_j^2$, and $\mathrm{O}(p)$ is the set of $p$-dimensional orthogonal matrices
\[
    \mathrm{O}(p) = \{ V \in \R^{p \times p} : V^\top V = I_p\}.
\] 
The negative log-likelihood minimization problem is further equivalent to
\[
    \begin{array}{cl}
        \min & \sum_{j=1}^p \log d_j + \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m))  \\
        \st & ~ d \in \R_+^p, ~V \in \mathrm{O}(p),~\m \in \R^p  \\
        & 
        \| \m - \wh x \|_2^2 + \sum_{j=1}^p (d_j - \sigma)^2 \leq \eps_0^2 \\
        & d \ge \sigma,
    \end{array}
\]
where $d \ge \sigma$ implies the element-wise constraints $d_j \ge \sigma$ for any $j = 1, \ldots, p$.
We introduce an auxiliary variable $a \in \R_+$ and rewrite the optimization problem in an equivalent way as
\[
\min_{\substack{a \in \R_+,~d \in \R_+^p, ~ d \ge \sigma  \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_0^2} } ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~ \min_{V \in \mathrm{O}(p)}~ \sum_{j=1}^p \log d_j + \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m)). 
\]
Notice that the above optimization problem is invariant to the ordering of the entries of $d$.
As a consequence, without any loss of generality, we can assume that $d_p$ is the maximum value across all $d_j$. By Lemma~\ref{lemma:optimal_V}, the above optimization problem becomes
\[
\min_{\substack{a \in \R_+,~d \in \R_+^p, ~ d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_0^2\\ d_p = \max\{d\} }} ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~  \sum_{j=1}^p \log d_j + \frac{1}{2d_p^2} \|\x - \m\|_2^2. 
\]
Using Lemma~\ref{lemma:quadratic}, we obtain the equivalent optimization problem
\[
\min_{\substack{a \in \R_+,~d \in \R_+^p, ~ d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_0^2\\ d_p = \max\{d\} }} ~  \sum_{j=1}^p \log d_j + \frac{1}{2d_p^2} (\|\x - \wh x\|_2 - a)^2.
\]
We rewrite the above problem as the two-layer optimization problem
\begin{equation}
\label{eq:main_eq_max_ot}
\min_{\substack{a \in \R_+,~d_p \in \R_+, ~ d_p \ge \sigma \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \left\{\log d_p + \frac{1}{2d_p^2} (\|\x - \wh x\|_2 - a)^2 + \Min{\substack{d_j \in \R_+, ~ d_j \ge \sigma ~ \forall j=1, \ldots, p-1\\ \sum_{j=1}^{p-1} (d_j - \sigma)^2 \le \eps_0^2 - a^2 - (d_p - \sigma)^2\\
d_j \le d_p ~ \forall j=1, \ldots, p-1}} \sum_{j=1}^{p-1} \log d_j \right\}.
\end{equation}
Notice that for any $d_p$ that is feasible for the outer minimization problem, the inner minimization problem over $d_j$, $\forall j = 1, \ldots, p-1$ admits a non-empty feasible set. Indeed, because $d_p \ge \sigma$, the value $d_j = \sigma$, $j = 1, \ldots, p-1$ is a feasible solution for the inner problem. We now focus on solving the inner minimization problem. As $\log (\cdot)$ is an increasing function, for any $s \ge 0$, we find
\[
\min_{\substack{d_p\ge d_j \ge \sigma ~ \forall j=1, \ldots, p-1 \\ \sum_{j=1}^{p-1} (d_j - \sigma)^2 \le s}}~\sum_{j=1}^{p-1} \log d_j = (p-1) \log \sigma,
\]
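 which holds because the optimization problem on the left-hand side admits the optimal solution $d\opt_j = \sigma$ for all $j = 1, \dots, p-1$. Substituting this optimal value into~\eqref{eq:main_eq_max_ot} yields the two-dimensional problem in the statement, which completes the proof.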
\end{proof}

\newtheorem*{prop:min-Wass}{Proposition~\ref{prop:min-Wass} (re-stated)}
\begin{prop:min-Wass}
    %Suppose that $c$ is prescribed as in~\eqref{eq:c-Wass}. 
    Fix any index $i \in \mc I_1$. For any $\wh x_i \in \R^p$, $\x \in \R^p$ and $\eps_1 \in \R_+$, we have
    \[
        \frac{\exp(\alpha_i)}{(2\pi)^{p/2}} = \left\{ 
        \begin{array}{cl}
        \min & f(\x| \m_i, \cov_i) \\
        \st & (\m_i, \cov_i) \in \R^p \times \PDsigma^p \\
        & c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_1,
        \end{array}
        \right.
    \]
    where $\alpha_i$ is the optimal value of the two-dimensional optimization problem
    \begin{align*}
        \min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{(\|\x - \wh x_i\|_2 + a)^2}{2d_1^2} - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
    \end{align*}
\end{prop:min-Wass}
\begin{proof}[Proof of Proposition~\ref{prop:min-Wass}]
Let $\alpha_i$ be the optimal value of the log-likelihood \textit{minimization} problem
\[
    \alpha_i = \left\{ \begin{array}{cl}
        \min & -\half \log \det \Sigma_i - \half (\x - \m_i)^\top \cov_i^{-1} (\x - \m_i)  \\
        \st & \m_i \in \R^p,~\cov_i \in \PSD^p \\
        & \| \m_i - \wh x_i \|_2^2 + \Tr{\cov_i + \sigma^2 I - 2 \big( (\sigma^2 I)^{\half} \cov_i (\sigma^2 I)^{\half} \big)^{\frac{1}{2}} } \leq \eps_1^2 \\
        & \cov_i \succeq \sigma^2 I.
    \end{array}
    \right.
\]
It is easy to see that
\[
    \min\{ f(\x| \m_i, \cov_i) : (\m_i, \cov_i) \in \R^p \times \PDsigma^p,~c((\m_i, \cov_i), (\wh x_i, \sigma^2 I)) \le \eps_1\} =  \frac{1}{(2\pi)^{p/2}} \exp(\alpha_i).
\]
It remains to provide the computational routine to determine $\alpha_i$. To simplify the notation, we omit the index $i$ on all variables and parameters.
We reparameterize $\cov = V \diag(d^2) V^\top$ for a vector $d \in \R_+^p$ and an orthogonal matrix $V \in \mathrm{O}(p)$, where $\diag(d^2)$ denotes the $p \times p$ diagonal matrix whose $j$-th diagonal entry equals $d_j^2$, and $\mathrm{O}(p)$ is the set of $p$-dimensional orthogonal matrices
\[
    \mathrm{O}(p) = \{ V \in \R^{p \times p} : V^\top V = I_p\}.
\] 
The log-likelihood minimization problem is further equivalent to
\[
    \begin{array}{cl}
        \min & -\sum_{j=1}^p \log d_j - \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m))  \\
        \st & d \in \R_+^p,~V \in \mathrm{O}(p),~\m \in \R^p  \\
        & 
        \| \m - \wh x \|_2^2 + \sum_{j=1}^p (d_j - \sigma)^2 \leq \eps_1^2 \\
        & d \ge \sigma,
    \end{array}
\]
where $d \ge \sigma$ implies the element-wise constraints $d_j \ge \sigma$ for any $j = 1, \ldots, p$.
We introduce an auxiliary variable $a \in \R_+$ and rewrite the optimization problem in an equivalent way as
\[
\min_{\substack{a \in \R_+,~d \in \R_+^p, ~d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_1^2} } ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~ \min_{V \in \mathrm{O}(p)}~ -\sum_{j=1}^p \log d_j - \half (V^\top(\x - \m))^\top \diag(d^{-2}) (V^\top(\x - \m)). 
\]
Notice that the above optimization problem is invariant to the ordering of the entries of $d$.
As a consequence, without any loss of generality, we can assume that $d_1$ is the minimum value across all $d_j$. By Lemma~\ref{lemma:optimal_V}, the above optimization problem becomes
\[
\min_{\substack{a \in \R_+,~d \in \R_+^p,~ d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_1^2\\ d_1 = \min\{d\} }} ~ \min_{\substack{\m \in \R^p \\ \|\m - \wh x\|_2^2 = a^2}} ~  -\sum_{j=1}^p \log d_j - \frac{1}{2d_1^2} \|\x - \m\|_2^2. 
\]
Using Lemma~\ref{lemma:quadratic}, we obtain the equivalent optimization problem
\[
\min_{\substack{a \in \R_+,~d \in \R_+^p, ~d \ge \sigma \\ a^2 + \sum_j (d_j - \sigma)^2 \le \eps_1^2\\ d_1 = \min\{d\} }} ~  -\sum_{j=1}^p \log d_j - \frac{1}{2d_1^2} (\|\x - \wh x\|_2 + a)^2.
\]
Notice that the constraint $\sigma \le d_1 = \min\{d\}$ implies that $p(d_1 - \sigma)^2 \le \sum_j (d_j - \sigma)^2$. As a consequence, any feasible value for $d_1$ should satisfy $a^2 + p (d_1 - \sigma)^2 \le \eps_1^2$. Separating the variable $d$ into two groups $d_1$ and $d_2,\dots, d_p$ leads to a two-layer optimization problem 
\begin{equation}
\label{eq:main_eq_min_ot}
\min_{\substack{a \in \R_+,~d_1 \in \R_+,~ d_1 \ge \sigma \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2 }} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x\|_2 + a)^2 + \Min{\substack{d_j \in \R_+, ~ d_j \ge d_1 ~\forall j=2, \ldots, p\\ \sum_{j=2}^p (d_j - \sigma)^2 \le \eps_1^2 - a^2 - (d_1 - \sigma)^2 }} -\sum_{j=2}^{p} \log d_j \right\}.
\end{equation}
Consider momentarily the minimization problem 
\begin{equation*}
\Min{\substack{d_j \in \R_+ \quad \forall j = 2, \ldots, p \\ \sum_{j=2}^p (d_j - \sigma)^2 \le \eps_1^2 - a^2 - (d_1 - \sigma)^2 }} -\sum_{j=2}^{p} \log d_j,
\end{equation*}
where the constraints $d_j \ge d_1$ have been intentionally omitted. Proposition~\ref{prop:max-e} asserts that this optimization problem has the optimal value
\[
-(p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right)
\]
at the optimal solution $d\opt_j=\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p-1}}$, which also by the outer constraint $a^2 + p(d_1 - \sigma)^2 \le \eps_1^2$ satisfies $d_j \ge d_1 ~ \forall j=2,\dots,p$. Thus it is indeed the optimal solution to the inner minimization problem in~\eqref{eq:main_eq_min_ot}. As a consequence, problem~\eqref{eq:main_eq_min_ot} is equivalent to 
\[
\min_{\substack{a \in \R_+,~d_1 \in \R_+, ~d_1 \ge \sigma\\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x\|_2 + a)^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
\]
This completes the proof.
\end{proof}


\section{Auxiliary Results} 
\label{sec:app-aux}

The following preparatory results are necessary to prove Propositions~\ref{prop:max-Wass} and~\ref{prop:min-Wass}. 

\begin{lemma}[Eigenbasis solution] \label{lemma:optimal_V}
Let $E\in\R^{p\times p}$ be a diagonal matrix satisfying $E_{11}\le \cdots\le E_{pp}$. Then, for any $w \in \R^p$, we have
\begin{equation*}
    \Max{V \in \mathrm{O}(p)}~w^\top V E V^\top w = E_{pp} \|w\|_2^2.
\end{equation*}
\end{lemma}
\begin{proof}[Proof of Lemma~\ref{lemma:optimal_V}]
    The claim holds trivially when $w = 0$. Consider now any $w \in \R^p \backslash \{0\}$. Since $V E V^\top \preceq E_{pp}\cdot I_p$, we find
    \begin{align*}
        \Max{V \in \mathrm{O}(p)}~w^\top V E V^\top w \le \Max{V \in \mathrm{O}(p)}~w^\top V (E_{pp}\cdot I_p) V^\top w = E_{pp} \|w\|_2^2.
    \end{align*}
    On the other hand, taking $V\opt = [v_1\opt, \ldots, v_p\opt] \in \mathrm{O}(p)$ with $v_p\opt = \frac{w}{\|w\|_2}$, and using the orthogonality of the columns of $V\opt$, we have
    \[ w^\top V\opt E {V\opt}^\top w = E_{pp} \|w\|_2^2.  \]
    This shows that $V\opt$ is an optimal solution and completes the proof.
\end{proof}


\begin{lemma}[Quadratic optimization] \label{lemma:quadratic}
For any $\x \in \R^p$, $\wh x \in \R^p$ and $a \in \R_+$, the following assertions hold.
\begin{itemize}
    \item Convex quadratic minimization:
    \[
        \Min{\m \in \R^p: \| \m - \wh x \|_2^2 = a^2}~ \| \x - \m \|_2^2 =  (\| \x - \wh x \|_2 - a)^2,
    \]
    where the minimum is attained at $\m\opt = \frac{a}{\| \x - \wh x \|_2}\x + (1 - \frac{a}{\| \x - \wh x \|_2})\wh x$.
    \item Convex quadratic maximization:
    \[
        \Max{\m \in \R^p: \| \m - \wh x \|_2^2 = a^2}~ \| \x - \m \|_2^2 =  (\| \x - \wh x \|_2 + a)^2,
    \]
    where the maximum is attained at $\mu\opt = -\frac{a}{\|\x - \wh x\|_2}\x + (1 + \frac{a}{\|\x - \wh x\|_2})\wh x$.
\end{itemize}
\end{lemma}
The results in Lemma~\ref{lemma:quadratic} are dispersed in the literature. An elementary proof is provided here for completeness.
\begin{proof}[Proof of Lemma~\ref{lemma:quadratic}]
By the triangle inequality, for any $\m$ such that $\| \m - \wh x \|_2 = a$, we have
\[ \| \x - \m \|_2 \ge \left| \| \x - \wh x \|_2 - \| \m - \wh x \|_2 
\right| = \left| \| \x - \wh x \|_2 - a \right|, \]
where the lower bound can be attained by taking $\m = \frac{a}{\| \x - \wh x \|_2}\x + (1 - \frac{a}{\| \x - \wh x \|_2})\wh x$. Therefore, 
\[\Min{\m \in \R^p: \| \m - \wh x \|_2^2 = a^2}~ \| \x - \m \|_2^2 =  (\| \x - \wh x \|_2 - a)^2.\]
Similarly, by the triangle inequality we have
\[ \| \x - \m \|_2 \le \| \x - \wh x \|_2 + \| \wh x - \m \|_2 = \| \x - \wh x \|_2 + a, \]
and the upper bound can be attained by $\mu = -\frac{a}{\|\x - \wh x \|_2}\x + (1 + \frac{a}{\|\x - \wh x \|_2})\wh x$. This completes the proof.
\end{proof}

\begin{proposition}[Logarithm maximization] \label{prop:max-e}
For any $s ,\sigma\ge 0$ and positive integer $k$, we have
\begin{equation}\label{opt:sum_log_max_subproblem}
    k \log\left( \sqrt{\frac{s}{k}} + \sigma \right) = \left\{
    \begin{array}{cl}
        \displaystyle\max_{e\in\R_+^{k}} & \displaystyle\sum_{j=1}^{k} \log e_j  \\
        \st & \displaystyle\sum_{j = 1}^{k} (\sigma - e_j)^2 \le s.
    \end{array}
    \right.
\end{equation}
Moreover, the optimal solution $e\opt$ satisfies $e\opt_j = \sqrt{\frac{s}{k}} + \sigma$ for any  $j = 1, \ldots, k$.
\end{proposition}
\begin{proof}[Proof of Proposition~\ref{prop:max-e}]
Let $e\opt \in \R^{k}_+$ be an optimal solution to the maximization problem~\eqref{opt:sum_log_max_subproblem}. Suppose there exist two indices $m$ and $n$ such that $e\opt_m \neq e\opt_n$. Consider $e'$ defined by
\[
e'_j = 
\begin{cases}
\half(e\opt_m + e\opt_n), & \text{if } j \in \{m,n\},\\
e\opt_j, &\text{otherwise}.
\end{cases}
\]
By the convexity of the function $x\mapsto (x-\sigma)^2$,
\[  \left(e'_m - \sigma\right)^2 + \left(e'_n - \sigma\right)^2= 2\left(\frac{e\opt_m + e\opt_n}{2} - \sigma \right)^2 \le (e\opt_m - \sigma)^2 + (e\opt_n - \sigma)^2 , \]
which implies that $e'$ is a feasible solution to problem~\eqref{opt:sum_log_max_subproblem}.
Furthermore, since $e\opt_m \neq e\opt_n$, by the concavity of the function $x\mapsto \log x$, we have that 
\[\log e\opt_m + \log e\opt_n < 2\log \left(\frac{e\opt_m + e\opt_n}{2}\right) = \log e'_m + \log e'_n,\]
which violates the optimality of $e\opt$. Therefore, any optimal solution $e\opt$ must have all entries identical. Using this, we get from the constraint that
\[ |e\opt_j - \sigma| \le \sqrt{\frac{s}{k}}\quad\forall j = 1,\dots, k. \] 
By continuity of the objective and constraint functions, we must have
\[ |e\opt_j - \sigma| = \sqrt{\frac{s}{k}}\quad\forall j = 1,\dots, k. \] 
Since the objective function is increasing in $e\opt_j$, the optimal solution is given by
\[ e\opt_j = \sigma + \sqrt{\frac{s}{k}} \quad\forall j = 1,\dots, k.\]
The optimal value can then be obtained by direct computation. This completes the proof.
\end{proof}


\section{First-Order Algorithms} \label{sec:app:foa}

\subsection{Optimistic Likelihood Problem} \label{sec:app:opp}

For the optimistic likelihood problem, Theorem~\ref{thm:max} reduces the task to solving the 2-dimensional problem
 \[
    \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
    \]
By letting 
\[
    d_p = v_2 + \sigma, \quad \text{and} \quad a = v_1,
\]
we can obtain the equivalent form
\be \label{eq:newprob}
    \min_{\substack{v_1, v_2 \ge 0 \\ v_1^2 + v_2^2 \le \eps_0^2 }} ~ F(v),
\ee
where the objective function is given by
\[
F(v) = \log (v_2 + \sigma) + \frac{(\|\x - \wh x_i\|_2 - v_1)^2}{2(v_2 + \sigma)^2}  + (p-1) \log \sigma.
\]
If we denote by $\mathcal V = \{ v \in \R^2: v_1, v_2 \ge 0, v_1^2 + v_2^2 \le \eps_0^2 \}$ the feasible region of the above minimization problem, then the projection $\Proj_{\mathcal{V}}(v)$ can be computed in closed-form via
\begin{equation*}
    \Proj_{\mathcal{V}}(v) = \begin{cases}
    v, &\text{if } v_1, v_2\ge 0, v_1^2 + v_2^2 \le \eps_0^2,\\
    \frac{\eps_0}{\|v\|_2}v, & \text{if } v_1, v_2\ge 0, v_1^2 + v_2^2 > \eps_0^2,\\
    (0,\eps_0)^\top, & \text{if } v_1 < 0, v_2 > \eps_0,\\
    (0,v_2)^\top, & \text{if } v_1 < 0, 0\le v_2 \le \eps_0,\\
    (\eps_0, 0)^\top, & \text{if } v_1 > \eps_0, v_2 < 0,\\
    (v_1,0)^\top, & \text{if } 0\le v_1 \le \eps_0, v_2 < 0,\\
    (0,0)^\top, &\text{if } v_1, v_2<0.
    \end{cases}
\end{equation*}
% \viet{The above projection operator can be re-expressed in two steps:
% \[
%     \Proj_{\mathcal{V}}(v) = \Proj_{\mc B_{\eps_0}}(\Proj_{\R_+^2}(v)),
% \]
% which first projects onto $\R_+^2$, then projects onto the ball of radius $\eps_0$ around the origin.
% \[
% \Proj_{\mc B_{\eps_0}}(v) = \min\{1, \frac{\eps_0}{\|v\|_2}\} v =\eps_0 \times \min\{\frac{1}{\eps_0}, \frac{1}{\| v\|_2}\} v = \eps_0 \times \frac{1}{\max\{\eps_0, \|v\|_2\}} v.
% \]}
Algorithm~\ref{alg:pgd} is a projected gradient descent routine to solve problem~\eqref{eq:newprob}. The convergence guarantee for Algorithm~\ref{alg:pgd} follows from \citet[Theorem~10.15]{beck2017first}.


\begin{algorithm}[h]
	\caption{Projected Gradient Descent Algorithm with Backtracking Line-Search}
	\label{alg:pgd}
	\begin{algorithmic}
		%\STATE {\bfseries Input:} Sample $\x \in \R^p$, Radius $\eps \in \R_+$, mean value $\msa \in \R^p$, smoothing variance $\sigma \in \R_+$, $\theta\in (0,1)$, $\beta>0$ 
		\STATE {\bfseries Algorithm parameters:} Line search parameters $\theta\in (0,1)$, $\beta>0$ 
		\STATE {\bfseries Initialization:} Set $ v^0 \leftarrow 0$
        \FOR{$t = 0, 1, \ldots$}
            \STATE Find the smallest integer $k\ge 0$ such that 
            \begin{align*}
                & F\left( \Proj_{\mathcal{V}} (v^t - \theta^k \beta \nabla F(v^t)) \right) \le F(v^t )  - \frac{1}{2 \theta^k \beta} \| v^t - \Proj_{\mathcal{V}} (v^t - \theta^k \beta \nabla F(v^t)) \|_2^2
            \end{align*}
            \STATE Set $s^t = \theta^k \beta$ and set $v^{t+1} = \Proj_{\mc V}(v^t - s^t \nabla F(v^t))$.
        \ENDFOR
		%\STATE{\bfseries Output:} $u^T$.
	\end{algorithmic}
\end{algorithm}





\subsection{Pessimistic Likelihood Problem} \label{sec:app:pes}

For the pessimistic likelihood problem, Theorem~\ref{thm:min} reduces the task to solving the 2-dimensional problem
\[
\min_{\substack{a \in \R_+,~d_1 \in [ \sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x_i\|_2 + a)^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
\]
Note that the gradient of the objective function is not Lipschitz continuous. Worse still, the gradient is even undefined at the feasible point $(d_1, a) = (\sigma, \eps_1)$. These properties induce numerical issues for the optimization algorithm. Therefore, we solve the following perturbed problem instead:
\begin{equation}\label{opt:max-KL-perturbed}
    \min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{1}{2d_1^2} (\|\x - \wh x_i\|_2 + a)^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\zeta + \eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\},
\end{equation}
for some small $\zeta >0$. By \citet[Proposition~4.4]{ref:bonnans2013perturbation}, the optimal value of problem~\eqref{opt:max-KL-perturbed} is continuous in $\zeta$ and the optimal solution set is upper semi-continuous in $\zeta$ as a set-valued mapping, see \citet[Section~4.1]{ref:bonnans2013perturbation}.

We now derive a projected gradient descent algorithm with backtracking line search for solving problem~\eqref{opt:max-KL-perturbed}. First, by letting
\[d_1 = u_2 + \sigma, \quad \text{and} \quad a = \sqrt{p}u_1,\] 
we can equivalently transform problem~\eqref{opt:max-KL-perturbed} to the following one:
\begin{equation}\label{opt:max-KL-u}
    \min_{\substack{u_1,u_2\ge 0\\ u_1^2 + u_2^2 \le (\eps_1/\sqrt{p})^2}} ~ F(u), 
\end{equation}
where the objective function is given by
\[
F(u) = -\log (u_2 + \sigma) - \frac{1}{2(u_2 + \sigma)^2} (\|\x - \wh x_i\|_2 + \sqrt{p} u_1 )^2 - (p-1) \log \left(\sigma + \sqrt{\frac{\zeta + \eps_1^2 - p u_1^2 - u_2^2}{p - 1}}\right) .
\]
The upshot of problem~\eqref{opt:max-KL-u} is that the feasible region is the intersection of the non-negative orthant with a circular disk of radius $\eps_1/\sqrt{p}$ centered at the origin. As we will see below, this enables easy computation of the projection and linear optimization oracle. 
Indeed, denoting by $\mathcal{U} = \{ u\in\R^2: u_1, u_2\ge 0, u_1^2 + u_2^2 \le (\eps_1/\sqrt{p})^2 \}$ the feasible region of problem~\eqref{opt:max-KL-u}, the projection $\Proj_{\mathcal{U}}(u)$ can be computed in closed-form via
\begin{equation*}
    \Proj_{\mathcal{U}}(u) = \begin{cases}
    u, &\text{if } u_1, u_2\ge 0, u_1^2 + u_2^2 \le (\eps_1/\sqrt{p})^2,\\
    \frac{(\eps_1 / \sqrt{p})}{\|u\|_2}u, & \text{if } u_1, u_2\ge 0, u_1^2 + u_2^2 > (\eps_1/\sqrt{p})^2,\\
    (0,\frac{\eps_1}{\sqrt{p}})^\top, & \text{if } u_1 < 0, u_2 > \frac{\eps_1}{\sqrt{p}},\\
    (0,u_2)^\top, & \text{if } u_1 < 0, 0\le u_2 \le \frac{\eps_1}{\sqrt{p}},\\
    (\frac{\eps_1}{\sqrt{p}}, 0)^\top, & \text{if } u_1 > \frac{\eps_1}{\sqrt{p}}, u_2 < 0,\\
    (u_1,0)^\top, & \text{if } 0\le u_1 \le \frac{\eps_1}{\sqrt{p}}, u_2 < 0,\\
    (0,0)^\top, &\text{if } u_1,u_2<0.
    \end{cases}
\end{equation*}
A projected gradient descent algorithm can now be employed to solve problem~\eqref{opt:max-KL-u}.

\section{Recovery of the Adversarial Distribution} \label{sec:recovery}

It is often instructive to recover and analyze the optimal distribution that maximizes the posterior probability odds ratio, or more directly, the likelihood ratio in~\eqref{eq:dro}. Equivalently, it suffices to characterize the distribution $\QQ_0\opt$ that maximizes~\eqref{eq:likelihood-max}, and the distribution $\QQ_1\opt$ that minimizes~\eqref{eq:likelihood-min}.

\begin{lemma}[Likelihood maximizer] \label{lemma:max-dist}
    For each $i \in \mc I_0$, let $(a_i\opt, d_{pi}\opt)$ be the optimal solution of the following two-dimensional optimization problem
     \[
    \min_{\substack{a \in \R_+,~d_p \in [\sigma, +\infty) \\ a^2 + (d_p - \sigma)^2 \le \eps_0^2 }} ~ \log d_p + \frac{(\|\x - \wh x_i\|_2 - a)^2}{2d_p^2}  + (p-1) \log \sigma.
    \]
    Then, the maximizer $\QQ_0\opt$ of problem~\eqref{eq:likelihood-max} is a Gaussian mixture with $N_0$ components, and for $i \in \mc I_0$, the $i$-th component has mean
    \[
        \m_i\opt =  \frac{a_i\opt}{\|x - \wh x_i \|_2 } x + \left(1 - \frac{a_i\opt}{\|x - \wh x_i \|_2 }\right) \wh x_i, 
    \]
    and covariance matrix 
    \[ \cov_i\opt = V_i\opt \diag( \sigma, \dots, \sigma, d_{pi}\opt )^2 (V_i\opt)^\top,\]
    where $V_i\opt$ is any orthogonal matrix with the $p$-th column given by $\frac{x - \m_i\opt}{\|x - \m_i\opt\|_2}$.
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:max-dist}]
The result follows directly by inspecting the proofs of Proposition~\ref{prop:max-Wass}, Lemma~\ref{lemma:optimal_V} and Lemma~\ref{lemma:quadratic}.
\end{proof}


\begin{lemma}[Likelihood minimizer] \label{lemma:min-dist}
    For each $i \in \mc I_1$, let $(a_i\opt, d_{1i}\opt)$ be the optimal solution of the following two-dimensional optimization problem
    \begin{align*}
    &\min_{\substack{a \in \R_+,~d_1 \in [\sigma, +\infty) \\ a^2 + p(d_1 - \sigma)^2 \le \eps_1^2}} ~ \left\{-\log d_1 - \frac{(\|\x - \wh x_i\|_2 + a)^2}{2d_1^2} - (p-1) \log \left(\sigma + \sqrt{\frac{\eps_1^2 - a^2 - (d_1 - \sigma)^2}{p - 1}}\right) \right\}.
    \end{align*}
    Then, the minimizer $\QQ_1\opt$ of problem~\eqref{eq:likelihood-min} is a Gaussian mixture with $N_1$ components, and for $i \in \mc I_1$, the $i$-th component has mean
    \[
        \m_i\opt =  - \frac{a_i\opt}{\|x - \wh x_i \|_2 } x + \left(1 + \frac{a_i\opt}{\|x - \wh x_i \|_2 }\right) \wh x_i, 
    \]
    and covariance matrix 
    \[ \cov_i\opt = V_i\opt \diag\left( d_{1i}\opt, \sigma + \sqrt{\frac{\eps_1^2 - {a_i\opt}^2 - (d_{1i}\opt - \sigma)^2}{p-1}}, \dots, \sigma + \sqrt{\frac{\eps_1^2 - {a_i\opt}^2 - (d_{1i}\opt - \sigma)^2}{p-1}} \right)^2 (V_i\opt)^\top,\]
    where $V_i\opt$ is any orthogonal matrix with the $1$st column given by $\frac{x - \m_i\opt}{\|x - \m_i\opt\|_2}$.
\end{lemma}
\begin{proof}[Proof of Lemma~\ref{lemma:min-dist}]
The result follows directly by inspecting the proofs of Proposition~\ref{prop:min-Wass}, Lemma~\ref{lemma:optimal_V} and Lemma~\ref{lemma:quadratic}.
\end{proof}
\begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{image/illus_3.png}
    \caption{Visualization of the worst-case distributions on a toy dataset, color codes are similar to Figure~\ref{fig:illustration}. The dashed, opaque dots and circles represent the isotropic Gaussian around each data sample. The solid dots and circles represent the worst-case distributions corresponding to the boundary point $x^b$. For blue (unfavorably predicted) samples, the worst-case distribution is formed by perturbing the distribution towards $x^b$ -- which leads to maximizing the posterior probability of unfavorable prediction. For green (favorably predicted) samples, the worst-case distribution is formed by perturbing the distribution away from $x^b$ -- which leads to minimizing the posterior probability of favorable prediction. These worst-case distributions will maximize the posterior probability odds ratio.}
    \label{fig:illus3}
\end{figure}

\clearpage
\bibliography{bibliography}

\end{document}
