% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsthm}
\usepackage{comment}
% \usepackage[british]{babel}
% Theorem-like environments (amsthm).
% `theorem' owns the counter and is numbered within each section; every other
% environment below passes [theorem] and therefore shares that counter, so
% theorems, lemmas, definitions, etc. are numbered in one sequence per section.
% NOTE: the \theoremstyle switches are commented out, so all environments use
% whichever style is active by default (amsthm's default is `plain').
% Uncomment the switches to give definitions/remarks their own styles.
% \theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{amsfonts}
\usepackage{subcaption}

\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: typesets the two mandatory arguments in reverse order with
% the optional separator between them (default separator: `-').
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \Pind: blackboard-bold P with subscript `ind'; the text uses this notation
% for the distribution over individuals.
\newcommand{\Pind}{\mathbb{P}_{\text{ind}}}
% \PP: blackboard-bold P; the text uses this notation for the data
% distribution (subscripts are added at the use site).
\newcommand{\PP}{\mathbb{P}}
% \PN: hatted blackboard-bold P; the text uses this notation for the
% empirical counterpart of \PP.
\newcommand{\PN}{\hat{\mathbb{P}}}

% \E: blackboard-bold E (expectation symbol).
\newcommand{\E}{\mathbb{E}}

\title{Decision-Focused Evaluation of Worst-Case Distribution Shift}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<kevinren@andrew.cmu.edu>?Subject=Your UAI 2024 paper}{Kevin Ren}{}}
\author[2]{Yewon Byun}
\author[2]{Bryan Wilder}
% Add affiliations after the authors
\affil[1]{%
    Statistics and Data Science Dept.\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Machine Learning Dept.\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
  
  \begin{document}
\maketitle

\begin{abstract}
Recent studies have shown that performance on downstream optimization tasks often diverges from standard accuracy-based losses, highlighting that the loss function of a predictive model should align with the decision task of the downstream optimizer \citep{Wilder1, Elmachtoub}. Despite this observation, no work---to our knowledge---has yet examined the impact of this divergence for distribution shift. In this paper, we demonstrate that worst-case distribution shifts identified by traditional average accuracy-based metrics fundamentally differ from those for the downstream decision task at hand. We introduce a novel framework that employs a hierarchical model structure to identify worst-case distribution shifts in predictive resource allocation settings. This task is more difficult than in standard distribution shift settings because of combinatorial interactions, where decisions depend on the \textit{joint} presence of individuals in the allocation task. We show that the problem can be reformulated as a submodular optimization problem, enabling efficient approximations, to capture shifts both within and across instances of the optimization problem.
We apply our solution to real-world datasets in public service settings, providing empirical evidence that worst-case shifts for one metric often significantly diverge from worst-case distributions for other metrics.


\end{abstract}

\section{Introduction}\label{sec:intro}
In many real-world prediction settings, machine learning algorithms frequently encounter performance degradation due to distribution shifts, which are characterized by statistical differences between the data encountered during deployment and the data used in training \citep{quinonero2008dataset, zech2018variable, koh2021wilds}. In particular, we are motivated by resource allocation settings, in which predictions are used to prioritize individuals within a given decision problem to receive a scarce resource. Here, performance drops in unseen populations can lead to both inefficient and inequitable allocation policies, whether they be potentially life-saving treatments for a disease or public service programs to mitigate risks such as unemployment \citep{Singh, Roland}.


Developing methods for estimating worst-case distribution shifts
\citep{ subbaswamy2021evaluating, li2021evaluating, thams2022evaluating, huang2022two} is crucial to help practitioners identify problems ahead of time, and either mitigate them or re-evaluate whether to proceed with deployment. Similarly, previous works in the distributionally robust optimization (DRO) literature seek to minimize worst-case loss over a feasible set of distributions \citep{Duchi}. However, these methods universally focus on identifying shifts under which the model suffers a loss in average accuracy, as measured by traditional loss functions. The key motivation for our work is that the worst-case shifts identified by such individual-level processes may \emph{not} coincide with the worst-case shifts for decisions that require decision-focused optimization over an entire group of individuals, which introduces specific objectives and constraints: a model may be more robust than expected if errors do not flip the optimal decision, and conversely less robust if decisions are sensitive to small errors.

As a simple example, consider a decision maker (e.g., the operator of an emergency room) deciding whether or not to administer a limited treatment to a population consisting of two types of individuals (people seeking treatment). For the first type, their outcomes are very noisy; however, they virtually never require intensive treatment. Thus, all decisions are equally good. For the second type, we can predict outcomes with moderate accuracy, but there is substantial variation in treatment needs. A traditional DRO-style algorithm, identifying worst-case shift with respect to an individual-level loss, will typically concentrate more probability on individuals of the first type because predictive accuracy is worse for them---this algorithm would solely seek to maximize worst-case $\alpha$-tail performance in the dataset. However, from a resource allocation perspective, a population composed largely of the first type does not impose difficult trade-offs. This is due to the fact that demand for the resource is low; thus, even uninformed predictions will lead to near-optimal decisions. Conversely, a population composed of the second type of individuals may be much more challenging, even if the model is more accurate per-individual, since finer distinctions must be made when weighing treatment costs and predicted benefits. This matches the intuition behind recent interest in predict-then-optimize settings, where many studies have shown that the loss function of a predictive model ought to be tuned to reflect the decision task undertaken by the downstream optimizer \citep{Wilder1, Elmachtoub, vanderschueren2022predict, mandi2020smart}. However, despite the understanding that performance on downstream optimization tasks often diverges from standard accuracy-based losses, no work---to the best of our knowledge---has examined the consequences of this divergence for distribution shift.

Capturing the population-level dependencies of resource allocation tasks necessitates a new approach to modeling potential distribution shifts. Standard approaches model individuals as iid and consider perturbations of the marginal probability associated with each individual. However, the arrivals of different individuals are often not plausibly independent. For example, consider an emergency department attempting to triage patients. Due to factors like seasonality in the frequency of many medical conditions, the arrivals of different kinds of individuals are in fact correlated, e.g., seeing one patient with respiratory illness makes it more likely that many other patients with respiratory illness will arrive that day. Moreover, triage decisions are made jointly over the entire set of individuals for each day, not marginally for each individual. Formally, decisions are made on the level of optimization instances that consist of many individuals. Since decisions depend on the joint set of individuals present, we must be able to model shifts in the entire joint distribution, not just the marginal probability of each individual.

% Furthermore, this DRO-style method of representing distribution shift, in which all individuals from whom we have data are considered simultaneously in the same optimization problem, may violate i.i.d. sampling assumptions during the process of worst-case optimization. In many allocation settings, the data may be composed of individuals belonging to heterogenous and discrete groups that should not be considered in the same allocation problem.

% For example, a predictive model used to allocate medical resource to COVID patients, trained on historical patient data, will likely take into account patients who came in for treatment from a wide variety of dates. A DRO-style solution would seek to optimize a loss metric among all individuals simultanously: regardless of when a given patient came in for treatment, if the loss associated with that patient is high, then that patient will be upweighted in the DRO optimization algorithm. Considering one patient from several years ago alongside a patient who came in very recently, in the same allocation problem, could lead to inaccurate representations of worst-case shift due to distribution shift (i.e., temporal shift) once the model is deployed into the real world. Specifically, sampling between individuals who are temporally far apart i.i.d., despite the fact that these individuals may represent two distinct distributions of COVID patients (e.g. these two patients came in during two different waves of COVID). These \textit{optimization instances} of individuals, comprised of partitions of individuals into heterogenous groups from which we may sample i.i.d., are necessitated by our problem setting.

To address this, we use a hierarchical model for the data generating distribution to capture the optimization instance-style of modeling real-world allocation tasks. More precisely, we model the task of estimating worst-case allocation outcomes as an optimization problem over a hierarchical model, where shift can take place between optimization instances, as well as between the individuals belonging to each optimization instance. This task is significantly more difficult than in standard distribution shift settings because of combinatorial interactions, where decisions depend on the \textit{joint} presence of individuals in the allocation task. We show that, by reformulating the aforementioned optimization problem as a submodular maximization task, we are also able to address the complexity of combinatorial interactions.

To summarize, our contributions are as follows:
\begin{enumerate}
    \item We introduce a novel framework that employs a hierarchical model structure to identify worst-case distribution shifts in predictive allocation settings. We show that the problem can be reformulated as a submodular optimization problem, enabling efficient approximations. This captures shifts both within and across instances of the optimization problem and addresses the complexity of combinatorial interactions.
    \item In real world predictive allocation settings (e.g., public service data), we empirically show that worst-case shifts substantially differ from those estimated by traditional methods (i.e., metrics that focus on individual-level accuracy).
\end{enumerate}

Our findings highlight that in order to \emph{safely} build and assess ML systems for high-stakes allocation settings, systems must account for the decision task at hand to avoid over-estimating their robustness to distribution shift.

\section{Problem Setup}

\textbf{Predictive Modeling with Downstream Allocation.} We address scenarios in which a decision maker must allocate a limited resource within cohorts of individuals (e.g., an emergency department that must triage individuals who arrive every day). We model this process as a distribution over \textit{instances} of the allocation problem, where each instance is composed of individuals $i$ with their own features $x_i \in \mathbb{R}^{d_1}$ and outcome $y_i \in \mathbb{R}^{d_2}$. Let $X$ be the feature matrix that collects the feature vectors of each individual in a given instance and $Y$ the corresponding outcome matrix. Let $\mathbb{P}$ be the distribution over instances, i.e., $X, Y \sim \mathbb{P}$. We will also sometimes need to refer to the marginal distribution that $\mathbb{P}$ induces over individuals, denoted as $\mathbb{P}_{\text{ind}}$.  The decision maker observes training data $\{X_j, Y_j\}_{j = 1}^k$, where instance $j$ contains $n_j$ individuals. They select a predictive model $m$ which outputs a prediction $m(X)$ based on the features of each instance. Typical architectures accomplish this by making a separate prediction $m(x_i)$ for each $x_i \in X$ and aggregating the results, but we do not assume this. 

The decision maker uses the predictions made by $m$ to solve an optimization problem that models the constrained resource allocation, resulting in an allocation vector $Z$. The goal of this problem is to maximize an objective function $f$ which depends both on the decision $Z$ and on the (unknown) labels $Y$ over a feasible set $\Phi$. Given the predictions $\hat{Y} = m(X)$, the corresponding decision is
\begin{align*}
    Z^*(\hat{Y}) = \arg\max_{Z \in \Phi} f(Z, \hat{Y}). 
\end{align*}
We define the \textit{decision loss} on a given instance to be the regret relative to the decision that would be made if the true $Y$ were known:
\begin{align*}
    DL(Y, \hat{Y}) = g(f(Z^*(Y), Y), f(Z^*(\hat{Y}), Y)).
\end{align*}

Common choices for $g$ may include subtraction (regret) or division (relative regret).

The goal for our model is to do well in expectation over $\mathbb{P}$, minimizing  $\mathbb{E}_{X, Y \sim \mathbb{P}}[DL(Y, m(X))]$. 

% amongst a sample of individuals drawn from some underlying population $\mathbb{P}$. Formally, let $\{x_j, y_j\}_{j=1}^{N} \sim \mathbb{P}$, where $x_i$ is a feature vector, $y_i$ is a label representing the reward of allocating to the $i$th individual, and $(x_i, y_i)$ represents the available data for an individual. Let $m$ be a predictive model and $\hat{c} = m(x)$ be its output, the predicted benefit vector over individuals in the sample. Given a constraint set $\Phi$ over the permissible decision vectors, we are interested in the following optimization program to obtain optimal decisions $z^*$ based on some real valued function $f$ from the predicted vector $\hat{c}$:


% Formally, let $\{x_j, y_j\}_{j=1}^{N} \sim S \sim \{S_1, \cdots , S_k\}$, where $x_i$ is a feature vector, $y_i$ is a label representing the reward of allocating to the $i$th individual, $(x_i, y_i)$ represents the available data for an individual, and $S$ represents an optimization instance of interest out of $k$ optimization instances. Let $m$ be a predictive model and $\hat{c} = m(x)$ be its output, the predicted propensity vector. Given a constraint set $\Phi$ over the allowed decision vectors, we are interested in the following optimization program to obtain optimal decisions $z^*$ based on some real valued function $f$ from the predicted vector $\hat{c}$:

% \begin{align*}
%     z^{*} (\hat{c}) = \text{argmax}_{w \in \Phi} f(w; \hat{c})
% \end{align*}

% Finally, the decisions are evaluated using the \textit{decision loss} (DL), i.e, the decisions are compared against the induced decision by the ground truth cost:
% \begin{align*}
%     DL( z^{*} (\hat{c}), y) = \ell(z^{*} (\hat{c}), y)
% \end{align*}

\textbf{Identifying worst-case distribution shifts.} We consider the common setting that the distribution $\mathbb{P}$ over instances of the allocation problem may differ in deployment, compared to what the model has encountered during training. Specifically, we consider the challenge of identifying the worst-case distribution shift for allocation performance within a constrained set parameterized by the total size of the shift allowed. Let $\Theta$ denote such a set of potential distributions. Our objective is to find 
\begin{align*}
\arg\max_{P \in \Theta} \mathbb{E}_{X, Y \sim P} [DL(Y, m(X))].
\end{align*}
Identification of such worst-case distributions is a common objective in order to allow model practitioners to understand and ameliorate potential failures in deployment \citep{pfohl2022comparison, subbaswamy2021evaluating, li2021evaluating, thams2022evaluating, huang2022two}. Previous work considers this problem for standard loss functions, which are \textit{separable} across individuals, i.e., which can be written in the form $\mathbb{E}_{x_i, y_i \sim \mathbb{P}_{\text{ind}}}[\ell(m(x_i), y_i)]$ for some individual-level loss $\ell$ with an expectation taken over the marginal distribution over individuals $\mathbb{P}_{\text{ind}}$. For instance, $\ell$ might be the mean squared error or cross-entropy loss. The key motivation for our work is that the worst-case shifts identified by an individual-level process may \textit{not} coincide with the worst-case shifts for the instance-level decisions. 

% As a simple example, suppose that our population consists of two types of individuals. For the first type, their outcomes are very noisy; however, they virtually never require intensive treatment and so all decisions are equally good. For the second type, we can predict outcomes with moderate accuracy but there is substantial variation in treatment needs. The worst-case shift with respect to an individual-level loss $\ell$ will typically concentrate more probability on individuals of the first type because our predictive accuracy is worst for them. However, from a resource allocation perspective, a population composed largely of the first type does not actually impose difficult tradeoffs because demand for the resource is low and so even an uninformed prediction will lead to near-optimal decisions. However, a population composed of the second type may be much more challenging even if there the model is more accurate per-individual because much finer distinctions must be made.   

Modeling and solving the worst shift identification problem for predictive resource allocation requires us to address two new challenges that are not present for previous work at the individual level. First, we must provide a parameterized family of distribution shifts over instances, which are composed of \textit{sets} of individuals. Second, we must develop algorithms to solve the resulting optimization problem over distributions, a task which turns out to be considerably more challenging because of the associated combinatorial structure where the impact of one individual on the loss depends on the presence of other individuals in the set.

\textbf{Additional Related Work.} There is a large body of literature that develops real-world resource allocation models, as well as criticisms of their shortcomings when exposed to distribution shift \citep{verma2023increasing, athey2023machine, wang2022against, schultz2019risk}. Our work can be seen as offering a precise way to operationalize and test for distribution shift concerns before deployment in such settings. Also related is work in statistics on learning robust individual treatment rules. For example, \citet{mo2021learning} devise a set of methods to obtain distributionally robust treatment allocation policies given covariate shift. Our work differs in considering the consequences of joint optimization over a population of individuals, and is the first to account for downstream optimization in assessing worst-case distribution shift.

% \cite{luedtke2016optimal} proposes a method for determining optimal treatment allocation methods in settings where a limited fraction of a sample may receive treatment, focusing on in-distribution estimation of covariate parameters to do so. 

There is an extensive literature devoted to distribution shift in typical ML settings, as opposed to the resource allocation problems that are our focus. Our work is closest to the challenge of diagnosing worst-case shifts \citep{li2021evaluating, subbaswamy2021evaluating, thams2022evaluating}. There is also a great deal of work devoted to training models that are robust to distribution shift via methods like Distributionally Robust Optimization (DRO) \citep{Duchi, rahimian2019distributionally, levy2020large}, and we build on some techniques from this literature. See Appendix~\ref{sec:related} for further related work.

% This might occur for two reasons. First, the decision loss $DL$ can only be represented at the instance level, and the literature on predict-then-optimize problems highlights how it may be misaligned with common individual-level losses $\ell$ which measure predictive accuracy as opposed to impact on decisions. Second,  


% In real-world treatment allocation settings, it is rare that the process of allocation takes place in a global or universal setting where all individuals are considered for all decision tasks. More commonly we need only consider individuals sampled from some discrete partitioning of the population into optimization instances. For instance, the allocation of a social assistance program may be partitioned by state; here individuals are likely to be compared against others in the same geographic setting for the limited resource at hand. By De Finetti's Theorem we posit that, given these optimization instances and a partitioning of the population into the instances, we can ensure that the sampling of individuals within a given optimization instance results in independently generated samples.

% We also assert that worst-case distribution shifts in such settings are not necessarily equivalent to worst-case shifts in straightline machine learning. As an example, consider two loss functions: one that penalizes based solely on poor accuracy, and one that assigns a treatment to $\frac{1}{10}$ of the sample and calculates a regret metric relative to an optimal allocation. We note that in unbounded optimization settings, the choice of worst-case distribution shift with respect to the accuracy-based metric becomes a trivial task. In this scenario a worst-case distribution simply consists of a dataset containing nothing but the worst-performing individual. With these functions, which we term separable loss function, the loss over a dataset can be expressed as a sum of the losses over all individuals in the dataset:

% \begin{align*}
% \ell(\hat{y}, y) \propto \sum_{i=1}^{N} \ell(\hat{y_i}, y_i)
% \end{align*}

% While setting constraints on properties such as the chi-square divergence between the shifted distribution and the empirical distribution makes this solution less trivial, \cite{Duchi} demonstrates that traditional DRO approaches effectively evaluate a generalization of conditional value-at-risk. In other words, these models optimize the worst alpha-tail of their performance over the training set, where alpha depends on the magnitude of the constraint. This illustrates the broader point that typical DRO more or less upweights individual samples with poor model performance.

% In contrast to this paradigm, the loss function of predict-then-optimize is often (usually) non-separable with respect to individuals in a given sample. With regards to the second example loss function, if only 10\% of patients can receive a limited resource, it is possible that the worst-case shift is more nuanced than indiscriminately placing weight on incorrectly classified samples judged under some decision boundary. In the binary classification case, consider in a sample of $n$ class-balanced individuals, all with model predictions higher than the decision threshold, but with the true negative samples having linearly separable and smaller-magnitude predicted values than the true positive individuals. Here an accuracy-based task would yield a loss of about one-half. However, if in the same problem we utilize a function that compares the \textit{relative} values of model predictions rather than measuring against a fixed baseline, we could yield a perfect treatment allocation. Furthermore, in this setting the outcome of one patient depends in part on the outcome of other patients. As such it is possible that no such loss function exists that uniformly decomposes the loss over an entire sample of individuals to whom we assign treatment into a sum of losses over the individuals. We term this phenomenon as the coupling of the loss function over individuals. Thus, our goal more concretely becomes identifying worst-case distribution shifts in these coupled predictive decision making settings.

% More formally, we want to find

% \begin{align*}
% \text{argmax}_{P \in \Theta} \mathbb{E}_{X_{i=1}^{N}, Y_{i=1}^{N} \sim P} [l(z^{*}(m(x)), y)]
% \end{align*}

% In many DRO settings distribution shift is generally represented as a singular mass function $\mathbb{P}$ over the samples in the dataset of interest \cite{Duchi}. However, we note that in resource allocation settings this model begins to break down. Intuitively, the individuals being assessed within a given decision problem are likely to possess certain common characteristics, such as a similar geographic location and time of treatment. These spatiotemporal factors represent a confounding factor using this traditional method of representing distribution shift, as now the data are only independent given these latent subpopulation features. In order to better capture the independence of individuals given access to these features (which we also term optimization instances), we propose a novel application of Bayesian Hierarchical modeling to allocation tasks. By De Finetti's theorem we assume that, given these confounding factors, individuals belonging to the same optimization instance are exchangable \cite{orbanz2014bayesian}. This assumption justifies a two-level hierarchy with the upper level representing a latent subpopulation variable, which we use to condition upon individuals.

% Concretely, we model the distribution of some population $(\mathcal{X}, \mathcal{Y})$ using latent subpopulations imposed upon the data factored into a 2-layered Bayesian Hierarchical Model. We approximate $(\mathcal{X}, \mathcal{Y})$ with available training data $(X, Y)$, with each datapoint $x_i \in X$ belonging to a latent subpopulation, or optimization instance $1 \leq S \leq k$, each containing $\beta_S$ individuals. Let $P^{(0)}$ represent a probability mass function of size $|P^{(0)}| = k$. Furthermore, let distributions $P^{(1)}, \cdots P^{(k)}$ of sizes $\beta_1, \cdots, \beta_k$ be probability mass functions over the individuals in each optimization instance. 

% To sample from this distribution, we sample $S \sim P^{(0)}$. Samples are generated using  $U =  \{(x_1, y_1), \cdots, (x_n, y_n)\sim P^{(S)}\}$. This structure is described in Figure 1.

% \begin{figure}
% \includegraphics[scale=.3]{./images/hierarchy_2d.png}
% \caption{Bayesian Hierarchical Model. The root node leads into one subpopulation type with $k$ optimization instances. At the bottom level, the individuals belonging to each optimization instance are grouped into buckets, over which $k$ probability density functions are placed. Probability distributions over nodes are represented with dotted rectangles; each distribution is labeled as $P^{(n)}$, where $n$ represents an index to the subpopulation (or if $n=0$, a distribution over all optimization instances). }
% \label{fig:hierarchical_visualizer}
% \end{figure}

% To establish a constrained optimization problem we add the constraint that no set of weights $P^{(i)}$ can stray too far away from the empirical distribution, meaning each set of weights over both optimization instances and individuals within problem instances must belong to some 

% $\Theta^{(i)} = \{P^{(i)} | D_{\chi^2} (P^{(i)}, P_{\text{uniform}}) \leq \rho ;  ||P^{(i)}|| = 1\}$

% , where the feasible set represents a chi-square ball surrounding the empirical distribution. Our model contains a single set of weights over optimization instances $P_0$, and each optimization instance has its own $P_S$ representing a probability mass function over the individuals belonging to that optimization instance. 

% Formally, we express the optimization problem of identifying worst-case expected loss over distribution shift in the optimization instance-level and individual-level weights in this environment with a single level of subpopulations as:

% $\text{max}_{P^(0) \in \Theta^{(0)}} \mathbb{E}_{S \sim P^{(0)}}[\text{max}_{P^{(S)} \in \Theta^{(S)}} \mathbb{E}_{U^{(S)} \sim P_S}[\ell (U^{(S)})]]$

% We assert in this paper that the above optimization problem can be addressed with coordinate gradient descent.

% We assert in this paper that the above optimization problem can be expressed as a continuous-submodular problem, and, as such, can be addressed with Frank-Wolfe programming optimization methods.

% Our approach contrasts with the above works by considering the task of predictive resource allocation as a problem subject to arbitrary distribution shifts within and between a partitioning of the population into a set of optimziation instances. Furthermore, we utilize a novel transformation of the objective function to reframe the problem of worst-case shifts in this setting as a submodular maximization problem. We finally build upon the intuition that identifying a worst-case shift for one decision problem may not be the same as identifying the worst-case shift for another with a novel experiment to compare worst-case shifts with respect to multiple loss functions.

\section{Methods}

\textbf{Defining a constrained set of shifts.} The first challenge to identifying worst-case distribution shifts for predictive resource allocation is to formulate a model for the set of possible distributions $\Theta$. This is more difficult than in the standard supervised learning setting, where typical approaches define a set centered on the empirical distribution over individuals. For example, common choices include an $f$-divergence or Wasserstein uncertainty set \citep{namkoong2016stochastic, kuhn2019wasserstein}. Implicitly, such formulations are based on the assumption that individuals are sampled independently from $\Pind$ and so can be represented just by a vector containing the marginal probability of each individual. However, in our setting the instance-level structure means that individuals are not marginally independent: the patients who all arrive at a hospital on a specific day (forming an instance of the allocation problem) may differ systematically from those who arrive a month later. To account for this, we represent our setting via a two-level generative model. First, a latent instance-level parameter $\xi$ is sampled. We denote the marginal distribution over $\xi$ as $\PP_\xi$. Second, individuals within the instance are sampled iid conditional on $\xi$:
\begin{align*}
    \xi &\sim \mathbb{P}_{\xi}\\
    x_i, y_i &\overset{\text{iid}}{\sim} \PP_{\text{ind}}(\cdot | \xi), \quad i = 1, \dots, n_j
\end{align*}
This represents the assumption that individuals are conditionally independent given instance-level information. For example, after conditioning on circumstances in the community (e.g.\ current disease levels), we suppose that the individual patients arriving at the hospital are independent. We remark that some appropriate $\xi$ is guaranteed to exist by De Finetti's theorem so long as individuals are modeled as exchangeable \citep{orbanz2010bayesian}. 

 Given this generative model, we adapt the common strategy of using the empirical distribution over the samples $\PN$ as a proxy for the unobserved $\PP$. Specifically, we allow both deviation from the empirical distribution over instances (to model shift in $\PP_\xi$) as well as deviation from the empirical distribution over individuals within each instance (to model shift in each $\PP_\text{ind}(\cdot|\xi)$). Let $\xi_j$ be the value of the latent variable in observed instance $j$. Importantly, even though $\xi_j$ is not itself observed, we only need to be able to model shifts in the distribution over $X, Y$ conditional on $\xi_j$. For this purpose, let $\PN_{\text{ind}, j}$ denote the empirical distribution over individuals within observed instance $j$; this will be our empirical proxy for $\PP_{\text{ind}}(\cdot | \xi_j)$. If the decision maker happens to have additional samples believed to be from the same population, these could be used as well. Accordingly, we define the feasible set of shifts for $\PP_{\text{ind}}(\cdot | \xi_j)$ to be 
 \begin{align*}
     \Theta_j = \{Q_{j}|\, D(Q_{ j}, \hat{\mathbb{P}}_{\text{ind}, j}) \leq \rho_{\text{ind}}\}
 \end{align*}
where $D$ is a standard divergence on distributions (e.g., the $\chi^2$ divergence) and $\rho_{\text{ind}}$ is a parameter chosen by the user to control the amount of distribution shift allowed at the individual level. To obtain the overall set of feasible shifts, we additionally allow a controlled level of shift in the distribution over instances. Let $\PN_\xi$ be the empirical distribution over the sampled instances (emphasizing again that the values of $\xi$ are irrelevant and we treat $\PN_\xi$ just as a distribution over the indices $1, \dots, k$). We will represent our feasible set by the combination of a distribution $Q_\xi$ over the sampled instances alongside a set $\{Q_{j}\}_{j = 1}^k$ of within-instance distributions over individuals. The final feasible set is thus
\begin{align*}
    \Theta = \{(Q_\xi, \{Q_{j}\}_{j 
 = 1}^k)| D(Q_\xi, \PN_\xi) \leq \rho_\xi, Q_{j} \in \Theta_j \,\, \forall j\}
\end{align*}
where $\rho_\xi$ is a final parameter specifying the allowed degree of shift across instances. Each element of $\Theta$ defines a distribution, which can be sampled from by first sampling an instance identifier $j$ from $Q_\xi$ and then sampling individuals iid from $Q_{j}$. $Q_\xi$ can be represented as a vector in $\mathbb{R}^k$ giving the probability of each instance, and $Q_j$ can be represented as a vector of size $n_j$ giving the marginal probability placed on each observed individual. 

\textbf{Optimization over the set of shifts.} The model above induces an optimization problem to identify the worst-case shift with respect to the decision loss. Specifically, we wish to solve
\begin{align*}
\max_{Q \in \Theta} \E_{j \sim Q_\xi}\left[\E_{X, Y \sim Q_{ j}}\left[DL(Y, m(X))\right]\right] \tag{1} \label{eq:eq1}.
\end{align*}
To analyze the structure further, we expand the expectations into sums, using that samples are iid within instances. Let $S_j$ denote the set of all possible draws (with replacement) of $n_j$ individuals from the observed samples. The problem becomes
{\small \begin{align*}
    \max_{Q \in \Theta} \sum_{j = 1}^k Q_\xi(j) \sum_{X, Y \in S_j} \left(\prod_{i = 1}^{n_j} Q_{j}(x_i, y_i)\right) DL(Y, m(X)).
\end{align*}}%
A first step towards solving this problem is to note that each $Q_{j}$ can in fact be computed separately, since the outer objective is a sum with non-negative coefficients. That is, we can define
{\small \begin{align*}
    Q_j^* = \arg\max_{Q_j \in \Theta_j}  \sum_{X, Y \in S_j} \left(\prod_{i = 1}^{n_j} Q_{j}(x_i, y_i)\right) DL(Y, m(X))
\end{align*}}%
and then solve 
\begin{align*}
    \max_{Q \in \Theta} \sum_{j = 1}^k Q_\xi(j) \sum_{X, Y \in S_j} \left(\prod_{i = 1}^{n_j} Q_j^*(x_i, y_i)\right) DL(Y, m(X))
\end{align*}
to obtain the optimal distribution over instances. The outer problem has a relatively tractable structure which is closely related to existing work on distributionally robust optimization. Given knowledge of $Q_j^*$, it can be solved using off-the-shelf convex optimization techniques. However, the inner optimization problem for each instance $j$ is much more difficult as it is a \textit{nonconvex} problem. Indeed, polynomial optimization is in general computationally intractable \citep{karp2010reducibility, cook2023complexity}. This structure reflects the fundamental change in perspective from individual-level losses to resource allocation: the decision loss encapsulates  the joint dependence of decisions on the entire set of individuals who arrive, so the contribution of the parameter for each individual to the loss cannot be neatly disentangled. 

To solve this problem, we draw on techniques from the combinatorial optimization literature and prove that it can actually be reformulated as a \textit{DR-submodular} optimization problem. This special structure allows us to develop efficient algorithms with provable approximation guarantees. 


% The nested expectations that define our objective along with clear non-convexity in our optimization parameters $P$ present two challenges to solve in the optimization problem. These complications make simultaneous optimization of all weights over all optimization instances technically infeasible, and also rule out traditional convex optimization methods such as stochastic gradient descent.
% As a result of the first challenge, we break the greater optimization problem down into two different problems. First, we identify for each optimization instance a set of weights on individuals that result in worst-case performance within that optimization instance. Then, we identify over all optimization instances a set of weights that result in worst-case performance within the feasible set, by first sampling an optimization instance and then sampling individuals from that optimization instance. We argue that this procedure can be expressed, following a novel change of variables, as a submodular maximization problem which can be optimized using Frank-Wolfe programming methods.

\textbf{Reformulation as a submodular optimization problem.} We assert that our optimization objective falls under DR-submodular problems -- one class of generally non-convex functions. We demonstrate below that under a novel transformation of the objective, our problem is non-monotone DR-submodular in the individual-level and convex in the optimization instance-level. This justifies the use of Frank-Wolfe methods for more optimal solutions to our optimization problems. We assume without loss of generality that the decision loss is either naturally non-positive or bounded in the range $[0,1]$. Nonnegativity always holds by definition of the regret, and any uniformly bounded loss can be rescaled to an upper value of 1. Thus, for a given loss function $DL$ with such bounds, we define:
{\small \begin{align*}
    DL'(Y, \hat{Y}) = DL(Y, \hat{Y}) - 1
\end{align*}}%

% We prove that using $DL'$ in place of $DL$ still yields in an equivalent quality approximation to Equation \ref{eq:eq1}, when we add 1 to the returned maximized objective value of $DL'$ as an estimate for the worst-case expectation of $DL$.

Then, we introduce a change of variables that supports our non-monotone Frank-Wolfe solution, built on \cite{Bian}. Our solution requires that our feasible set contains the zero vector. Since this condition does not hold for the simplex, we instead optimize over the \textit{offset} from the empirical distribution, where an initialized offset value of 0 is equivalent to the empirical distribution. We define these offsets, along with their feasible sets in the constrained optimization setting, as:
\begin{align*}
    &W_j(x_i, y_i) = Q_j(x_i, y_i) - \frac{1}{|Q_j|} \,\, \forall i, j \\
    &\Omega_j = \left\{W_j | \,W_j + \frac{1}{|Q_j|} \in \Theta_j\right\}
\end{align*}

% \begin{align*}
%     &\mathbb{Z}_{\xi} = \PP_\xi - \frac{1}{k}\\
%     &\mathbb{Z}_{\text{ind}}(\cdot | \xi) = \mathbb{P}_{\text{ind}}(\cdot | \xi) - \frac{1}{|\mathbb{P}_{\text{ind}}(\cdot | \xi)|}\\
%     &\Omega_{j} = \{W_j = Q_j - \frac{1}{|Q_j|} \,\, \forall Q_j \in \Theta_j\}\\
%     \Omega = &\{(W_\xi, \{W_{j}\}_{j 
%  = 1}^m)| D(W_\xi + \frac{1}{k}, \PN_\xi) \leq \rho_\xi, W_{j} \in \Omega_j \,\, \forall j\}
% \end{align*}
% Furthermore, our sampling process becomes as follows:
% \begin{align*}
%     &\mathbb{P}_{\xi} = \mathbb{Z}_{\xi} + \frac{1}{k}\\
%     &\xi \sim \mathbb{P}_{\xi}\\
%     &\mathbb{P}_{\text{ind}}(\cdot | \xi) = \mathbb{Z}_{\text{ind}} + \frac{1}{|\mathbb{Z}_{\text{ind}}|}\\
%     x_i, y_i &\overset{\text{iid}}{\sim} \PP_{\text{ind}}(\cdot | \xi), i = 1...n_j
% \end{align*}
This change of variables is finally represented by the following modified optimization problem. 
% {\small \begin{align*}
% \text{max}_{X^{(0)} \in \Omega^{(0)}} \mathbb{E}_{S \sim X^{(0)}}[\text{max}_{X^{(S)} \in \Omega^{(S)}} \mathbb{E}_{U^{(S)} \sim \frac{1}{\beta_S} + X^{(S)}} [\ell (U^{(S)})]]
% \end{align*}}%
\begin{align*}
\max_{W \in \Omega} \E_{j \sim W_\xi}\left[\E_{X, Y \sim W_{ j}}\left[DL'(Y, m(X))\right]\right] + 1. \tag{2} \label{eq:eq2}
\end{align*}
Note that we apply a correction term of 1 to our final worst-case estimation of $DL'$, in order to account for the adjustment from $DL$ in Equation \ref{eq:eq1} to $DL'$ in Equation \ref{eq:eq2}.

To begin the justification of this approach, we will prove that an approximate solution to Equation \ref{eq:eq2} results in an equivalent quality approximation to the original problem in Equation \ref{eq:eq1}. We will then prove that the optimization problem over offsets $W$ is DR-submodular, with full proofs in Appendix \ref{sec:proofs}.

\begin{theorem}
\label{thm:optthm}
Suppose we have a solution $W$ to Equation~\ref{eq:eq2} with value at least $\alpha \cdot OPT'_W  - \epsilon$ for some $\alpha \in \mathbb{R}, \epsilon \in \mathbb{R}$, where $OPT'_W$ is the optimal value. Then $W$ corresponds to a $Q$ with value at least $\alpha \cdot OPT_Q - \epsilon$, where $OPT_Q$ is the optimal value of solving Equation~\ref{eq:eq1}.
\end{theorem}

% This theorem is proven using a simple substitution of variables.

In order to prove that the change-of-variables objective is DR-submodular, we first recall the following definition:

% \begin{definition}
% \label{def:inj}
% Consider some loss function bounded between 0 and 1 inclusive, $\omega$. We define a modified loss function $\ell$ by $\ell(U) = \omega(U) - 1$, for some sample $U$.
% \end{definition}

% \begin{definition}
% \label{def:inj}
% A function $f:\mathbb{X} \to \mathbb{Y}$ is monotone if, for all $x \in X$, $\frac{\partial f(x)}{\partial x} \geq 0$.
% \end{definition}

\begin{definition}
\label{def:inj}
A twice-differentiable function $f:\mathbb{X} \to \mathbb{Y}$ is DR-submodular if all entries of the Hessian matrix are non-positive.

% you will probably need a more mathematical definition with a lemma to make this statement
\end{definition}

We then demonstrate that the objective function of our reformulated inner optimization problem satisfies this definition:
\begin{theorem}
\label{thm:bigtheorem}
The objective function of the optimization problem in Equation \ref{eq:eq2} is non-monotone DR-submodular in $W_j$.
\end{theorem}

% \begin{theorem}
% \label{thm:bigtheorem}
% The objective function of the optimization problem in Equation \ref{eq:eq2} is non-monotone DR-submodular in $W_j$.
% \end{theorem}

Next, we build on this result to introduce efficient approximation algorithms for the reformulated problem. 
% \subsection{Theorems and such}
% The preferred way is to number definitions, propositions, lemmas, etc. consecutively, within sections, as shown below.
% \begin{definition}
% \label{def:inj}
% A function $f:X \to Y$ is injective if for any $x,y\in X$ different, $f(x)\ne f(y)$.
% \end{definition}
% Using \cref{def:inj} we immediate get the following result:
% \begin{proposition}
% If $f$ is injective mapping a set $X$ to another set $Y$, 
% the cardinality of $Y$ is at least as large as that of $X$
% \end{proposition}
% \begin{proof} 
% Left as an exercise to the reader. 
% \end{proof}
% \cref{lem:usefullemma} stated next will prove to be useful.
% \begin{lemma}
% \label{lem:usefullemma}
% For any $f:X \to Y$ and $g:Y\to Z$ injective functions, $f \circ g$ is injective.
% \end{lemma}


% We demonstrate that the Hessian matrix of the objective is non-positive in the weights over individuals. As a result, we assert that our objective function is DR-submodular in all $W_j$ for $1 \leq j \leq k$. Because the high-level task of identifying worst-case expectations over optimization instances is a convex linear combination of the elements of the distribution $W_{\xi}$ we assert that our approach is a valid adaptation of the DR-submodular maximization literature.

% \begin{theorem}
% \label{thm:bigtheorem}
% In the context of optimizing worst-cast loss between optimization instance using a probability mass function $P^{(0)}$ to assign likelihoods to sampling optimization instances, the objective function is non-monotone DR-submodular in the elements of $P^{(0)}$.
% \end{theorem}

% This theorem is also proved by leveraging the non-positive loss function.

% Lastly we prove that the transformed objective is also DR-submodular:

% \begin{theorem}
% \label{thm:bigtheorem}
% The new objective function is DR-submodular.
% \end{theorem}

% Here way may use a similar change-of-variables method as with Theorem 4.1.


\textbf{Approximation algorithms.} \cite{Bian} introduce several methods for maximizing non-monotone DR-submodular problems; we utilize an adaptation of their non-monotone Frank-Wolfe variant in order to solve the above DR-submodular problems, in the context of resource allocation. Since this algorithm requires that the zero vector be a member of the feasible set, we optimize over the offset from the empirical distribution. Further, we take advantage of strong empirical results identified by past works in applying momentum-based methods to Frank-Wolfe methods \citep{Mokharti,li2020does}. These methods store gradients from prior iterations of the algorithm and consider them to adjust the current iteration's gradients.

% ; our probability distributions $P$ are therefore initialized as empirical distributions over either individuals or optimization instances, and successive iterations of the Frank-Wolfe solution solve the problem\\

% For all $1 \leq S \leq k$:\\
% $\text{max}_{X^{(S)}} \mathbb{E}_{U \sim \frac{1}{\beta_S} + X^{(S)}}[\ell(U)]$.\\
% subject to\\
% $\sum X^{(S)}_i = 0$\\
% $D_{\chi^2}(\frac{1}{\beta_S} + X^{(S)}, P_{\text{uniform}}) < \rho_1$.\\

% Let the solutions to the above problems be represented as $X^{(1)*}, \cdots , X^{(k)*}$. Furthermore, define $\Omega^{(S)} = \{\theta^{(S)} - \frac{1}{\beta_S} ,\forall \theta^{(S)} \in \{\Theta^{(0)}, \cdots , \Theta^{(k)}\}\}$. At the higher level of the hierarchy, we seek to maximize the expected value of loss, given the converged weights from the above problems:

% $\text{max}_{X^{(0)} \in \Omega^{(0)}} \mathbb{E}_{S \sim \frac{1}{k} + X^{(0)}}[\mathbb{E}_{U \sim \frac{1}{\beta_S} + X^{(S)*}}[\ell(U)]]$.\\
% subject to\\
% $\sum X^{(0)}_i = 0$\\
% $D_{\chi^2}(\frac{1}{k} + x^{(0)}, P_{\text{uniform}}) < \rho_2$.\\

% In our algorithm, we first optimize over individuals within a given optimization instance and then optimize over all optimization instances. 
% Our full algorithm is detailed in Algorithm 1 in Appendix \ref{sec:algo}, which solves the above optimization problem.


% for each optimization instance, we calculate the expectation of loss after the first step of identifying worst-case shifts within each optimization instance. These worst-case losses are then stored in a vector used as input to gradmax, which then returns the probability distribution within the feasible set that maximizes the dot product of the loss vector over optimization instances with probability distributions in the feasible set.

% insert algo here

% \begin{itemize}
%     \item bian et al introduce a couple of sick algorithms specifically for non-monotone DR-submodular problems
%     \item and we find that this fits perfectly into our problem of maximizing the expected value of some non-positive function over a probability distribution of samples
%     \item note in particular that we require this function to be between 0 and 1 inclusive, and then we subtract 1 in order to get the hessian to be non-positive
%     \item as a result we are able to justify the use of bian et al's 2nd algorithm
%     \item so the process of converging to a set of worst-case weights for a given model involves with L=1 first optimizing over the individual subpopulations, and then optimizing over all subpopulations (write out the 2 optimization problems explicitly here)
%     \item so for each subpopulation, we run fw, calculating expected value of gradient and backpropping as necessary
%     \item we then apply this process to k models trained on each of the k subpopulations, getting $k(k+1)$ different pmfs
%     \item we can then take the same 'class' of model (eg blindly trained on the train set) and optimize worst case loss with respect to 2+ different metrics, getting worst-case loss for each set of distributions
%     \item the intuition is that when we swap distributions, we see lower losses (these are not the worst-case distributions for the other metric)
%     \item TODO run pseudocode/algorithm for the entire experiment? necessary?
% \end{itemize}

% An easy corollary of \cref{thm:bigtheorem} is the following:
% \begin{corollary}
% If $f:X\to Y$ is bijective, 
% the cardinality of $X$ is at least as large as that of $Y$.
% \end{corollary}
% \begin{assumption}
% The set $X$ is finite.
% \label{ass:xfinite}
% \end{assumption}
% \begin{remark}
% According to some, it is only the finite case (cf. \cref{ass:xfinite}) that is interesting.
% \end{remark}
% %restatable

% We note that in traditional ML settings, the choice of worst-case data is trivial - simply take whatever individual datum performed the worst, and fill up your entire dataset with copies of that datum. This assumes that the prediction on datum $x_i$ is independent on the prediction on datum $x_j$, $i\neq j$. In other words, the loss on a dataset can be expressed as a sum of the losses of individuals:

% $l(\hat{y}, y) = \sum_{i=1}^{N} g(\hat{y_i}, y_i)$

% However, the loss function of predict-then-optimize is often (usually) non-separable; in settings where you, for example, force 10\% of patients to get treatment, the outcome of one patient depends in part on the outcome of other patients; this function $g$ may not exist. The question now becomes: how can we parameterize away distribution shift to determine the shift that maximizes loss? 

% More formally, we want to find

% $\text{argmax}_{P \sim \Theta} \mathbb{E}_{X_{i=1}^{N}, Y_{i=1}^{N} \sim P} [l(z^{*}(m(x)), y)]$

% To approach the parameterization problem, we might assume that our data is composed of some sort of hierarchical Bayesian model, where we sample individuals from some set of subpopulations $D_i, 1 \leq i \leq k$, where each $D_i$ corresponds to some subpopulation based on facility, race, first day of hospital admission, or other division of people. We thus parameterize each $P_j \sim \Theta \approx \{w_1, ..., w_k\}$, where each $w_i$ represents a weight placed on the subdistribution of each subpopulation $D_i$. Furthermore, the set of $n_i$ individuals in each subdistribution are each assigned some set of weights each $s_1, ..., s_n$. We then model subpopulation shift as a change in weights in some subset of the subpopulation and individual weights, relative to their initialized or observed values. $\Theta$ therefore represents the set of all reasonable sets of weights that might occur in a population; its most unconstrained form is simply the set of all $k$ weights on the subpopulations, as well as the $k$ sets of $n_i$ weights on the individuals in a given population, each of which lies on the simplex:

% $\{w_1, ..., w_k\}, \{S^{(i)}= \{s_1, ..., s_n\} \forall i \leq k\}$ where $\sum_{i=1}^{k} w_i = 1, \sum_{j=1}^{n_i} S^{(i)}_j = 1$

% The goal of this optimization problem concretely is focused on finding the set of weights $P \in \Theta^*$, consisting of the weights on both subpopulations and individuals in all subpopulations, that maximize the expected value of loss. 

% Note that, because our loss given a sample from a fixed distribution is differentiable w.r.t. the parameters of that distribution, we should be able to do some kind of gradient descent to find the set of weights that maximize the expected loss of a sample over individuals within an optimization instance. We express the expected value of loss under a distribution as the sum over all possible data of the likelihood of that data times that data's loss:

% $\mathbb{E}[l(x, y) | P_j] = \sum_{\{x,y\}_{i=1}^{N}} P(\{x,y\})_{i=1}^{N} l(z^{*}(m(x)),y)$

% After optimizing the p.m.f. over individuals within each optimization instance $P_S | 1 \leq S \leq k$, we then seek to optimize the p.m.f. over optimization instances $P_0$. Our algorithm for doing this sequential optimization of these weights thus follows a sort of coordinate gradient descent, where the inner expectation over samples from a fixed optimization instance is maximized before the outer expectation over all optimization instances.

% \begin{algorithm}[tb]
%    \caption{Coordinate Gradient Descent for Maximizing Expected Loss over Optimization Instances/Individuals in Predict-Then-Optimize settings}
%    \label{alg:example}
% \begin{algorithmic}
%     \STATE {\bfseries Input:} weights $P_0, \cdots, P_k$, data $X_1, \cdots, X_k$

%     \FOR {$i=1$ {\bfseries to} $k$}
%     \FOR {$j=1$ {\bfseries to} iterations}
%     \FOR {$k=1$ {\bfseries to} num\_samples}
%     \STATE sample $U^{S} \sim P_S$
%     \STATE accumulate $\frac{\partial \ell(U^{(S)})}{\partial P_S}$
%     \ENDFOR
%     \STATE update $P_S \leftarrow P_S + \alpha (\frac{\partial \text{aggregate\_loss}}{\partial P_S})$
%     \STATE update $P_S \leftarrow \text{proj}_{P_S}P_S^*$
%     \ENDFOR
%     \ENDFOR

%     \FOR {$i=1 ${\bfseries to} iterations}
%     \STATE sample $S \sim P_0$
%     \FOR {$j=1$ {\bfseries to} num\_samples}
%     \STATE sample $U^{(S)} \sim P_S$
%     \STATE accumulate $\frac{\partial \ell(U^{(S)})}{\partial P_0}$
%     \ENDFOR
%     \STATE update $P_0 \leftarrow P_0 + \alpha(\frac{\partial \text{aggregate\_loss}}{\partial P_0})$
%     \STATE update $P_S \leftarrow \text{proj}^{P_0} P_0^*$
%     \ENDFOR

% \end{algorithmic}
% \end{algorithm}

% \begin{algorithm}[tb]
%    \caption{Frank-Wolfe Method for Maximizing Expected Loss over Optimization Instances/Individuals in Predict-Then-Optimize settings}
%    \label{alg:example}
% \begin{algorithmic}
%     \STATE {\bfseries Input:} weight offsets $W_{\xi}, \cdots, W_{j}, \cdots, W_k$ (initialized to $\{0\}_{1}^{n_j}$), $v^{(1)}_0 = \{0\}_{1}^{n_j}$, $v^{(2)}_0 = \{0\}_{1}^{n_j}$, iterations, num\_samples, num\_samples2, $p_t$

%     \FOR {$i=1$ {\bfseries to} $k$}
%     \FOR {$j=1$ {\bfseries to} iterations}
%     \FOR {$k=1$ {\bfseries to} num\_samples}
%     \STATE sample $\{x_s, y_s\}_{s=1}^{n_j} \sim W_i$
%     \STATE calculate $\ell = DL(m\{x_s\}_{s=1}^{n_j}, \{y_s\}_{s=1}^{n_j})$
%     \STATE accumulate $\frac{\partial \ell}{\partial W_i}$
%     \ENDFOR
%     \STATE set $v^{(1)}_i = p_t * -\frac{\partial \ell}{\partial W_i} + (1-p_t) * v^{(1)}_{i-1}$
%     \STATE solve $v = \text{\textbf{gradmax}}(v^{(1)}_i) - \frac{1}{|W_i|}$
%     \STATE update $W_i = W_i + \frac{1}{\text{iterations}} v$
%     \ENDFOR
%     \ENDFOR
    
%     \FOR {$i=1 ${\bfseries to} iterations}

%     \FOR {$j=1$ {\bfseries to} num\_samples2}
%     \STATE sample $h \sim W_\xi$
%     \FOR {$k=1$ {\bfseries to} num\_samples}
%     \STATE sample $\{x_s, y_s\}_{s=1}^{n_j} \sim W_h$
%     \STATE calculate $\ell = DL(m\{x_s\}_{s=1}^{n_j}, \{y_s\}_{s=1}^{n_j})$
%     \STATE accumulate $\frac{\partial \ell}{\partial W_h}$
%     \ENDFOR
%     \STATE set $v_i = p_t * \frac{-\partial \ell}{\partial W_h} + (1-p_t) * v_{i-1}$
%     \STATE solve $v = \text{\textbf{gradmax}}(v_i) - \frac{1}{k}$
%     \STATE update $W_\xi = W_\xi + \frac{1}{\text{iterations}} v$

%     \ENDFOR
%     \ENDFOR

% \end{algorithmic}
% \end{algorithm}

% Our methodology for converging towards a set of perfectly suboptimal weights is as follows: 1) initialize our weights to be perfectly uniform, over subpopulations and creating a uniform distribution over each group of patients in a given subpopulation, 2) for each subpopulation, repeatedly take batches of samples of individuals in that subpopulation, where each batch of samples represents 1 epoch of sampling, 3) for each batch, calculate the average loss of the samples in the batch and the gradient of that loss with respect to the weights of each sampled individual, 4) update said weights and project back onto the space of valid probability distributions within some distance metric from the initialized set of weights, and 5) repeat this optimization pattern for the subpopulation-level weights, randomly sampling a subpopulation at each epoch and using our previously trained individual-level weights to randomly sample individuals from that subpopulation. Using this block coordinate descent-esque algorithm we will then have a set of weights that correspond to a worst-case subpopulation shift within reason.
% For a specific sample from that distribution, the gradient w.r.t. some weight $w$ (either representing a subpopulation or an individual) is represented as

% $\nabla_{w} (l(x, y) | P_j) = \sum_{\text{samples}} \nabla_{w} P(\text{sample})\ell(\text{sample})$\\

% $= \sum_{\text{samples}} \nabla_{w} \log (P(\text{sample}))P(\text{sample})\ell(\text{sample})$\\
% $= \mathbb{E}_{\text{samples} \sim P}[(\nabla_{w} \log(P(\text{sample})))\ell(\text{sample})]$.\\

% For a subpopulation, this gradient becomes\\

% $= \mathbb{E}_{\text{samples} \sim P}[(\nabla_{w} \log(P(\text{sample})))\ell(\text{sample})]$\\
% $= \frac{\partial}{\partial w_{i}} \ell(\text{sample}) \frac{\partial}{\partial w_i} \log(w_i \Pi_{x\in \text{sample}} P(x | w_i))$\\
% $= \ell(\text{sample}) \frac{\partial}{\partial w_i} \log (w_i) + \sum_{x \in \text{sample}} s_x$\\
% $= \frac{\ell(\text{sample})}{w_i}$.

% For an individual, this gradient is \\

% $\frac{\partial}{\partial s_i} \ell(\text{sample})[\log(w_i) + \sum_{\text{individuals}} \log(s_{\text{individual}})]$\\
% $= \frac{\sum_{s_j \in \text{sample}} 1\{i=j\}}{s_i}(\ell(\text{sample}))$

% These gradients thus define our update rules for each iteration of coordinate gradient descent.
% Evaluating this gradient for all weights allows us to use some kind of optimization (eg stochastic gradient descent) to find weights that maximize expected loss.

% \subsection{Main Algorithm}
% \label{sec:algo}

\textbf{Main algorithm.} In our algorithm, we first optimize over individuals within a given optimization instance and then optimize over all optimization instances. We include the full pseudocode in Algorithm~\ref{alg:example}, a formal writeup of how we implement the Frank-Wolfe algorithm developed by \cite{Bian}. Our algorithm includes a subroutine called \textit{gradmax}, which maximizes the dot product over the feasible set of viable allocations and the gradients of the objective function w.r.t.\ the optimization variables \citep{Wilder2}. Additional implementation details can be found in Appendix~\ref{sec:code}.

% Due to the strong empirical improvements we saw along with past work by such works as \cite{li2020does}, we also introduce a momentum term into the update rule that preserves a portion of gradients calculated in the previous iteration of the algorithm, initialized to 0.

% Building on prior work from \cite{Wilder2}, Frank-Wolfe algorithms commonly require subroutines to maximize the dot product over the feasible set of viable allocations and the gradients of the objective function with respect to the optimization variables. We incorporate their work as a subroutine within our algorithm, termed \textit{gradmax}. Gradmax is also used to solve the optimization problem over all optimization instances, where we input a vector of converged worst-case losses for all optimization instances along with $\rho_\xi$ into gradmax, which then returns the probability distribution within the feasible set that maximizes expected worst-case loss over optimization instances.

\begin{algorithm}[!ht]
   \caption{Frank-Wolfe Method for Maximizing Expected Loss over Optimization Instances}
   \label{alg:example}
\begin{algorithmic}
    \STATE {\bfseries Input:} weight offsets $W_{\xi}, \cdots, W_{j}, \cdots, W_k$ (initialized to $\{0\}_{1}^{n_j}$), $v_0 = \{0\}_{1}^{n_j}$, iterations, num\_samples, num\_samples2, $p_t$, $\rho_{\text{ind}}$, $\rho_\xi$

    \FOR {$i=1$ {\bfseries to} $k$}
    \FOR {$j=1$ {\bfseries to} iterations}
    \FOR {$r=1$ {\bfseries to} num\_samples}
    \STATE sample $\{x_s, y_s\}_{s=1}^{n_i} \sim W_i$
    \STATE calculate $\ell = DL'(\{y_s\}_{s=1}^{n_i}, m(\{x_s\}_{s=1}^{n_i}))$
    \STATE accumulate $\frac{\partial \ell}{\partial W_i}$
    \ENDFOR
    \STATE set $v_j = p_t * -\frac{\partial \ell}{\partial W_i} + (1-p_t) * v_{j-1}$
    \STATE solve $\delta = \text{\textbf{gradmax}}(v_j, \rho_{\text{ind}}) - \frac{1}{|W_i|}$
    \STATE update $W_i = W_i + \frac{1}{\text{iterations}} \delta$
    \ENDFOR
    \ENDFOR

    \STATE initialize $\lambda = \{0\}_{1}^{k}$
    \FOR {$i=1$ {\bfseries to} $k$}

    \FOR {$j=1$ {\bfseries to} num\_samples2}
    \STATE sample $\{x_s, y_s\}_{s=1}^{n_i} \sim W_i$
    \STATE accumulate $\ell = DL'(\{y_s\}_{s=1}^{n_i}, m(\{x_s\}_{s=1}^{n_i}))$
    % \STATE set $v^{(2)}_i = p_t * \frac{-\partial \ell}{\partial W_h} + (1-p_t) * v^{(2)}_{i-1}$
    % \STATE update $W_\xi = W_\xi + \frac{1}{\text{iterations}} v$

    \ENDFOR
    \STATE set $\lambda_i = \text{avg} (\ell)$
    \ENDFOR
        \STATE solve $W_\xi = \text{\textbf{gradmax}} (\lambda, \rho_\xi)$

\end{algorithmic}
\end{algorithm}

\section{Experiments and Results}

We evaluate our approach on three allocation tasks constructed from real-world US census data \citep{folktables}. Each task is motivated by resource allocation problems in a public policy-related setting. More concretely, we consider (1) predicting employment status, (2) predicting an individual's income in American dollars, and (3) predicting whether an individual's income is above or below \$50,000 annually. For each task, we consider a hypothetical resource allocation problem, modeling a decision maker who wishes to target a limited intervention to individuals more likely to be unemployed (task 1) or more likely to have low income (tasks 2 and 3). In each of these tasks, the optimization instances are formed from individuals in a particular US state, reflecting that allocation decisions are made among geographic cohorts and different states may differ systematically in distribution.

We use these domains to run large-scale experiments over thousands of different combinations of models, optimization instances, and loss functions. See Appendix \ref{sec:exptdetails} for further details on the train-test setup, model architectures, etc.

% For our third task, we take a case study of a real medical domain based on the NC3 electronic health record dataset of COVID-19 outpatients. Here, computational limitations of the secure platform the data is stored on prevent us from running comparably large-scale experiments. However, this dataset allows us to reproduce our core findings in the context of a high-stakes medical task. Specifically, we adopt the goal of predicting mortality in the context of allocating a resource-limited medical treatment (motivated e.g., by limited supplies when therapeutics such as Paxlovid were introduced). Optimization instances are formed by bucketing patients by month, reflecting both that allocations are made based on limited supply that accrues during a specific time period and that patient distributions may differ systematically across, e.g., waves of the pandemic. Details on the train-test setup, model architectures, and so on, may be found in Appendix \ref{sec:exptdetails}.

% \subsection{Employment Prediction}

% We suppose that a non-profit employment assistance agency has developed a program that is meant to help the unemployed find jobs, where there is a finite headcount and lack of concrete labels as to whether an individual is employed. As such, this agency must be careful as to which individuals they target; assume that they have a limited headcount for this program and want to target as many unemployed people as possible. We use a logistic regression model to predict the probability that a given individual is employed, using model outputs to inform the decision task. In this context, we want to determine whether a given model can be distributionally robust to different subpopulations (i.e., individuals from different states); here we treat the individual's state of residency as the set of optimization instances.
% Finally, consider the scenario in which this agency originally operates in one community (e.g. the state of Pennsylvania) and wishes to expand their operations to other states while taking into account the fact that characteristics of unemployed individuals may change from state to state. 
% This motivates the use of our estimator to determine whether a given model can be robust to distribution shift over states and within states.

% \subsection{Income Prediction}
% We consider a second real-world scenario involving income prediction, where the task is to allocate resources to underemployed rather than strictly unemployed people. For this task, we use a 1-layer neural network to predict whether an individual makes over \$50,000 annually \cite{folktables}. We similarly consider distribution shift over states, or whether a given model can be distributionally robust given deployment in different geographic settings --- as with the employment dataset we bucket individuals into optimizations by their state of residency. 
% To demonstrate that our approach is agnostic of model architecture, we use 1-hidden-layer neural networks to classify individuals in this dataset.

% \subsection{COVID-19 Dataset}

% Consider a scenario in which the medical community has gained access to . However, access to this treatment is limited and only a finite number of patients will be able to receive it. 
% We consider a real world setting of Paxlovid allocation, a potentially life-saving treatment, for high-risk COVID-19 outpatients. We use a multi-layered perceptron to assess the risk of death for a given patient. However, we want to take into account the fact that temporal factors may confound our ability to treat patients in a deployment settings: patients who enter the hospital at the same time may tend to have similar demographic and health characteristics due to the infectious origin of diseases like COVID-19. Thus, we want to determine whether a given model can be robust to distribution shift when considering patients from similar cohorts. Here, we treat the month in which a patient was treated for COVID-19 as the set of optimization instances, and categorize patients into buckets by treatment month.

\subsection{Loss functions}
We identify and compare the worst-case distribution shift for the following loss functions.
% with respect to three different loss functions shared between all tasks, and several loss functions applicable either only to the binary prediction or regression setting. 
Recall that the objective of our method is to identify a set of distributions over individuals and optimization instances that maximizes the expected value of a given loss function. First, we look at \textbf{top-k}, where the decision maker has $k$ units of the resource available per instance and the objective function is the number of individuals with true label 1 who receive the resource. This is the most canonical model of scarce resource allocation based on predicted risk. Second, we look at \textbf{knapsack}, where the decision maker's objective is the same as the top-k setting but they are subject to a knapsack constraint instead of a simple budget \citep{mulamba2020contrastive, stuckey2020dynamic, Tang}. More specifically, we set an individual's cost to be proportional to the number of years of education, simulating a policy maker who also wishes to guarantee that public assistance is given preferentially to individuals with less education. Note that top-k is a simplification of knapsack where cost is identical for all individuals. 
Third, we study a fairness-motivated loss function in which the decision maker takes the decision rendered by top-k but wishes to assess the equity of the resulting allocation. We calculate the true positive rate (TPR) over all distinct racial subgroups in the optimization instance, and calculate a Gini coefficient using these TPRs as a measure of unfairness of treatment over racial groups. For simplicity, we refer to this loss function as \textbf{fairness-based loss}.

% Third, \textbf{fair}, a fairness-motivated loss function in which the decision maker uses the same objective function as top-k, calculates True Positive Rate (TPR) over all  distinct racial subgroups in the optimization instance, and calculates a Gini coefficient using these TPRs as a measure of unfairness of treatment over racial groups.

For the binary prediction tasks, we look at \textbf{1-accuracy}, or the misclassification rate over individuals. We also look at the cross entropy loss (\textbf{CE}) over individuals. During optimization, CE is scaled as necessary to fit the format required by our Frank-Wolfe solution (i.e. we take the negative inverse of cross-entropy loss).

% For the binary prediction tasks, we first have \textbf{1-accuracy}, or the fraction of individuals correctly classified subtracted from 1, as an example of a standard (decision-blind) metric. Secondly, \textbf{CE}, which takes the negative inverse of cross-entropy loss.

For the income regression task, we look at mean squared error (\textbf{MSE}), which is also scaled as necessary to fit the format required by our Frank-Wolfe solution (i.e., we take the base-2 log and divide by a constant). Further, we look at a \textbf{utility-based loss}, where a decision maker seeks to maximize the Nash social welfare function of income, divided by a constant, over individuals \citep{kaneko1979nash}. The decision maker also has access to a (finite) budget which prioritizes distributing money to relatively poor individuals to maximize utility.
% We do not include this loss function in the COVID-19 domain, because there is little application motivation for differential costs across individuals.


% In order for the Hessian matrix with respect to optimization variables $W$ to be negative, we require that the loss function $DL$ be upper-bounded. Here, we utilize loss functions, or metrics, bounded between 0 and 1 inclusive; subtracting 1 to arrive at $DL'$ allows us to justify the use of \cite{Bian}'s algorithm. We propose three such metrics below.

% \paragraph{1-Accuracy} In this baseline approach, predictions are made by rounding predictions to the nearest whole number. This task is independent of the number of individuals drawn in a given sample; thus, we sample one individual at a time per sample per iteration of the Frank-Wolfe solution. The remaining loss functions that we employ revolve around more nuanced allocation tasks.

% \paragraph{Top-k Approach} In this approach, individuals are to be allocated resources with a quantity too low to treat all individuals. In this scenario, the model is used to predict the likelihood of each individual's outcome; the top $\frac{1}{k}$ of individuals predicted to have the outcome variable (i.e., unemployment or mortality) are then assigned the resource. Loss is then calculated by subtracting the precision of the resource from the optimal precision. In this experiment, we set $k=3$.

% \paragraph{Knapsack Loss} Here we treat allocation as a form of the 0/1 knapsack problem, where each individual has a binary weight (i.e., [0,1])
% (1 if a positive outcome, 0 if a negative outcome; approximated as a decimal from 0 to 1 by the model) 
% and an assigned cost. In the context of census data, we are interested in allocating resources to relatively less educated people.
% , as a kind of proxy for underprivileged individuals. 
% Each individual's cost is given by adding a constant offset value to their number of years of education, giving more educated individuals a higher cost on the assumption that educated people are less likely to derive benefit from this workforce preparedness program. The objective of the optimization problem is to maximize the sum of weights while keeping the total incurred cost of treated individuals below a predefined limit. Our loss is calculated using relative regret. Formally, let $z$ represent the achieved benefit of the chosen allocation, and let $z^*$ represent the optimal benefit had the optimizer known the true labels beforehand. We define relative regret as: $1 - \frac{z}{z*}$.

% Throughout the following section we define decision-based metrics as metrics that consider the relative predictive model outputs and/or features of individuals in an optimization problem to determine a final treatment over individuals. Out of our proposed loss functions, we consider only the top-k and knapsack losses as decision-based metrics.

\subsection{Experimental Setup}

We train predictive models on each dataset using cross-entropy (CE) loss, mean-squared error (MSE) loss, and Smart Predict-then-Optimize loss (SPO) with knapsack as the underlying decision task \citep{Elmachtoub, Tang}. We train two separate models for each state, one with the SPO loss and one with either CE (for binary classification models) or MSE (for regression models). For each predictive model, we then identify worst-case distributions over all optimization instances, w.r.t. each applicable metric, to obtain different worst-case distributions to compare systematically. For each of the 5 applicable loss functions, this results in 50 worst-case distributions per base model, for 75,000 worst-case distributions over each of the three prediction tasks. After identifying all worst-case distributions, we evaluate each converged worst-case distribution on all \textit{other} applicable metrics. This is accomplished by (i) drawing samples from the generative model for each worst-case distribution, (ii) evaluating the average loss of these samples under each of the other loss functions, and (iii) for each model, compiling these averages for all worst-case distributions---resulting in the matrices shown in Figure \ref{fig:fig1}.
See Appendix \ref{sec:exptdetails} for additional experimental details, and Appendix \ref{sec:extragraphs} for additional results.

% Since for the COVID-19 task we are interested in the model's ability to be robust to future cohort distributions, we fit models to the first month with available data (i.e., March 2020). Furthermore, since knapsack is poorly motivated here (i.e., the cost of administering a medical treatment such as Paxlovid should not depend upon demographic or medical properties of individuals) we train SPO models using top-k and do not consider knapsack.

% We train models on each optimization instance using two different loss functions. Models are trained using both cross-entropy loss (we refer to these models as 'CE') and using Smart Predict-then-optimize loss on the knapsack task defined above ('SPO') \cite{Elmachtoub, Tang}.
% % , with code adapted from \cite{Tang}. 
% Thus, for each state we train two models: one using CE loss and one using SPO loss, resulting in 200 total models. Note that, while training using cross-entropy loss can be accomplished with nothing but the raw train set, training using SPO requires treating each optimization instance as a random sample of individuals from the train set. To this end, we take 15,000 samples of $n_j = 20$ individuals, each sample representing one treatment allocation problem. Two such models are trained on each of the two datasets, resulting in four total trained models per each of the fifty optimization instances. For the COVID data, as we are interested in the model's ability to be robust to future changes in cohort distributions, we fit models to the first month in which data is available (March 2020). Furthermore, because the knapsack loss is poorly motivated here (the cost of administering a medical treatment such as Paxlovid should not depend upon demographic or medical properties of individuals) we train SPO models using top-k loss and do not consider the knapsack task.

% Under the assumption that 500 individuals constitutes a representative sample of the training set, we isolate the first 500 entries of the test set for each optimization instance in order to identify worst-case shifts. Each of the 200 trained models are then optimized to find their worst-case distributions with respect to our three provided decision problems. For each distribution we run 15 iterations of our Frank-Wolfe algorithm, with a momentum value of 0.7 and with 75,000 optimization problems sampled per iteration. For each of the three worst-case distributions converged for each of the models, we evaluate the expected performance of the distribution on all three loss functions, given the original model used to generate the distribution. This is accomplished by sampling 200 optimization instances from each distribution/loss function combination, sampling 4000 decision problems from each optimization instance, and aggregating over distribution/loss function pairs. For all models we set variables $\rho_1 = 500, \rho_2 = 6.25$ in order to impose meaningful constraints on optimization problems. We finally aggregate these 3x3 matrices across dataset and model training method. 

% For each of the 30,000 converged distributions we fit a XGBoost regressor predicting the converged weight on a given individual given their features, in order to derive some intuition as to which features are particularly important for determining which individuals will recieve higher probabilites in a given worst-case distribution. This work mirrors similar work by \cite{LIME} in using post-hoc models to explain the decisions of a more ambiguous process. The distributions of the importance scores over features over models are also compiled into aggregate form, broken by prediction task, training function, and evaluated decision function, resulting in 2,500 importance scores per plotted distribution.

\subsection{Results}
% We analyze the loss that the worst case distribution for each loss function induces on the \textit{other} metrics 
Given a worst-case distribution w.r.t. one metric, we evaluate the expected value of all other applicable metrics using samples from the distribution. This allows us to quantitatively assess whether a worst-case shift w.r.t. a decision-blind metric (e.g., 1-accuracy, MSE, CE) is also worst-case w.r.t. a decision-based metric (e.g., top-k, knapsack, fairness-based loss, utility-based loss). We refer to decision-based metrics as metrics that consider the relative predictive model outputs and/or individual-level features in an optimization problem. We refer to decision-blind metrics as metrics that do \emph{not} consider the relative predictive model outputs; these metrics can be expressed as a sum of losses over individuals.
% We refer to decision-blind metrics as metrics that do \textit{not} do this; these metrics can be expressed as a sum of losses over individuals. This type of metric assesses the quality of predictions without considering, for each individual, the features or decisions assigned to other individuals. 
Note that worst-case shifts with respect to the decision-blind metrics can be seen as implementing the standard $f$-divergence approach popularized by \citet{namkoong2016stochastic} \citep{subbaswamy2021evaluating, li2021evaluating}. We first present these results for each allocation task, and then provide a more qualitative interpretation of how worst-case distributions differ by loss function.

\begin{figure*}[!ht]
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/CE_employment_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci/binary_CE_employment_bounds.pdf}
  \caption{}
  \label{fig:sfig1-1}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/CE_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci/binary_CE_income_bounds.pdf}
  \caption{}
  \label{fig:sfig1-2}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci/regression_CE_income_bounds.pdf}
  \caption{}
  \label{fig:sfig1-3}
\end{subfigure}

\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci/binary_SPO_employment_bounds.pdf}
  \caption{}
  \label{fig:sfig1-5}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_employment_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci/binary_SPO_income_bounds.pdf}
  \caption{}
  \label{fig:sfig1-6}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci/regression_SPO_income_bounds.pdf}
  \caption{}
  \label{fig:sfig1-7}
\end{subfigure}
\caption{Diagonal-normalized aggregated heat maps over states for models trained with CE loss (in the regression case, MSE loss) (top row) and SPO loss (bottom row). From left to right in each row, results are displayed by task for (a,d) employment classification, (b,e) income classification, and (c,f) income regression. Within each heat map, rows denote the metric the worst-case distribution maximizes, and columns denote the metrics the worst-case distribution was evaluated on. Note that each column is divided by the diagonal entry in that column, resulting in a main diagonal of all 1.0. Furthermore, because CE loss is always negative, each entry in columns corresponding to CE loss is equal to the diagonal entry in that column divided by the original loss in that cell. The strong main diagonals here accentuate our observation that worst-case distributions w.r.t. a given metric tend to `overfit' on that metric. Note that cross-entropy and accuracy are used only in binary classification tasks, and mean-squared error and utility-based loss are used only in the income regression task.}
\label{fig:fig1}

\end{figure*}

% \begin{figure}[!ht]
% \begin{subfigure}{.4\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/pdfs/SPO_COVID_agg_num.pdf}
%   \caption{}
%   \label{fig:sfig2-1}
% \end{subfigure}%
% \begin{subfigure}{.4\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/pdfs/CE_COVID_agg_num.pdf}
%   \caption{}
%   \label{fig:sfig2-2}
% \end{subfigure}

% \caption{Aggregated results over months, for (a) SPO  and (b) CE models trained on the first month of COVID-19 data. Note that worst-case distributions w.r.t. 1-accuracy obtain higher loss on 1-accuracy than worst-case distributions w.r.t. top-k. Also, worst-case distributions w.r.t. top-k obtain higher loss on top-k than worst-case distributions w.r.t. 1-accuracy.}
% \label{fig:fig2}

% \end{figure}

% \textbf{We find that the choice of metric to find a worst-case distribution w.r.t matters -- worst-case distributions tend to overfit greatly to the metrics they optimize.} This conclusion is apparent in
In Figure \ref{fig:fig1}, we observe the performance of worst-case distributions w.r.t. each metric (both decision-blind and decision-based) on all other metrics. We find that worst-case distributions w.r.t. a given metric tend to \textit{overfit} on that metric. In other words, for a given dataset and prediction task, the worst-case distribution w.r.t. a metric (e.g. CE) tends to perform comparatively worse on other metrics (e.g. top-k) than the worst-case distribution w.r.t. that other metric. For example, the expected value of top-k induced by sampling individuals from the worst-case distribution w.r.t. top-k will be higher than the expected value of top-k induced by sampling individuals from the worst-case distribution w.r.t. CE. In particular, this can be observed from the main diagonal entries throughout all subfigures in Figure \ref{fig:fig1}: no worst-case distribution achieved a metric that was (noticeably) higher than the worst-case distribution trained w.r.t. that metric. We find that this trend is generally consistent across all considered metrics, models, and datasets/prediction tasks.

These observations lend great importance to considerations of the downstream allocation tasks predictive models may face when deployed in the real world. For instance, when a practitioner trains a robust model w.r.t. one metric (decision-based \textit{or} decision-blind) and the downstream allocation problem is then changed, even subtly (e.g., changing the cost per individual from constant to varying based on individual features), the model could, in many settings, break under shifts that were not initially thought to be worst-case.

We find that the use of decision-blind metrics (MSE, CE, accuracy) to inform judgement on worst-case outcomes for decision-based tasks, in general, risks underestimating true worst-case outcomes, which we can justify with thorough examination of the individuals in our optimization instances. Through these results (see Figures \ref{fig:fig1}, \ref{fig:fig3}, \ref{fig:fig4}), we find that existing methods of identifying worst-case distributions (e.g. DRO), which generally focus on \textit{decision-blind} metrics and optimize worst-alpha-tail performance over individuals, fail to accurately depict worst-case outcomes in the predictive resource allocation setting, wherein model predictions are passed into higher-level optimization problems for decision-making. This implies that our method identifies worst-case distributions for allocative tasks \textit{substantially better than existing DRO-like methods}, as our method more precisely considers the structure of downstream (allocation) problems in our solutions. Below we discuss potential mechanisms for this behavior by analyzing which types of individuals are more likely to be sampled in worst-case distributions w.r.t. a given metric, and how these individuals change given a worst-case distribution w.r.t. a different metric.

% This implies that using DRO-like methods for developing robust models for downstream allocation tasks is \textit{ineffective} -- these methods will not effectively estimate worst-case outcomes for these allocation tasks. 

\begin{figure}[!t]

\begin{subfigure}{.5\linewidth}
    \centering
    \includegraphics[width=\linewidth]{fig3/NM_CE_income_NH_ce.pdf}
    \caption{}
    \label{fig:sfig3-1}
\end{subfigure}%
\begin{subfigure}{.5\linewidth}
    \centering
    \includegraphics[width=\linewidth]{fig3/NM_CE_income_NH_fair.pdf}
    \caption{}
    \label{fig:sfig3-2}
\end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_acc.pdf}
%   \caption{}
%   \label{fig:sfig3-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_ce.pdf}
%   \caption{}
%   \label{fig:sfig3-2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_fair.pdf}
%   \caption{}
%   \label{fig:sfig3-3}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_skim.pdf}
%   \caption{}
%   \label{fig:sfig3-4}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_knapsack.pdf}
%   \caption{}
%   \label{fig:sfig3-5}
% \end{subfigure}

\caption{Plots of individuals in an optimization instance in the employment prediction task, from the perspective of worst-case distributions w.r.t. CE (a) and the fairness-based metric (b). The underlying predictive model was trained with CE loss. For each worst-case distribution w.r.t. the metric of interest, we display over all individuals their model predictions, assigned weights, and education level, split by true label. Note that the color-bar denotes the weight in the worst-case distribution and that differently-shaped points represent individuals of different races (circle for white, square for non-white).}

\label{fig:fig3}

\end{figure}

\begin{figure}[!t]

\begin{subfigure}{.5\linewidth}
    \centering
    \includegraphics[width=\linewidth]{fig4/WY_CE_income_CA_mse.pdf}
    \caption{}
    \label{fig:sfig4-1}
\end{subfigure}%
\begin{subfigure}{.5\linewidth}
    \centering
    \includegraphics[width=\linewidth]{fig4/WY_CE_income_CA_util.pdf}
    \caption{}
    \label{fig:sfig4-2}
\end{subfigure}%

% \begin{subfigure}{.5\linewidth}
%   \centering
%   % \includegraphics[width=\linewidth]{images/pdfs/va_spo_employment_final.pdf}
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_mse.pdf}
%   \caption{}
%   \label{fig:sfig4-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_fair.pdf}
%   \caption{}
%   \label{fig:sfig4-2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_util.pdf}
%   \caption{}
%   \label{fig:sfig4-3}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_top-k.pdf}
%   \caption{}
%   \label{fig:sfig4-4}
% \end{subfigure}

% \begin{subfigure}{0.5\linewidth}
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_knapsack.pdf}
%   \caption{}
%   \label{fig:sfig4-5}
% \end{subfigure}

\caption{Plots of individuals in an optimization instance in the income regression task, from the perspective of worst-case distributions w.r.t. MSE (a) and the utility-based metric (b). The underlying predictive model was trained with mean-squared error loss. For each worst-case distribution, we display over all individuals their model predictions, label income, and assigned weights. In each figure the identity line ($\text{True Income} = \text{Model Prediction}$) is marked with a dotted line.}

\label{fig:fig4}
\end{figure}

% \begin{figure}[!ht]
% \begin{subfigure}{.4\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{_final_results_ci/SPO_COVID_agg_num.pdf}
%   \caption{}
%   \label{fig:sfig5-1}
% \end{subfigure}%
% \begin{subfigure}{.4\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{_final_results_ci/CE_COVID_agg_num.pdf}
%   \caption{}
%   \label{fig:sfig5-2}
% \end{subfigure}

% \caption{Diagonal-normalized aggregated results over months, for (a) SPO  and (b) CE models trained on the first month of COVID-19 data. Observe similar strong main diagonals as we saw in Figure \ref{fig:fig1}}.

% \label{fig:fig5}

% \end{figure}

% \begin{figure}[!t]
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{fig6/ce_acc.png}
%   \caption{}
%   \label{fig:sfig6-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{fig6/ce_ce.png}
%   \caption{}
%   \label{fig:sfig6-2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{fig6/ce_fair.png}
%   \caption{}
%   \label{fig:sfig6-3}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{fig6/ce_skim.png}
%   \caption{}
%   \label{fig:sfig6-4}
% \end{subfigure}

% \caption{(a-d) Joint distributions over individuals in an optimization instance in the COVID mortality prediction task, from the perspective of worst-case distributions trained w.r.t. binary prediction loss functions. The underlying predictive model was trained using CE loss. For each worst-case distribution w.r.t. the metric of interest (in order of (a) accuracy, (b) CE, (c) fair, and (d) top-k, we display the joint distribution of model predictions and assigned weights, split by true label.}

% \label{fig:fig6}

% \end{figure}


% \textbf{We empirically find that the choice of metric for worst-case analysis has a substantial impact on the conclusions developers can make regarding model performance on other metrics.}

% \textbf{We empirically find that worst-case distributions w.r.t. decision-blind metrics differ substantially from worst-case distributions w.r.t. decision-based metrics.} First, we empirically analyze all matrix plots and find a general trend of strong on-diagonal entries and relatively weak off-diagonal entries, indicating that the converged distribution w.r.t. 1-accuracy performs very poorly (achieved low loss values) on both decision-based metrics (i.e., knapsack, top-k) (see Figures \ref{fig:fig1}, \ref{fig:fig2}, \ref{fig:sfig3-1}, \ref{fig:sfig4-1}). Our aggregate results over training methods and prediction task are shown in Figure \ref{fig:fig1} for census data, and in Figure \ref{fig:fig2} for the COVID-19 dataset. We supplant these aggregate results with Figures \ref{fig:sfig3-1} and \ref{fig:sfig4-1} for two example plots of worst-case distribution performances over metrics on individual models. We clearly observe that \textit{different decision-based metrics result in highly different worst-case distributions}. In short, we empirically find that for each worst-case distribution, the distribution performed exceedingly well w.r.t the metric it was trained on.

% Our aggregate results over training methods and prediction task are shown in Figure \ref{fig:fig1} for census data, and in Figure \ref{fig:fig2} for the COVID dataset. In both cases, we generally observe strong on-diagonal entries and relatively weak off-diagonal entries, indicating that the converged distribution w.r.t. 1-accuracy performs very poorly (achieved low loss values) on both decision-based metrics (i.e., knapsack loss and top-k approach) (see Figures \ref{fig:fig1}, \ref{fig:fig2}).

 % Figure \ref{fig:sfig1-3} further visualizes this, where we clearly see a similarly strong main diagonal with weak off-diagonal entries on the aggregation of models of this type (SPO-trained employment prediction).

% We observe similar empirical trends in the remaining matrix plots (see Figure \ref{fig:sfig3-1} and Figure \ref{fig:sfig4-1}). In Figure \ref{fig:sfig3-1}, we observe that while the worst-case distribution w.r.t. top-k realized high losses on knapsack evaluation, the worst-case distribution w.r.t. knapsack observed significantly lower losses on top-k evaluation. In Figure \ref{fig:sfig4-1}, we similarly see that for each worst-case distribution, the model performed exceedingly well on itself (i.e., the metric it was trained on) and only itself. 

% We clearly observe from the differences in performance of the three distributions (each defined w.r.t. its corresponding metric) in Figure \ref{fig:sfig4-1} that \textit{different decision-based metrics result in highly different worst-case distributions}. These qualitative differences are clearly reflected in Figure \ref{fig:sfig4-1}: we see for each worst-case distribution, the model performed exceedingly well on itself (i.e., the metric it was trained on) and only itself.

%It seemed that overall our method was effectively maximizing the expected value of the loss function given the available data.
% , as over all aggregate worst-case distributions of a given loss function we are able to converge to fairly high values of expected loss.


% The converse is also true: the converged distributions on knapsack and top-k loss both seem to result in fairly low accuracy-based losses.
% This is a trend that we attempt to visualize in Figure \ref{fig:fig3} --- in fact, one major strength of our methodology is our ability to easily visualize and run interpretive post-hoc analysis on our experimental results by visualizing the converged weights of individuals of varying feature values and who received varying outputs from the predictive model.

% An advantage of our proposed method is its interpretability and ability to easily run interpretable post-hoc analysis 
 
For further analysis, we examine differences between the worst-case distributions w.r.t. a selection of metrics by visualizing the converged weights of individuals of varying feature values and varying model predictions (see Figures~\ref{fig:fig3},~\ref{fig:fig4}). We find that, given the same individuals within the same optimization instance, worst-case distributions w.r.t. different metrics differ in the individuals they tend to upweight. We clearly observe that in a worst-case distribution w.r.t. CE (Figure~\ref{fig:sfig3-1}), individuals are weighted in direct proportion to their distance from the decision boundary (e.g., false positives with high model predictions receive high weights, as do false negatives with low model predictions). We can use similar logic when identifying which individuals are upweighted in the worst-case distribution w.r.t. MSE (Figure~\ref{fig:sfig4-1}); individuals are gradually upweighted the higher their residual is, or in other words, the farther their prediction strays from their ground-truth label (noted by the dotted identity lines in Figure~\ref{fig:fig4}).

In contrast, worst-case distributions w.r.t. decision-based metrics upweight systematically different sets of individuals. We observe in Figure~\ref{fig:sfig3-2} that two particular types of individuals are highly upweighted: a non-white, positive (i.e., unemployed) individual with a high model prediction, along with a white, negative (i.e., employed) individual with a high model prediction. When these individuals are considered in conjunction within the fairness metric, we tend to correctly treat many positive non-white individuals, incorrectly treat many negative white individuals, and incorrectly fail to treat some positive white individuals. Notice that while non-white individuals tend to be correctly treated here, white individuals are consistently assigned the incorrect treatment, thus achieving high disparity in the quality of treatment between non-white and white individuals and a high fairness-based loss metric. Turning our focus to the utility-based loss metric with income regression in Figure~\ref{fig:sfig4-2}, the worst-case distribution seems to upweight only relatively high-residual individuals, as expected from the definition of the utility-based loss metric. More concretely, since our Nash social welfare function scales with the log of income, there exists a diminishing-returns effect of allocating money to high-income individuals. Therefore, we can maximize the impact of an ideal treatment on individuals, and thus achieve higher levels of relative regret, if our sample consists of many low-income individuals, but we instead choose to mistakenly treat higher-income individuals whose predicted income is particularly low. Furthermore, individuals with relatively high values for both ground-truth labels and predictions do not receive high weights despite having large residuals, since they still have higher predicted incomes than the upweighted individuals and are unlikely to be treated in this allocation setting.

% indiscriminately on incorrectly classified samples (i.e., for accuracy-based metrics, decision boundaries are defined by a threshold), while

% Using two models trained on Alaska's train set and evaluated on Alaska's test set as an example we visualize differences between the worst-case distributions for all three selected loss functions. Figure 3b, a worst-case accuracy distribution, features weights placed indiscriminately on incorrectly classified samples, where in contrast the decision-based worst-case distributions in Figure 3c and 3d place weights on the more specific subset of false positives with high model predictions.

% This limitation of accuracy-based metrics in binary prediction settings can be attributed to the fact that accuracy-based metrics use hard thresholds (e.g., $0.5$) as decision boundaries. This means that we can divide our data points into four categories, which we term output classes: true/false positives, and true/false negatives. A worst-case distribution w.r.t. 1-accuracy then sets all weights to be equal within each of these four classes (see Figure \ref{fig:sfig7-1}). In other words, individuals within the same class (e.g., false negative) but with greatly different predicted probabilities (e.g., 0.55 vs. 0.99) are both equally likely to be sampled. In contrast, the worst-case distributions of our two decision tasks  upweight only the highest-probability false negatives. 

% Due to this inherent limitation of accuracy-based metrics, we notice that our worst-case distribution w.r.t. 1-accuracy has a fairly high chance of a) sampling a true positive individual with a high predicted probability by the model and b) sampling a false negative individual with a relatively low prediction (e.g., 0.55). In the event that too many of these high-probability true positives are sampled using the worst-case distribution w.r.t. 1-accuracy, under the top-k approach, our optimizer will allocate resources to true positive individuals, in which case we achieve lower loss values. In all, given the visualizations of these joint distributions over individuals, given some worst-case distribution, we are able to provide \textit{interpretable} and empirical explanations of the aggregate behavior we see reflected in high-level results such as those in Figure \ref{fig:fig1}.

% Figure \ref{fig:sfig4-3} also suggests that for the regression setting, individuals with large residuals, large labels, and also large predictions (eg prediction = \$70k, label = \$200k) do not receive high weights despite having large residuals, since they are unlikely to be selected/be one of the lowest-predicted individuals regardless. 

% The worst-case distributions w.r.t. knapsack (\ref{fig:sfig3-5}, \ref{fig:sfig4-5}) appear qualitatively similar to the worst-case distributions w.r.t. top-k, although based on Figure \ref{fig:sfig3-5} this distribution seems to place a higher degree of weight on low-education negatives (e.g. we want to prioritize negatives who are likely to be selected, by both taking into account high-prediction negatives and cheap ones).
% This results in the worst-case accuracy distribution's poor performance on decision-based metrics in which the relative predictions of individuals in the decision problem are compared. 

% In short, the key insight is that worst-case distributions w.r.t. 1-accuracy create discrete segmentations of the data based on absolute decision boundaries and do not take into account the relative prediction of individuals in a fixed sample, whereas worst-case decision task distributions weight individuals based on less piecewise functions of their label and model prediction. 

% \begin{figure}[!t]
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{fig7/spo_acc.png}
%   \caption{}
%   \label{fig:sfig7-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{fig7/spo_skim.png}
%   \caption{}
%   \label{fig:sfig7-2}

% % \begin{subfigure}{.5\linewidth}
% %   \centering
% %   \includegraphics[width=\linewidth]{fig7/spo_ce.png}
% %   \caption{}
% %   \label{fig:sfig7-2}
% % \end{subfigure}

% % \begin{subfigure}{.5\linewidth}
% %   \centering
% %   \includegraphics[width=\linewidth]{fig7/spo_fair.png}
% %   \caption{}
% %   \label{fig:sfig7-3}
% % \end{subfigure}%
% \end{subfigure}

% \caption{(a-d) Joint distributions over individuals in an optimization instance in the COVID mortality prediction task, from the perspective of worst-case distributions trained w.r.t. 1-accuracy (a) and top-k (b). The underlying predictive model was trained using SPO loss. For each worst-case distribution, we display the joint distribution of model predictions and assigned weights, colored by true label.}

% \label{fig:fig7}

% \end{figure}

% Continuing this logic, from figures \ref{fig:sfig3-1}-\ref{fig:sfig3-5}, the qualitative differences between the worst-case distributions trained w.r.t. each of the binary loss functions become easily and visually apparent. An example worst-case distribution w.r.t. CE (see Figure \ref{fig:sfig3-2}) reveals that cross-entropy or residual-based loss functions allow for a smoother gradient from correct to incorrect predictions, wherein positive individuals receive gradually decreasing converged weights as their predictions become larger, while negative individuals receive gradually increasing predictions as predictions increase (or become "more" wrong). The worst-case distribution w.r.t. MSE (Figure \ref{fig:sfig4-1}) is very similar to this; individuals are gradually upweighted the higher their residual is, e.g. the farther their prediction strays from their label income.

% From the above analysis, we conclude that worst-case distributions w.r.t. the decision-based metrics upweight significantly different groups of individuals in a given optimization instances from the worst-case distributions w.r.t. decision-blind metrics. Indeed, our results suggest that, in general, virtually \textit{all} significantly different loss functions yield structurally different worst-case optimization instances that result in different converged worst-case distributions. These apparent differences between worst-case distributions w.r.t. decision-based functions line up with Figure \ref{fig:fig1}, where we see that worst-case distributions w.r.t. decision-based functions tend to perform poorly on other decision-based functions here.


% The example worst-case distribution w.r.t. top-k in Figures \ref{fig:sfig3-4} and \ref{fig:sfig4-4} appears to prioritize individuals with the highest residuals without the smooth gradient in the worst-case distribution w.r.t. CE or MSE. Figure \ref{fig:sfig4-3} also suggests that for the regression setting, individuals with large residuals, large labels, and also large predictions (eg prediction = \$70k, label = \$200k) do not receive high weights despite having large residuals, since they are unlikely to be selected/be one of the lowest-predicted individuals regardless. The worst-case distributions w.r.t. knapsack (\ref{fig:sfig3-5}, \ref{fig:sfig4-5}) appear qualitatively similar to the worst-case distributions w.r.t. top-k, although based on Figure \ref{fig:sfig3-5} this distribution seems to place a higher degree of weight on low-education negatives (e.g. we want to prioritize negatives who are likely to be selected, by both taking into account high-prediction negatives and cheap ones).

% The worst-case distribution w.r.t. fair seems to take race into account more significantly  (e.g. \ref{fig:sfig3-3} and \ref{fig:sfig4-3} have much more weight on non-white, or non-circular dots). Finally, the worst-case distribution w.r.t. util tends to upweight high-residual individuals, subject to these individuals having lower-magnitude labels (we want to make sure that we achieve high regret, and since util scales with log-money this can only be done if the optimal treatment gives lots of money to low-income individuals).

% We further visualize this more concretely in Figure \ref{fig:fig6} and Figure \ref{fig:fig7}, taking our real-world COVID-19 case study as an example. The clearest difference between Figures \ref{fig:sfig6-1} and \ref{fig:sfig6-4}, and Figures \ref{fig:sfig7-1} and \ref{fig:sfig7-4}, respectively, is the continuous relationship that converged weights in worst-case distributions w.r.t. top-k take w.r.t. model predictions versus the piecewise relationship that we see for worst-case distributions w.r.t. 1-accuracy. In short, worst-case distributions w.r.t. 1-accuracy create discrete segmentations of the data based on absolute decision boundaries and do not take into account the relative prediction of individuals in a fixed sample, whereas worst-case distributions w.r.t decision-based metrics weigh individuals based on functions of their labels and model predictions that are not necessarily discrete, piecewise functions. The visual differences in worst-case distributions w.r.t. various metrics is also apparent in the other worst-case joint distribution plots in these two figures.

% From the above analysis, we conclude that worst-case distributions w.r.t. the decision-based metrics upweight significantly different groups of individuals in a given optimization instance from the worst-case distributions w.r.t. decision-blind metrics. Indeed, our results suggest that, in general, virtually \textit{all} significantly different loss functions yield structurally different worst-case optimization instances that result in different converged worst-case distributions. These apparent differences between worst-case distributions w.r.t. decision-based functions line up with Figure \ref{fig:fig1}, where we see that worst-case distributions w.r.t. decision-based functions tend to perform poorly on other decision-based functions here.

Based on the observed differences between the worst-case distributions w.r.t. decision-blind metrics and worst-case distributions w.r.t. decision-based metrics, we conclude that practitioners who use current off-the-shelf DRO methods for building robust predictive allocation models \textit{may not actually be training models robust to the true worst-case shifts in the downstream decision task their model will be used for}. In other words, the DRO equivalent of greedily placing weight on poorly-performing individuals w.r.t. decision-blind metrics results in worst-case distributions that fundamentally differ in composition from the worst-case distributions for an optimization-level, decision-based metric. These results provide insights to practitioners deploying robust predictive models for resource allocation: allocative tasks make structurally different uses of machine learning predictions than decision-blind prediction tasks, and methods for identifying distribution shifts must reflect this structure.

% These results present a wake-up call to those interested in deploying robust predictive models for resource allocation: existing methods to develop robust predictors fail when subjected to downstream allocation tasks.


% These observations lend even \textit{greater importance to considerations of the downstream allocation tasks predictive models may face when deploying real-world systems} -- if, for instance, a practitioner trains a robust model w.r.t one decision-based metric, if the downstream allocation problem is changed even subtly (e.g. changing the cost per individual from a constant to varying based on individual features), in many settings the model could break down under shifts that were not initially thought to be worst-case.

% We further visualize this more concretely in Figure \ref{fig:fig5}, taking our real-world COVID-19 case study as an example. The clearest difference between Figures \ref{fig:sfig5-1} and \ref{fig:sfig5-2}, and Figures \ref{fig:sfig5-3} and \ref{fig:sfig5-4}, respectively, is the continuous relationship that converged weights in worst-case distributions w.r.t. top-k take w.r.t. model predictions versus the piecewise relationship that we see for worst-case distributions w.r.t. 1-accuracy. In short, worst-case distributions w.r.t. 1-accuracy create discrete segmentations of the data based on absolute decision boundaries and do not take into account the relative prediction of individuals in a fixed sample, whereas worst-case distributions w.r.t decision-based metrics weigh individuals based on functions of their labels and model predictions that are not necessarily discrete, piecewise functions.


% In other words, while in worst-case distributions w.r.t. 1-accuracy weights are assigned equally within a given output class of an individual, worst-case top-k distributions tend to give higher weights to high-probability true negative individuals, as well as low-probability true positive individuals, even within the same output class. 
% This figure serves as yet another piece of evidence in support of the crucial distinction between accuracy-based and decision-based tasked in resource allocation.

% Meanwhile, worst-case decision task distributions also seem to perform poorly on accuracy-based tasks. We note that in decision tasks such as our top-k and knapsack loss functions a high proportion of weight must be placed specifically on those false negative individuals with very high predictions. However, if we know that we are likely to sample a reasonable number of these individuals due to the large probability placed on them, it no longer matters what other kinds of individuals round out our sample --- we know that we will treat the false negative predictions and achieve high loss regardless of the other individuals in the sample. As a result, some correctly predicted individuals, such as true negatives or true positives closer to the decision boundary, can still be included in the decision task without affecting its outcome, and so a worst-case distribution can place weight on these 'filler' individuals without significant consquences. To quantify this observation, we note that the proportion of weight put on correctly classified negatives in the worst-case knapsack distribution was 0.326, and 0.276 for the worst-case top-k distribution. This was several times greater than the same proportion for worst-case accuracy, 0.087. So, we assert that clear structural differences between these accuracy-based and decision-based tasks are reflected by their poor performance on the opposing type of task. These trends are clearly reflected in Figure \ref{fig:sfig3-1}: the worst-case shifts with respect to accuracy differ substantially from the two worst-case decision loss shifts. In all we conclude that the DRO equivalent of greedily placing weight on poorly-performing individuals fundamentally differs in composition from the worst-case distribution for an optimization-level decision task.


% This structure unique to the worst-case distributions w.r.t. 1-accuracy leads to two problems, when used to draw samples as input to our decision tasks. First, due to our constraints on the optimization variables, it may not be possible to place 100\% of weights solely on misclassified samples --- in fact, in this example, 11.4\% of weight is placed on true positive individuals with a predicted probability or reward greater than the decision boundary of 0.5; when we increase our cutoff to 0.9 this fraction decreases to 7.4\% but is still non-trivial. 
% Second, since accuracy-based tasks use thresholds as decision boundaries, 


% While this problem is certainly not unique to worst-case accuracy, it comes in conjunction with the second problem, which is the fact that accuracy-based tasks utilize hard cutoffs as decision boundaries. These boundaries (which we set to 0.5 in this project) mean that we can categorize our data into four categories, which we may term output classes (true/false positives, and true/false negatives). A worst-case accuracy distribution will then set all weights to be roughly equal within each of these four classes, which is something we observe in Figure 3b. However, this also means that a false negative indivdual with predicted probability 0.55 is just as likely to be sampled as another false negative individual with prediction 0.99. In contrast, the worst-case distributions of our two decision tasks learn to place weight only on the highest-probability false negative. When we couple these two observations, we notice that our worst-case accuracy distribution has a fairly high chance of a) sampling a true positive individual with a high predicted probability by the model, and then b) sampling a false negative individual with a relatively low prediction, say 0.55. In the event that too many of these high-probability true positives are sampled using the worst-case accuracy distribution, under the top-k problem our optimizer will allocate treatment to true positive individuals, in which case we achieve lower loss values. These kinds of fairly common edge cases result in the worst-case accuracy distribution's extremely poor performance on decision tasks in which the \textit{relative predictions} of individuals in the decision problem are compared. 



% \textbf{Worst-case distributions can vary in different settings, and at times even different decision tasks can have qualitatively different worst-case distributions}. We also observe that the model's training function has non-trivial implications on aggregate results. From Figure 2a and 2b we see that in cross-entropy trained models worst-case skim distributions appear to 'dominate' worst-case knapsack distributions. Here the worst-case distribution for skim loss seems to match or outperform the worst-case distribution for knapsack loss when evaluated on knapsack loss, a potential reflection of inefficiencies in the Frank-Wolfe optimizer when exposed to too few data. Furthermore the worst-case distribution for skim loss consistently outperforms the worst-case distribution for knapsack loss when evaluated on skim loss by 5 and 8 percent on average, on the employment and income prediction tasks respectively. 

% Meanwhile a different pattern emerged for the SPO-trained employment prediction models: namely, we see relatively weak off-diagonal entries overall, meaning each worst-case distribution appears to perform comparatively poorly on the two other decision tasks. While in both cross-entropy and SPO loss settings we see a divergence between worst-case distributions w.r.t. 1-accuracy and worst-case decision task distributions, in the SPO employment case it is clear that each distribution to some degree is effectively specializing in the loss function used to train the distribution.

% \textbf{Optimization w.r.t. different decision-based metrics can lead to significantly different worst-case distributions.}
% \textbf{Identifying worst-case distributions w.r.t. one decision-based metric may or may not result in the worst-case distribution w.r.t. another decision-based metrics}.  
% We clearly observe from the differences in performance of the three distributions (each defined w.r.t. its corresponding metric) in Figure \ref{fig:sfig4-1} that \textit{different decision-based metrics result in highly different worst-case distributions}. These qualitative differences are clearly reflected in Figure \ref{fig:sfig4-1}: we see for each worst-case distribution, the model performed exceedingly well on itself (i.e., the metric it was trained on) and only itself. Figure \ref{fig:sfig1-3} further visualizes this, where we clearly see a similarly strong main diagonal with weak off-diagonal entries on the aggregation of models of this type (SPO-trained employment prediction).
% Further, we note that in Figure \ref{fig:sfig3-1}, while the worst-case distribution w.r.t. top-k realized high losses on knapsack evaluation, the worst-case distribution w.r.t. knapsack observed significantly lower losses on top-k evaluation. 
% THIS WAS IN ORIGINAL In Figure \ref{fig:sfig3-1}, while the worst-case distribution w.r.t. top-k realized high losses on knapsack evaluation, the worst-case distribution w.r.t. knapsack observed significantly lower losses on top-k evaluation. We explain this difference with the joint distributions shown in Figure \ref{fig:fig3}. In particular, the worst-case distribution w.r.t. knapsack in Figure \ref{fig:sfig3-4} placed a higher portion of weight (i.e., 0.028) on high-probability true positive individuals (i.e., predicted probabilities > 0.9) than the worst-case distribution w.r.t. top-k (i.e., 0.016) (see Figure \ref{fig:sfig3-3}).
% Furthermore, in Figure \ref{fig:sfig3-4}, we observe higher converged probabilities (i.e., brighter colors) in the band of positive individuals with predictions between 0.8 and 0.9 than in Figure \ref{fig:sfig3-3}.
% Note that this trend is subtle in the visualization, due to the wide range of probabilities assigned to positives.
% , there exists a wide range of colors plotted in Figure \ref{fig:sfig3-3} and Figure \ref{fig:sfig3-4} making this trend seem more subtle than it is. 
% THIS WAS IN ORIGINAL Hence, treating these more commonly sampled positive individuals with high predicted probabilities in the worst-case distribution w.r.t. knapsack results in lower losses on top-k. Thus, \textit{the two decision-based metrics had structurally different worst-case distributions that resulted in varying performance on each others' loss functions.} 


% In fact, while in the worst-case  distribution w.r.t. knapsack a combined probability of 0.028 was placed on positive individuals with a predicted probability of over 0.9, this value was only 0.016 for the worst-case distribution w.r.t. top-k. 
% The fact that in many settings a worst-case distribution w.r.t. top-k was equivalent to a worst-case  distribution w.r.t. knapsack, but not the other way around, motivates increased scrutiny into the structure of individual decision problems in the process of model selection.

% The ability to easily understand what kinds of individuals tend to be sampled more frequently by our worst-case distributions in this tabular setting allows for a building of intuition behind an otherwise nebulous iterative optimization process.

% THIS WAS IN ORIGINAL We observe similar behavior in Figure \ref{fig:fig4} for the SPO-trained employment prediction models, where we observe significant qualitative differences between all three worst-case distributions: \textit{no worst-case distribution was able to achieve high average loss on a metric it was not trained on}. Note that (see Figures \ref{fig:sfig4-2}-\ref{fig:sfig4-4}) for some SPO-trained employment prediction models, the prediction model learned to predict very high probabilities on all individuals, since knapsack does not use a fixed decision threshold and shifts its predicted probabilities towards extreme values that minimize SPO loss.
% In all, we find that, because there are more \textit{cheap} (low-educated) negative individuals with fairly high predicted probabilities, the worst-case distribution w.r.t. knapsack places high weights on these individuals, who are not normally upweighted given the predictions of other types of models. 
% Because these individuals still have slightly lower predicted probabilities than some more expensive negative individuals, the worst-case distribution w.r.t. knapsack performs worse on top-k as it tends to sample negative individuals who do not have the highest predicted probabilities. 

% THIS WAS IN ORIGINAL Overall, we find that the worst-case distribution w.r.t. the knapsack problem favors less-costly (i.e. low-educated) negative individuals, due to their relatively high predicted probabilities compared to other types of models. This upweighting results in poorer performance on top-k, as it samples negative individuals with lower predicted probabilities compared to more costly negative individuals. The converse is also true -- the worst-case distribution w.r.t. top-k results in poor performance on knapsack, as it places weights on many cheap, positive individuals with low probabilities (i.e., individuals that tend to get treatment in knapsack), as it does not consider the cost vectors used by knapsack. 

% We further observe that worst-case distribution w.r.t. top-k results in poor performance on knapsack evaluation, as it places weights on many cheap, positive individuals with low probabilities (i.e., individuals that tend to get treatment in knapsack), since it does not consider the cost vectors used by the knapsack metric. 

% Furthermore because the top-k metric does not consider the cost vectors used as input into knapsack it places weights on many cheap, positive individuals with low probabilities; these individuals tend to get treated in knapsack resulting in low losses.

% Note that, because SPO-trained models consider the relative predictions of individuals in a given knapsack problem during training, there is no concept of a decision threshold --- the model can shift its predictions towards any extreme that results in low values of SPO loss. 

% In Figure \ref{fig:sfig4-2} we see that, because all predictions were high (SPO-trained models need only worry about the relative predictions between individuals and so can output predictions with a much higher distributional center than .5 for binary prediction), all negative individuals were given non-zero weights by the worst-case distribution w.r.t. accuracy. As a result this distribution would only ever sample negative individuals, in which case the top-k and knapsack problems become trivial and no loss is realized. In contrast, the worst-case distributions for the two decision-based metrics assign weights to both positive and negative individuals. 

% While the worst-case distribution w.r.t. top-k (Figure \ref{fig:sfig4-3}) places a fairly consistent amount of weight on both true negative individuals with near-1 predictions and true positives with slightly lower predictions, the worst-case distribution w.r.t knapsack (Figure \ref{fig:sfig4-4}) places a higher degree of weight on \textit{cheaper}, less educated negative individuals. Furthermore, the worst-case distribution w.r.t. knapsack also assigns non-trivial weights to \textit{expensive} true positive individuals with high predicted probabilities, presumably because, in the context of the knapsack metric, more expensive individuals, regardless of their predicted values, are unlikely to be treated and therefore do not contribute to the outputted allocation. We posit that this distinction results in poor top-k metric performance, because in the top-k allocation setting individuals are treated with constant cost vectors. 

% Second, because the worst-case distribution w.r.t. top-k places weight solely on those negative individuals with the highest model predictions, these individuals, whose treatment would normally result in high losses, it does so without considering the education (or associated treatment cost) of the individual w.r.t. knapsack. These upweighted individuals may then be too expensive to treat in many instances of the knapsack problem, and therefore are not treated, resulting in lower losses on the knapsack metric. 


% THIS WAS IN ORIGINAL These observations lend even \textit{greater importance to considerations of the downstream allocation tasks predictive models may face when deploying real-world systems} -- if, for instance, a practitioner trains a robust model w.r.t one decision-based metric, if the downstream allocation problem is changed even subtly (e.g. changing the cost per individual from a constant to varying based on individual features), in many settings the model could break down under shifts that were not initially thought to be worst-case.

% In all, these qualitative differences along with differences in performance of the three distributions in Figure \ref{fig:sfig4-1} imply that \textit{even these two decision tasks resulted in highly different worst-case distributions}. These qualitative differences are clearly reflected in Figure \ref{fig:sfig4-3}: we see that for this model each worst-case distribution performed exceedingly well on itself and only itself. This was a trend that was common for many SPO-trained employment prediction models as well, with a similarly strong main diagonal with weak off-diagonal entries on the aggregation of models of this type in Figure \ref{fig:sfig1-3}. These conclusions lend even greater importance to considerations of the downstream allocation tasks predictive models may be used for when deploying real-world allocation systems --- if, for instance, a practitioner trains a robust model w.r.t one decision-based metric, if the downstream allocation problem is changed even subtly (e.g. changing the cost per individual from a constant to varying based on individual features), in many settings the model could break down under shifts that were not originally thought to be worst-case.


% \begin{figure}[!ht]
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/covid/acc_ce.png}
%   \caption{}
%   \label{fig:sfig5-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/covid/skim_ce.png}
%   \caption{}
%   \label{fig:sfig5-2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/covid/acc_spo.png}
%   \caption{}
%   \label{fig:sfig5-3}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/covid/skim_spo.png}
%   \caption{}
%   \label{fig:sfig5-4}
% \end{subfigure}
% \caption{Worst-case distributions for COVID-19 mortality task (April 2020). Plotted for (a) CE accuracy, (b) CE top-k, (c) SPO accuracy, and (d) SPO top-k. Note the continuity of the joint distributions of individuals in worst-case distributions w.r.t. top-k vs. the lack of continuity in worst-case distributions w.r.t. accuracy.}
% \label{fig:fig5}
% \end{figure}
% \subsection{Interpretability Extension}

% % There were also a number of interesting trends with regards to the coefficients and significance of the simple linear regressions that were trained on the original data of all test sets over states, and the converged values over individuals.

% \textbf{The empirical differences we see in our first analysis can be made even more interpretable as we simplify the worst-case distribution identification process}. Across all models and decision tasks, we find that educational level and true label both factor heavily into the resulting probability of sampling placed on each individual within a given optimization instance. In Figure 5 we see that across prediction settings the education and label features appear to represent significant features used during the model predictions. Below we examine the importance scores assigned to years of education (directly associated with individual cost in SPO loss) and true label. All histograms of XGBoost importance scores are featured in the appendix.

% For income prediction models trained using SPO loss, we find that both education and label have strong importance scores. Education has means of .186, .262, and .266 for the accuracy, skim, and knapsack losses respectively. True label was given even higher importance scores with .308, .363, and .369 in the same order. There also exists a shift from worst-case distributions w.r.t. 1-accuracy to the decision tasks, where the left tail gradually disappears.

% For income prediction models trained with cross-entropy loss, we find that true labels seem much more important for the decision tasks than for accuracy; the distribution of importance of education is fairly consistent across worst-case loss functions. We also found similar results for employment prediction with cross-entropy loss.

% For employment prediction models trained on SPO loss, we find that the distribution of feature importance on education is fairly similar across tasks with means of .066, .048, and .107 for the accuracy, skim, and knapsack losses respectively. Moreover, true label was a very important feature with the score of .573 on the accuracy loss being significantly higher than the scores of .343 on knapsack loss and .292 on skim loss. This pattern aligns with our observations from Figure 4d, where in that instance true label uniquely determined which individuals received non-zero probabilities.

% spo income trends:

% \begin{itemize}
%     \item both vars pretty important (centers between .2 and .5)
%     \item some shifts in left tails from acc to skim to knapsack, until the tail is gone
% \end{itemize}

% CE income trends:

% \begin{itemize}
%     \item label way more important for skim/knapsack than acc
%     \item similar edu dists
% \end{itemize}

% spo emp trends:

% \begin{itemize}
%     \item edu pretty much same for all tasks (.08ish)
%     \item label also super important (even more for acc than others, in contrast to spo inc)
%     \item edu more important for knapsack than others (reflected in empirical dists)
% \end{itemize}

% CE emp trends:

% \begin{itemize}
%     \item label more important for decision tasks than acc
%     \item edu not changing much (.05ish)
% \end{itemize}

% From these last two major takeaways, we find differing trends between SPO-trained employment prediction models and all other classes of models --- this implies that the method in which a model was trained has nontrivial implications on the properties of its worst-case distributions with respect to different loss functions.

% spo/CE trends:

% \begin{itemize}
%     \item education becomes more important for spo than for CE in knapsack
% \end{itemize}

% emp/inc trends:

% \begin{itemize}
%     \item education more important for income than emp
%     \item label similarly important
% \end{itemize}

% \begin{itemize}
%     \item talk about how different magnitudes/signs of coefficients for different variables changed between the 4 types of tasks and the 3 different loss functions
%     \item for acc, age seems to matter a ton, along with education and label
%     \item for knapsack, label starts to matter way more, and possibly education as well
%     \item when you change to spo employment, these 3 variables start to become super super important for all tasks
%     \item for income CE, label becomes way more important when you move from acc to skim/knapsack; distributions for age/education get more skewed right
%     \item for income spo: 1) age not too diff (more skewed right for 2 decision tasks), 2) education becomes more important in decision, particularly for knapsack, 3) label super important in general (distribution shifts from flatter for acc (some have importance 1) to more centered at like .3 or .4)
%     \item images to include: some of the xgb/lr histograms for the different types of tasks, particularly for education and label
%     \item try to connect the histograms to trends you notice in the main results
% \end{itemize}

% \begin{figure}
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/edu_spo_emp.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/label_spo_emp.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/edu_spo_inc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{./images/label_spo_inc.png}
%   \caption{}
%   \label{fig:sfig3}
% \end{subfigure}
% \caption{Marginal distributions of XGBoost regressor feature importance values for SPO-trained models in the knapsack loss setting. Displayed for models in the employment prediction setting, for education (a), label (b). Also shown for income prediction setting for education (c) and label (d).}
% \label{fig:fig3}

% \end{figure}

\begin{figure}[!ht]
  \includegraphics[width=\linewidth]{paper_visualizations/final_vis2.pdf}

\caption{Aggregate results of efficiency experiment. The converged expected values of loss are plotted over all metrics and all datasets. Here $n_j = 8$, and we calculate reference worst-case expected values of each metric using Pyomo with the IPOPT solver. All values are normalized w.r.t. the value of the Pyomo/IPOPT solution. The bolded flat line at $y=1$ represents the Pyomo/IPOPT solutions. The colored lines each represent, for a given predictive model training method, prediction task, and metric, how closely our method asymptotically reaches the solution quality of the Pyomo/IPOPT solutions. We find that the majority of our curves converged asymptotically to over 80\% of the Pyomo/IPOPT solution.}
\label{fig:fig2}

\end{figure}

% \textbf{We find that our method effectively identifies worst-case distributions over individuals w.r.t. both decision-blind and decision-focused metrics, with a high degree of efficiency.}

% We finally present empirical evidence of the efficiency of our algorithm in identifying distributions over optimization instances and individuals. In Figure \ref{fig:fig2} we compare our algorithm for solving Equation \ref{eq:eq2} to non-convex solvers on small problems instances, where these polynomial solvers are able to find strong local approximations of the true optimization objective. We find that we consistently approach gold label values found by Pyomo and SOTA optimizer IPOPT when used to solve Equation \ref{eq:eq2} \citep{hart2017pyomo, biegler2009large}. Note that, by enumerating over all possible sets of $n_j$ individuals and calculating the value of loss for each set, \ref{eq:eq2} becomes a closed-form non-convex problem that can be solved reasonably quickly by engines such as IPOPT for small $n_j$. Here we set $n_j = 8$ due to limitations with Pyomo's ability to optimize for high-degree nonconvex functions. By plotting our estimated solutions (as a proportion of the solutions achieved by Pyomo/IPOPT) to \ref{eq:eq2} over number of samples per iteration of the Frank-Wolfe algorithm in Figure \ref{fig:fig2}, and aggregating by metric over models trained with CE/SPO loss functions on the employment or income prediction/regression tasks, we find that, after 3,000 samples per iteration, 26 of the 30 plotted curves exceeded 80\% of the solutions reached by Pyomo/IPOPT, and all curves reached over 60\% of this value. Furthermore all curves clearly increased monotonically towards the gold labels as the number of samples increased. As such we have evidence that we may empirically enjoy stronger results than the $\frac{1}{e}$ convergence guarantees put forth for our Frank-Wolfe algorithm \citep{Bian}.

Finally, we study the efficiency of our algorithm in identifying distributions over optimization instances and individuals via empirical comparisons with existing polynomial solvers. In Figure~\ref{fig:fig2}, we evaluate our algorithm for solving Equation~\ref{eq:eq2} as the number of optimization problems sampled during each iteration of the Frank-Wolfe algorithm increases. 
% on 
% optimization instances 
% with a relatively small number of individuals, where polynomial optimization solvers are able to produce high-quality solutions. 
%TODO: Move to appendix:
% Note that the degree of the polynomial scales with $n_j$. meaning that traditional polynomial solvers such as Pyomo/IPOPT become computationally-consuming at large scales.
%%%%%%%%%%%%%%%%%
Note that, by enumerating over all possible sets of $n_j$ individuals and calculating the value of loss for each set, Equation \ref{eq:eq2} becomes a closed-form polynomial optimization problem. We find that we \emph{consistently} approach values found by the polynomial optimization solver, Pyomo with optimizer IPOPT, when used to solve Equation \ref{eq:eq2} \citep{hart2017pyomo, biegler2009large}. In Figure \ref{fig:fig2}, we plot our estimated solutions to Equation \ref{eq:eq2} as a proportion of the solutions achieved by Pyomo/IPOPT against the number of samples per iteration of the Frank-Wolfe algorithm. We then aggregate these results by metric across models trained with CE, MSE, or SPO loss functions. This analysis is conducted on all tasks: employment prediction, income prediction, and income regression. We find that 87\% of the plotted curves exceeded 80\% of the solutions reached by Pyomo/IPOPT, and all curves reached over 60\% of this value.
% By plotting our estimated solutions (as a proportion of the solutions achieved by Pyomo/IPOPT) to Equation \ref{eq:eq2} over number of samples per iteration of the Frank-Wolfe algorithm in Figure \ref{fig:fig2}, and aggregating by metric, over models trained with CE, MSE, or SPO loss functions, and on all tasks (employment prediction, income prediction, income regression), we find that
% 87\% of the plotted curves exceeded 80\% of the solutions reached by Pyomo/IPOPT, and all curves reached over 60\% of this value. 
Furthermore, all curves increased monotonically (i.e., moved closer to the solution reached by Pyomo/IPOPT) as the number of samples increased, indicating that as our method gains access to more instances of optimization problems, it more effectively maximizes estimates of worst-case loss. The theoretically guaranteed approximation ratio for nonmonotone submodular optimization with Frank-Wolfe algorithms is $\frac{1}{e} \approx 37\%$ \citep{Bian}. Therefore, these results suggest that our algorithm empirically performs much better than the worst-case theoretical guarantees. 

% This provides evidence that \textit{our method works}: we are able showcase empirically strong results that rival the abilities of well-established polynomial solvers, with the benefit of being able to address \textit{arbitrarily large} optimization instances and populations. This empirical proof-of-concept cements our method's novelty as the first of its kind to optimize over arbitrary decision tasks composed of arbitrary sets of individuals.
\section{Discussion}
In this paper, we develop an algorithmic approach to find worst-case distribution shifts over a constrained set for predictive resource allocation problems. Our formulation reflects the structure of predict-then-optimize settings, allowing us to account for distribution shift within and between optimization instances through a two-level generative model, which DRO-style methods do not account for. We show that the optimization of this model can be formulated as a submodular optimization problem and solved with a Frank-Wolfe algorithm. We empirically demonstrate that worst-case distributions with respect to decision-blind and decision-based metrics exhibit substantial divergences, and that worst-case distributions with respect to decision-blind metrics may be very far from the worst case for decision tasks. 
% Further, we provide empirical evidence for the mechanisms driving these worst-case resource allocation decisions via visualization of which patients tend to be disproportionately represented or underrepresented in worst-case distributions, with respect to different metrics (e.g., fairness-based loss). 
Finally, we find that our methods efficiently approach solutions found by existing polynomial solvers much more closely than might be suggested by worst-case theoretical guarantees.
% Finally, we demonstrate the efficiency of our method via an empirical comparison to existing polynomial solvers. 
In all, our results highlight that evaluation of the robustness of ML models in high-stakes resource allocation settings must account for the nature of the downstream decision problem in order to capture the true consequences of potential distribution shift.
% In this paper, we develop an algorithmic approach to finding worst-case distribution shifts over a constrained set for predictive resource allocation problems. Our formulation reflects the structure of predict-then-optimize settings, allowing us to account for distribution shift within and between optimization instances through a two-level generative model. Computationally, we show that the task can be formulated as a submodular optimization problem and solved with a Frank-Wolfe algorithm.  We also empirically demonstrate that worst-case distributions with respect to decision-blind and decision-based metrics exhibit substantial divergences, and worst-case distributions with respect to decision-blind metrics may be very far from the worst case for decision tasks. Our results provide evidence that assessments of the robustness of ML models in high-stakes resource allocation settings must account for the nature of the downstream decision problem in order to capture the true consequences of potential distribution shift.

% We investigate the implications of our framework on high-stakes assistance program allocation to low-income and unemployed individuals. We find that inequities in treatment along different distributions of features, including race, can be induced with our methods. These contributions allow developers to more easily identify potential shortcomings of predictive models early on. Further, we provide empirical evidence for the mechanisms driving these worst-case resource allocation decisions via visualization of which patients tend to be disproportionately represented or underrepresented in worst-case distributions with respect to different metrics (e.g., fairness-based loss). In all, our results provide evidence that assessments of the robustness of ML models in high-stakes resource allocation settings must account for the nature of the downstream decision problem in order to capture the true consequences of potential distribution shift.

\section{Acknowledgements}
This work was supported by the AI2050 program at Schmidt Sciences (Grant G-22-64474) and by the AI Research Institutes Program funded by the National Science Foundation under AI Institute for Societal Decision Making (AI-SDM), Award No. 2229881.


% TESTING NEW FIGURE 3/4

% \begin{figure}[!t]
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_acc.pdf}
%   \caption{}
%   \label{fig:sfig3-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_ce.pdf}
%   \caption{}
%   \label{fig:sfig3-2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_fair.pdf}
%   \caption{}
%   \label{fig:sfig3-3}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_skim.pdf}
%   \caption{}
%   \label{fig:sfig3-4}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/AL_CE_employment_CA_knapsack.pdf}
%   \caption{}
%   \label{fig:sfig3-5}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   % \includegraphics[width=\linewidth]{images/pdfs/va_spo_employment_final.pdf}
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_mse.pdf}
%   \caption{}
%   \label{fig:sfig4-1}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_fair.pdf}
%   \caption{}
%   \label{fig:sfig4-2}
% \end{subfigure}

% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_util.pdf}
%   \caption{}
%   \label{fig:sfig4-3}
% \end{subfigure}%
% \begin{subfigure}{.5\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_top-k.pdf}
%   \caption{}
%   \label{fig:sfig4-4}

% \begin{subfigure}{\linewidth}
%   \includegraphics[width=\linewidth]{paper_visualizations/WY_SPO_income_CA_knapsack.pdf}
%   \caption{}
%   \label{fig:sfig4-5}
% \end{subfigure}
% \end{subfigure}

% \caption{(a-e) Joint distributions over individuals in an optimization instance in the employment prediction task, from the perspective of worst-case distributions trained w.r.t. all binary prediction loss functions. The underlying predictive model was trained using CE loss. For each worst-case distribution w.r.t. the metric of interest (in order of (a) accuracy, (b) CE, (c) fair, (d) top-k, and (e) knapsack) we display the joint distribution of model predictions, assigned weights, and education level, split by true label. Note that the color-bar denotes the weight in the worst-case distribution, and that differently-shaped points represent individuals of different races.}

% \label{fig:fig10}
% \end{figure}

% \section{General Formatting Instructions}
% As a general rule: \emph{follow the template}.

% \subsection{Authorship}
% Reviewing is double-blind.
% However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
% Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

% \subsection{Sectioning}
% Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
% Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
% One unnumbered sectioning command is provided, \verb|\paragraph|.
% It can be used directly below any numbered section level.
% Do not use any other sectioning commands.

% \subsubsection{Typing the Section Titles}
% The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
% Please type them in title case.
% (This is used in the PDF bookmarks.)
% Please also write the \verb|\subsubsection| titles in title case.

% \paragraph{What is title case?}
% \href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
% \begin{quote}
%     Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
%     When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
% \end{quote}

% \subsection{References, Citations, Footnotes}\label{sec:etc}
% \subsubsection{Cross-Referencing}
% Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
% For example, this subsection is Section~\ref{sec:etc}.

% \subsubsection{Citations}
% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:city}.

% \begin{figure}[!htb]
%   \centering
%   \includegraphics[width=0.7\linewidth]{barcelona.jpg}
%   \caption{A View of a Nice City.}\label{fig:city}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.

% \section{Back Matter}
% There are a some final, special sections that come at the back of the paper, in the following order:
% \begin{itemize}
%   \item Author Contributions (optional)
%   \item Acknowledgements (optional)
%   \item References
% \end{itemize}
% They all use an unnumbered \verb|\subsubsection|.

% For the first two special environments are provided.
% (These sections are automatically removed for the anonymous submission version of your paper.)
% The third is the ‘References’ section.
% (See below.)

% (This ‘Back Matter’ section itself should not be included in your paper.)


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Decision-Focused Evaluation of Worst-Case Distribution Shift\\(Supplementary Material)}
\maketitle

\appendix

\section{Related Work}
\label{sec:related}

\textbf{Methods of Representing Distribution Shift}. Prior studies on distribution shift have investigated some combination of covariate shift, label shift, and/or subpopulation shift in the context of standalone machine learning models \citep{Liu}. The concept of subpopulation shift, which relates tangentially to our use of optimization instances to separate individuals, involves a shift in the distribution of data amongst an unseen class variable. This concept has deep ties to fair and robust machine learning, as in practice many models fail to generalize well to specific subgroups within a broader population \citep{Yang}. In terms of optimization, beyond the mere diagnosis of distribution shift via such methods as outlier detection, prior work also encompasses the development of models that are trained in an adversarial manner so as to minimize their expected loss over a set of ``reasonable'' or viable out-of-distribution test sets, known as Distributionally Robust Optimization (DRO) \citep{Duchi, rahimian2019distributionally}. \citet{levy2020large}, for instance, identify a method for DRO with respect to convex loss functions that scales independently of population size, utilizing $\chi^2$ divergence uncertainty sets. In the field of biodiversity, \citet{hierarchy} identify shifts in the spatiotemporal distribution of arctic species using Bayesian Hierarchical Models tiered on observations, density processes, and process parameters, a related paradigm to our use of hierarchical modeling to describe distributions over sets of individuals.

\textbf{Predictive Treatment Allocation}. A wide body of work exists to identify optimal treatment allocation policies, particularly in the health and public policy sectors. In terms of applications in public policy, \cite{verma2023increasing} deploy the first Restless Multi-Armed Bandit framework in public health in India. Their model is used by the NGO ARMMAN to identify women in underserved communities to contact regarding maternal and childcare information. \cite{athey2023machine} follow in this more empirical style of evaluation, developing a set of empirically validated treatment strategies to provide `nudges' to college students in order to increase financial aid application renewals. Finally, a smaller body of work exists criticizing the overeagerness of industry in adopting predictive allocation models prior to addressing theoretical and empirical concerns. \cite{wang2022against} cite distribution shift as one major obstacle the field of predictive optimization as a whole must surmount before it is accepted as a legitimate machine learning practice, motivating in particular our work in non-parametric shifts. In the same vein of concerns regarding distribution shift in treatment allocation, \cite{schultz2019risk} suggest that triage risk models tend to perform better on middle-aged patients than older patients in terms of short-term mortality risk.

\textbf{Predict-then-Optimize Machine Learning}. Predict-then-optimize is a more generalized class of problems than resource allocation, in which, first, a model predicts a cost vector from a vector of features, and second, the predicted cost vector is used as a set of parameters for an optimization problem. This class of machine learning along with deterministic optimization is common in many areas of deployment, from limited resource allocation problems to shortest-path problems; in many such setups, we have some unknown cost variable that can be estimated using some machine learning model and optimized through well-known algorithms \cite{Elmachtoub}.

A large portion of work in predict-then-optimize machine learning has sought to train models that perform well in-distribution \cite{Wilder1, Elmachtoub}, with significant work aiming to develop loss functions that take into account the optimization task in-context. This framework has also been investigated for specific prediction models along with applications in reinforcement learning and solving combinatorial problems \cite{elmachtoub2020decision, mandi2020smart, wang2021learning}. However, a comparatively smaller body of work exists to train or fine-tune machine learning models in predict-then-optimize settings that are distributionally robust. Notably \cite{johnson2023characterizing} examine robust methods for accounting for label shift in predict-then-optimize, proposing modifications for a two-stage machine, decision-focused learning setup to anticipate label shift.

\section{Full Proofs of DR-submodularity}
\label{sec:proofs}

\begin{definition}
\label{def:inj}
Given some index $i$ and some sample of entries $X, Y$ from a probability distribution $\PP$, let $\#(i, (X, Y)): (\mathbb{Z}, \PP) \to \mathbb{N}$ be the number of times the $i$th indexed individual occurs in the sample $X, Y$.
\end{definition}

Before the proof of Theorem \ref{thm:optthm}, we begin by proving the following lemma to establish guarantees on a solution to a modification of Equation \ref{eq:eq2} that uses the same version of decision loss, $DL$, as Equation \ref{eq:eq1}. This helps us establish bounds on guarantees after changing $DL$ to $DL'$ in Equation \ref{eq:eq2}.

\begin{lemma}
\label{thm:helperlemma}
Suppose we have a solution $W$ to a slight modification of Equation \ref{eq:eq2} with value at least $\alpha \cdot OPT_W  - \epsilon$ for some $\alpha \in \mathbb{R}, \epsilon \in \mathbb{R}$, where $OPT_W$ is  the optimal value, and $DL$ is used in Equation \ref{eq:eq2} instead of $DL'$. Then, we have a solution to the original optimization problem over $Q_j$, Equation \ref{eq:eq1}, with value at least $\alpha \cdot OPT_Q - \epsilon$ where $OPT_Q$ is the optimal value of the original problem.
\end{lemma}

\textbf{Proof of Lemma \ref{thm:helperlemma}}.
\begin{proof}
First, formally define the optimization problem representing the modified Equation \ref{eq:eq2} as

\begin{align*}
    \max_{W \in \Omega} \E_{j \sim W_\xi}\left[\E_{X, Y \sim W_{ j}}\left[DL(Y, m(X))\right]\right] + 1. 
\end{align*}

Let the optimal value of the modified Equation \ref{eq:eq2} be represented as $OPT_W$. Furthermore, we define the achieved loss of our solution on the modified Equation \ref{eq:eq2} as $\hat{\phi}$, i.e.,

\begin{align*}
    \hat{\phi} = \mathbb{E}_{j \sim W_\xi}[\mathbb{E}_{X, Y \sim W_j}[DL(Y, m(X))]]
\end{align*}

Let $\phi$ denote the realized loss of $W$ on Equation \ref{eq:eq1} when we transform $W$ into a set of valid probability distributions, i.e.,

\begin{align*}
    \phi = \mathbb{E}_{j \sim W_\xi + \frac{1}{k}}[\mathbb{E}_{X, Y \sim W_j + \frac{1}{|W_j|}}[DL(Y, m(X))]]
\end{align*}

Using these definitions,

\begin{align*}
    \hat{\phi} &= \sum_{j \in W_\xi} (W_\xi (j) + \frac{1}{k}) \sum_{X, Y \in S_j} DL(Y, m(X)) Pr(X, Y)\\
    &= \sum_{j \in W_\xi} (W_\xi (j) + \frac{1}{k}) \sum_{X, Y \in S_j} DL(Y, m(X)) \prod_{i \in (X, Y)}^{n_j} (W_j (X_i, Y_i) + \frac{1}{|W_j|})\\
    &= \sum_{j \in W_\xi + \frac{1}{k}} (W_\xi (j) + \frac{1}{k}) \sum_{X,Y \in S_j} DL(Y, m(X)) \prod_{i \in (X, Y)}^{n_j} (W_j (X_i, Y_i) + \frac{1}{|W_j|})\\
    &= \mathbb{E}_{j \sim W_\xi + \frac{1}{k}}[\mathbb{E}_{X, Y \sim W_j + \frac{1}{|W_j|}}[DL(Y, m(X))]]\\
    &= \phi.
\end{align*}

% $\hat{\Omega} = \{\hat{W}_\xi, \hat{W}_j |1 \leq j \leq k\}$, and let the final expectation of this estimated solution be $\hat{\phi}$. Finally let the optimal solution to the modified Equation \ref{eq:eq2} be represented as $OPT_W$, and let $\phi$ be the induced objective value on Equation \ref{eq:eq1}. Adding the inverse of the size of each subpopulation (or in the case of $W_{\xi}$, adding $\frac{1}{k}$) for all $1 \leq j \leq k$ yields a new set $\hat{\Theta }= \{\hat{Q}_{\xi} = \hat{W}_{\xi} + \frac{1}{k}, \hat{Q}_j = \hat{W}_{j} + \frac{1}{|W_j|} | 1 \leq j \leq k \}$. Substituting $\hat{\Theta}$ into the original objective yields 

% \begin{align*}
% \hat{\phi} = \mathbb{E}_{j \sim \hat{Q}_{\xi}}[\mathbb{E}_{(X, Y) \sim \hat{Q}_j}[DL(Y, m(X))]] = \phi.
% \end{align*}


With the same logic, we substitute aspects of the sampling process using $W$ to derive information about the optimal solutions:

\begin{align*}
    OPT_W &= \max_{W \in \Omega} \mathbb{E}_{j \sim W_\xi}[\mathbb{E}_{X, Y \sim W_j}[DL(Y, m(X))]]\\
    % &= \max_{W \in \Omega} \mathbb{E}_{j \sim W_\xi + \frac{1}{|W_\xi|}}[\mathbb{E}_{(X, Y) \sim W_j + \frac{1}{|W_j|}}[DL(Y, m(X))]]\\
    % &= \max_{Q \in \Theta} \mathbb{E}_{j \sim Q_\xi}[\mathbb{E}_{Q_j}[DL(Y, m(X))]]\\
    &= \max_{W \in \Omega} \sum_{j \in W_\xi} (\frac{1}{k} + W_\xi (j)) \sum_{X, Y \in S_j} Pr(X, Y) DL(Y, m(X))\\
    &= \max_{W \in \Omega} \sum_{j \in W_\xi} (\frac{1}{k} + W_\xi (j)) \sum_{X, Y \in S_j} DL(Y, m(X)) \prod_{i \in (X, Y)}^{n_j} (\frac{1}{|W_j|} + W_j (X_i, Y_i))\\
    &= \max_{Q \in \Theta} \sum_{j \in Q_\xi} Q_\xi (j) \sum_{X, Y \in S_j} DL(Y, m(X)) \prod_{i \in (X, Y)}^{n_j} Q_j (X_i, Y_i)\\
    &= OPT_Q.
\end{align*}

Thus, 

\begin{align*}
\hat{\phi} \geq \alpha(OPT_W) - \epsilon \implies \phi \geq \alpha(OPT_Q) - \epsilon.
\end{align*}

\end{proof}

\textbf{Proof of Theorem \ref{thm:optthm}}.
\begin{proof}

Let the final expectation of our estimated solution to Equation \ref{eq:eq2} be $\hat{\phi}'$, i.e.,

\begin{align*}
    \hat{\phi}' = \mathbb{E}_{j \sim W_\xi}[\mathbb{E}_{X, Y \sim W_j}[DL'(Y, m(X))]]
\end{align*}

Furthermore, let $OPT'_W$ denote the optimal solution to Equation \ref{eq:eq2}. We have:

\begin{align*}
    \hat{\phi}' &\geq \alpha(OPT'_W) - \epsilon \\
    &=\alpha(\max_{W \in \Omega}\left( \sum_{j \in W_\xi} W_\xi(j) \sum_{X, Y \in S_j} Pr(X, Y) (DL'(Y, m(X)) - 1)\right) + 1) - \epsilon\\
    &= \alpha(\max_{W \in \Omega} \left(\sum_{j \in W_\xi} W_\xi(j) \left[ \sum_{X, Y \in S_j} Pr(X, Y) DL'(Y, m(X)) - \sum_{X, Y \in S_j} Pr(X, Y)\right]\right) + 1) - \epsilon\\
    &= \alpha(\max_{W \in \Omega}\left( \sum_{j \in W_\xi} W_\xi(j) (-1 + \sum_{X, Y \in S_j} Pr(X, Y) DL'(Y, m(X))) \right) + 1) - \epsilon\\
    &= \alpha(\max_{W \in \Omega} \left(1 + \sum_{j \in W_\xi} -W_\xi(j) + \left[\sum_{j \in W_\xi} W_\xi (j) \sum_{X, Y \in S_j} Pr(X, Y) DL'(Y, m(X))\right]\right)) - \epsilon\\
    &= \alpha(\max_{W \in \Omega} \left( \sum_{j \in W_\xi} W_\xi (j) \sum_{X, Y \in S_j} Pr(X, Y) DL'(Y, m(X)) \right)) - \epsilon\\
    &= \alpha(OPT'_W) - \epsilon \implies\\
    &\hat{\phi}' \geq \alpha(OPT_Q) - \epsilon \tag{by Lemma \ref{thm:helperlemma}}
\end{align*}
\end{proof}


% Note that, because the transformation from $DL$ to $DL'$ simply shifts the distribution of the objective uniformly downward, we can assert that $OPT_W = OPT'_W - 1$, using the same definition of $OPT'_W$ from Theorem 3.1. 

% \begin{align*}
%     &\phi' \geq \alpha(OPT_W) - \epsilon \implies\\
%     &\phi' \geq \alpha(OPT'_W - 1) - \epsilon \implies\\
%     &\phi' \geq \alpha(OPT'_W) - \alpha - \epsilon \implies\\
%     &\phi' + 1 \geq \alpha(OPT'_W) - \epsilon + 1 - \alpha \implies \\
%     &\phi' + 1 \geq \alpha(OPT_Q) - \epsilon + 1 - \alpha \tag{Theorem 3.1}    
% \end{align*}
% \end{proof}

\textbf{Proof of Theorem \ref{thm:bigtheorem}}.

\begin{proof} 
Consider the gradient of the expansion of Equation \ref{eq:eq2} into sums w.r.t. a single weight placed on an individual of index $a$ in instance $j$:

\begin{align*}
    &\frac{\partial}{\partial W_j(x_a, y_a)} \sum_{X, Y \in S_j}  \left[\prod_{i = 1}^{n_j} \left(W_j(x_i, y_i) + \frac{1}{|W_j|}\right)\right] DL'(Y, m(X)) \\
    &= \sum_{X, Y \in S_j} \#(a, (X, Y))(W_{j}(x_a, y_a) + \frac{1}{|W_j|})^{\#(a, (X, Y)) - 1} \left[\prod_{i = 1, i \neq a}^{n_j}\left( W_j(x_i, y_i)+ \frac{1}{|W_j|}\right)\right] DL'(Y, m(X))\\
    & = \mathbb{E}_{X, Y \sim W_{j}}[DL'(Y, m(X)) \frac{\#(a, (X, Y))}{W_j (x_a, y_a)+ \frac{1}{|W_j|}}]
\end{align*}

% $J(P^{(S)}) = \mathbb{E}_{S \sim P^{(0)}}[\mathbb{E}_{U\sim P^{(S)}}[\ell(U) \Pi_{i \in U} P_{i}^{(S) \#(i, U)}]]$\\
% $ = \sum_{U \sim P^{(S)}}\ell(U) \Pi_{i \in U} P_{i}^{(S) \#(i, U)}$.

% We have:

% $\frac{\partial J(P^{(nS})}{\partial P^{(S)}_{a}} = \\
% \sum_{U \sim P^{(S)}} \ell (U) \#(a, U) P_{a}^{(S) \#(a, U)-1} \Pi_{i \neq a \in U} P^{(S) \#(i, U)}_{i}$\\
% $=\mathbb{E}_{U}[\ell(U) \frac{\#(a)}{P^{(S)}_a}]$.

The resulting expectation is non-positive due to non-negative weights in any probability mass function and non-positive loss $DL'$. Next, consider the diagonal entries of the Hessian matrix of the refactored objective:

\begin{align*}
    &\frac{\partial^2}{\partial W_j(x_a, y_a)^2} \sum_{X, Y \in S_j}  \left[\prod_{i = 1}^{n_j}\left( W_j(x_i, y_i)+ \frac{1}{|W_j|}\right)\right] DL'(Y, m(X)) \\
    &= \sum_{X, Y \in S_j} \#(a, (X, Y)) (\#(a, (X, Y)) - 1)(W_{j}(x_a, y_a)+ \frac{1}{|W_j|})^{\#(a, (X, Y)) - 2} \left[\prod_{i = 1, i \neq a}^{n_j} \left(W_j(x_i, y_i)+ \frac{1}{|W_j|}\right)\right] DL'(Y, m(X)) \\
    & \leq 0 \tag{by non-positive $DL'$}.
\end{align*}

% $\frac{\partial^2 J(P^{(S)})}{\partial^2 P^{(S)}_{a}} = \\
% \sum_{U \sim P^{(S)}} \ell (U) \#(a, U) (\#(a, U)-1) P_{a}^{(S) \#(a, U)-2} \Pi_{i \neq a \in U} P^{(S) \#(i, U)}_{i} \leq 0$.

Finally consider the off-diagonal entries of the Hessian matrix below, where we consider a second arbitrary individual of index $b$, where $a \neq b$:

{\small \begin{align*}
    &\frac{\partial^2}{\partial W_j(x_a, y_a)\partial W_j(x_b, y_b)} \sum_{X, Y \in S_j}  \left[\prod_{i = 1}^{n_j} \left(W_j(x_i, y_i)+ \frac{1}{|W_j|}\right)\right] DL'(Y, m(X)) \\
    &= \sum_{X, Y \in S_j} \#(a, (X, Y)) (\#(b, (X, Y)))(W_{j}(x_a, y_a)+ \frac{1}{|W_j|})^{\#(a, (X, Y)) - 1}\\
    &(W_{j}(x_b, y_b)+ \frac{1}{|W_j|})^{\#(b, (X, Y)) - 1} \left[\prod_{i = 1, i \notin \{a, b\}}^{n_j} \left(W_j(x_i, y_i)+ \frac{1}{|W_j|}\right)\right] DL'(Y, m(X)) \\
    & \leq 0 \tag{by non-positive $DL'$}.
\end{align*}}

% $\frac{\partial^2 J(P^{(S)})}{\partial P^{(S)}_{a} \partial P^{(S)}_{b}} = \\
% \sum_{U \sim P^{(S)}} \ell (U) \#(a, U) \#(b, U) P_{a}^{(S) \#(a, U)-1} P_{b}^{(S) \#(b, U)-1} \Pi_{i \notin {a, b} \in U} P^{(S) \#(i, U)}_{i} \leq 0$.

By Definition 4.3 $J$ is DR-submodular and by Definition 4.2 $J$ is non-monotone; this conclusion applies WLOG to all subpopulations between 1 and $k$ inclusive.
\end{proof}

% \section{Main Algorithm}
% \label{sec:algo}

% Below we include the full pseudocode in Algorithm 1, a formal writeup of how we implement the Frank-Wolfe developed by \cite{Bian}. Due to the strong empirical improvements we saw along with past work by such works as \cite{li2020does}, we also introduce a momentum term into the update rule that preserves a portion of gradients calculated in the previous iteration of the algorithm, initialized to 0.

% Building on prior work from \cite{Wilder2}, Frank-Wolfe algorithms commonly require subroutines to maximize the dot product over the feasible set of viable allocations and the gradients of the objective function with respect to the optimization variables. We incorporate their work as a subroutine within our algorithm, termed \textit{gradmax}. Gradmax is also used to solve the optimization problem over all optimization instances, where we input a vector of converged worst-case losses for all optimization instances along with $\rho_\xi$ into gradmax, which then returns the probability distribution within the feasible set that maximizes expected worst-case loss over optimization instances.

% \begin{algorithm}[!ht]
%    \caption{Frank-Wolfe Method for Maximizing Expected Loss over Optimization Instances}
%    \label{alg:example}
% \begin{algorithmic}
%     \STATE {\bfseries Input:} weight offsets $W_{\xi}, \cdots, W_{j}, \cdots, W_k$ (initialized to $\{0\}_{1}^{n_j}$), $v_0 = \{0\}_{1}^{n_j}$, iterations, num\_samples, num\_samples2, $p_t$, $\rho_{\text{ind}}$, $\rho_\xi$

%     \FOR {$i=1$ {\bfseries to} $k$}
%     \FOR {$j=1$ {\bfseries to} iterations}
%     \FOR {$r=1$ {\bfseries to} num\_samples}
%     \STATE sample $\{x_s, y_s\}_{s=1}^{n_j} \sim W_i$
%     \STATE calculate $\ell = DL'(m(\{x_s\}_{s=1}^{n_j}), \{y_s\}_{s=1}^{n_j})$
%     \STATE accumulate $\frac{\partial \ell}{\partial W_i}$
%     \ENDFOR
%     \STATE set $v_j = p_t * -\frac{\partial \ell}{\partial W_i} + (1-p_t) * v_{j-1}$
%     \STATE solve $\delta = \text{\textbf{gradmax}}(v_j, \rho^{\text{ind}}) - \frac{1}{|W_i|}$
%     \STATE update $W_i = W_i + \frac{1}{\text{iterations}} \delta$
%     \ENDFOR
%     \ENDFOR

%     \STATE initialize $\lambda = \{0\}_{1}^{k}$
%     \FOR {$i=1 ${\bfseries to} k}

%     \FOR {$j=1$ {\bfseries to} num\_samples2}
%     \STATE sample $\{x_s, y_s\}_{s=1}^{n_j} \sim W_i$
%     \STATE accumulate $\ell = DL'(m(\{x_s\}_{s=1}^{n_j}), \{y_s\}_{s=1}^{n_j})$
%     % \STATE set $v^{(2)}_i = p_t * \frac{-\partial \ell}{\partial W_h} + (1-p_t) * v^{(2)}_{i-1}$
%     % \STATE update $W_\xi = W_\xi + \frac{1}{\text{iterations}} v$

%     \ENDFOR
%     \STATE set $\lambda_i = \text{avg} (\ell)$
%     \ENDFOR
%         \STATE solve $W_\xi = \text{\textbf{gradmax}} (\lambda, \rho_\xi)$

% \end{algorithmic}
% \end{algorithm}

% \clearpage

% \textbf{Proof of Theorem 4.7}.

% \begin{proof}
% Consider the higher level optimization between optimization instances, following convergence of the $k$ probability distributions over individuals:

% $J(P^{(0)}) = \mathbb{E}_{n \sim P^{(0)}}[\mathbb{E}_{U\sim P^{(n)}}[\ell(U) \Pi_{i \in U} P_{i}^{(n) \#(i, U)}]]$\\
% $ = \sum_{n \sim P^{(0)}} P_{n}^{(0)}\sum_{U \sim P^{(n)}}\ell(U) \Pi_{i \in U} P_{i}^{(n) \#(i, U)}$.

% We have:

% $\frac{\partial J(P^{(0)})}{\partial P^{(0)}_{h}} = \\
% \sum_{U \sim P^{(h)}} \ell (U) \Pi_{i \in U} P^{(h) \#(i, U)}_{i} = \mathbb{E}[\ell(U) | h]$.

% This function is non-positive due to non-negative weights in any probability mass function and non-positive loss $\ell$. Next consider the non-diagonal entries of the Hessian matrix of this function:

% Note that the Hessian is trivially 0 as the first derivative with respect to any entry of $P^{(0)}$ does not depend on any other entry of $P^{(0)}$. So the Hessian is also non-positive.

% By Definition 4.3 $J$ is DR-submodular and by Definition 4.2 $J$ is non-monotone; this conclusion applied WLOG to all subpopulations between 1 and $k$ inclusive.
% \end{proof}

% \textbf{Proof of Theorem 4.8}.

% \begin{proof}
% Another change of variables yields proof of DR-submodularity here. First we find the gradient of the objective w.r.t. a weight over an individual $X^{(n)}_a$: \\

% $\frac{\partial J(X^{(n)})}{\partial X^{(n)}_{a}} = \mathbb{E}_{U}[\ell(U) \frac{\#(a)}{\frac{1}{\beta_n} + X^{(n)}_a}] \leq 0$. \\

% Diagonal entries of the Hessian matrix can be written as \\

% $\sum_{U} \ell (U) \#(a, U) (\#(a, U)-1) [\frac{1}{\beta_n} + X_{a}^{(n)}]^{\#(a, U)-2} \Pi_{i \neq a \in U} [\frac{1}{\beta_n} + X^{(n)}_{i}]^{\#(i, U)} \leq 0$.

% , and off-diagonal entries of the Hessian matrix can be written as \\

% $\sum_{U} \ell (U) \#(a, U) \#(b, U) [\frac{1}{\beta_n} + X_{a}^{(n)}]^{\#(a, U)-1} [\frac{1}{\beta_n} + X_{b}^{(n)}]^{\#(b, U)-1}
% \Pi_{i \notin {a, b} \in U} [\frac{1}{\beta_n} + X^{(n)}_{i}]^{\#(i, U)} \leq 0$.

% . Since all entries of the Hessian matrix are less than or equal to 0 and the first derivative is non-positive, this is also a non-monotone DR-submodular function.

% \end{proof}

\newpage

\section{Additional Methodological Details}
\label{sec:code}

\begin{itemize}
\item Due to the strong empirical improvements we observed, together with supporting evidence from prior work such as \cite{li2020does}, we also introduce a momentum term into the update rule that preserves a portion of the gradients calculated in the previous iteration of the algorithm, initialized to 0.

\item Building on prior work from \cite{Wilder2}, Frank-Wolfe algorithms commonly require subroutines to maximize the dot product over the feasible set of viable allocations and the gradients of the objective function with respect to the optimization variables. We incorporate their work as a subroutine within our algorithm, termed \textit{gradmax}. Gradmax is also used to solve the optimization problem over all optimization instances, where we input a vector of converged worst-case losses for all optimization instances along with $\rho_\xi$ into gradmax, which then returns the probability distribution within the feasible set that maximizes expected worst-case loss over optimization instances.
\end{itemize}

\section{Additional Experimental Details}
\label{sec:exptdetails}

Regarding the training process of our underlying predictive models:

\begin{itemize}
    \item For each of the 50 US states and each of the three census datasets, we train two models: one with CE loss (for regression, MSE) and one with SPO loss, resulting in 300 total models.
    \item Note that, while training with cross-entropy/mean-squared error loss can be accomplished with nothing but the raw train set, training with SPO loss requires treating each optimization instance as a random sample of individuals from the train set. To this end, we take 15,000 samples of $n_j = 40$ individuals from the combined training sets of each state, each sample representing one resource allocation problem.
    \item  Two such models are trained on each of the three datasets, resulting in six total trained models for each of the fifty optimization instances (300 total predictive models).
    \item The predictive models used to predict employment/income also differ in architecture depending on the dataset. For employment classification we train logistic regression models on each state's train set; for income classification/regression we train a 2-hidden-layer neural network. All categorical features are mapped to PyTorch embedding layers that are also trained for each predictive model.
\end{itemize}

Regarding the application of these predictive models in finding worst-case distributions w.r.t. relevant loss metrics:

\begin{itemize}
    \item Each of the trained models is then optimized to find its worst-case distributions with respect to our provided decision problems.
    \item For each worst-case distribution (defined by a unique combination of predictive model, loss function, and optimization instance) we run 15 iterations of our Frank-Wolfe algorithm, with a momentum value of $p_t = 0.7$ and with 35,000 optimization problems sampled per iteration.
    \item For each of the three worst-case distributions converged for each of the models, we evaluate the expected performance of the distribution on all three loss functions, given the original model used to generate the distribution. This is accomplished by sampling 200 optimization instances from each distribution/loss function combination, sampling 4000 decision problems from each optimization instance, and aggregating over distribution/loss function pairs. For all models we set variables $\rho_1 = n_j = 40, \rho_2 = 6.25$ in order to impose meaningful constraints on optimization problems (i.e., balance between small $\rho$ values that do not allow for significant deviation from the empirical distribution and large values yielding unconstrained optimization over the simplex).
\end{itemize}

Regarding our experiment comparing our method to Pyomo/IPOPT:

\begin{itemize}
    \item In this experiment we set $n_j = 8$. Note that the degree of the polynomial scales with $n_j$. As a result, traditional polynomial solvers such as Pyomo/IPOPT can become computationally expensive at large scales. This is particularly true when many calls are made to the solver (in our case, Pyomo/IPOPT is called once for each worst-case distribution w.r.t. each metric).
    \item Each colored curve in Figure \ref{fig:fig2} scales up to 3,000 samples per iteration of the Frank-Wolfe algorithm. 

\end{itemize}

% TO ADD
% - fig 1, with intervals
% - in additional details, organize with bullet points

\clearpage

\section{Additional Presentation of Aggregate Results}
\label{sec:extragraphs}

Below we include, for all combinations of dataset and method used for training the underlying predictive model (CE/MSE, SPO), a set of aggregate results identical to those of Figure \ref{fig:fig1} but with 95\% confidence bounds. Note that for each individual subfigure, the confidence interval is calculated using a normality assumption, with the results of the 50 individual predictive models belonging to that subfigure's combination of dataset and training method serving as data points.

\begin{figure*}[!ht]
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/CE_employment_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci_bounds/binary_CE_employment_bounds.pdf}
  \caption{}
  \label{fig:sfig8-1}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/CE_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci_bounds/binary_CE_income_bounds.pdf}
  \caption{}
  \label{fig:sfig8-2}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci_bounds/regression_CE_income_bounds.pdf}
  \caption{}
  \label{fig:sfig8-3}
\end{subfigure}

\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci_bounds/binary_SPO_employment_bounds.pdf}
  \caption{}
  \label{fig:sfig8-5}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_employment_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci_bounds/binary_SPO_income_bounds.pdf}
  \caption{}
  \label{fig:sfig8-6}
\end{subfigure}%
\begin{subfigure}{.33\linewidth}
  \centering
  % \includegraphics[width=\linewidth]{images/pdfs/SPO_income_agg_num.pdf}
  \includegraphics[width=\linewidth]{_final_results_ci_bounds/regression_SPO_income_bounds.pdf}
  \caption{}
  \label{fig:sfig8-7}
\end{subfigure}
\caption{Diagonal-normalized aggregated heat maps with 95\% confidence intervals over states for models trained with CE loss (in the regression case, mean-squared error) (top row) and SPO loss (bottom row). From left to right in each row, results are displayed by task for (a,d) employment classification, (b,e) income classification, and (c,f) income regression. Within each heat map, rows denote the metric the worst-case distribution maximizes, and columns denote the metrics the worst-case distribution was evaluated on. Note that each column is divided by the diagonal entry in that column, resulting in a main diagonal of all 1.0. Furthermore, because CE loss is always negative, each entry in columns corresponding to CE loss is equal to the diagonal entry in that column divided by the original loss in that cell.}
\label{fig:fig8}

\end{figure*}

% \subsection{Sample of Joint Distributions -- Income Prediction}

% Below we include a randomly chosen sample of 4 models  applied on the test sets of 1 random state each in the income prediction case. Figures are titled with the test set of the plotted data first marked, then the name of the state and metric for which a worst-case distribution was calculated with respect to. Each plot displays the joint distribution of individuals in the selected state's test set, colored by weight assigned by the worst-case distribution and displaying the joint distribution of education level and model prediction, as was seen in Figures \ref{fig:fig3} and \ref{fig:fig4}. Note that in these images the top-k losses are abbreviated as "skim".

% \begin{figure}[!ht]
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/acc_edu_prob_CT.pdf}
%   \caption{}
%   \label{fig:sfig8-1}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/skim_edu_prob_CT.pdf}
%   \caption{}
%   \label{fig:sfig8-2}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/knapsack_edu_prob_CT.pdf}
%   \caption{}
%   \label{fig:sfig8-3}
% \end{subfigure}

% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/acc_edu_prob_VA.pdf}
%   \caption{}
%   \label{fig:sfig8-4}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/skim_edu_prob_VA.pdf}
%   \caption{}
%   \label{fig:sfig8-5}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/knapsack_edu_prob_VA.pdf}
%   \caption{}
%   \label{fig:sfig8-6}
% \end{subfigure}

% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/acc_edu_prob_acc_AL.pdf}
%   \caption{}
%   \label{fig:sfig8-7}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/skim_edu_prob_skim_AL.pdf}
%   \caption{}
%   \label{fig:sfig8-8}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/knapsack_edu_prob_knapsack_AL.pdf}
%   \caption{}
%   \label{fig:sfig8-9}
% \end{subfigure}

% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/acc_edu_prob_acc_IN.pdf}
%   \caption{}
%   \label{fig:sfig8-10}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/skim_edu_prob_skim_IN.pdf}
%   \caption{}
%   \label{fig:sfig8-11}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/income_pdfs/knapsack_edu_prob_knapsack_IN.pdf}
%   \caption{}
%   \label{fig:sfig8-12}
% \end{subfigure}
% \label{fig:fig8}
% \end{figure}
% \clearpage
% \subsection{Sample of Joint Distributions -- Employment Prediction}

% Below we include a randomly chosen sample of 4 models  applied on the test sets of 1 random state each in the employment prediction case. Figures are titled with the test set of the plotted data first marked, then the name of the state and metric for which a worst-case distribution was calculated with respect to. Each plot displays the joint distribution of individuals in the selected state's test set, colored by weight assigned by the worst-case distribution and displaying the joint distribution of education level and model prediction, as was seen in Figures \ref{fig:fig3} and \ref{fig:fig4}. Note that in these images the top-k losses are abbreviated as "skim".

% \begin{figure}[!ht]
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/acc_edu_prob_KY.pdf}
%   \caption{}
%   \label{fig:sfig9-1}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/skim_edu_prob_KY.pdf}
%   \caption{}
%   \label{fig:sfig9-2}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/knapsack_edu_prob_KY.pdf}
%   \caption{}
%   \label{fig:sfig9-3}
% \end{subfigure}

% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/acc_edu_prob_WV.pdf}
%   \caption{}
%   \label{fig:sfig9-4}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/skim_edu_prob_WV.pdf}
%   \caption{}
%   \label{fig:sfig9-5}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/knapsack_edu_prob_WV.pdf}
%   \caption{}
%   \label{fig:sfig9-6}
% \end{subfigure}

% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/acc_edu_prob_acc_MN.pdf}
%   \caption{}
%   \label{fig:sfig9-7}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/skim_edu_prob_skim_MN.pdf}
%   \caption{}
%   \label{fig:sfig9-8}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/knapsack_edu_prob_knapsack_ND.pdf}
%   \caption{}
%   \label{fig:sfig9-9}
% \end{subfigure}

% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/acc_edu_prob_acc_ND.pdf}
%   \caption{}
%   \label{fig:sfig9-10}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/skim_edu_prob_skim_ND.pdf}
%   \caption{}
%   \label{fig:sfig9-11}
% \end{subfigure}%
% \begin{subfigure}{.33\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{images/employment_pdfs/knapsack_edu_prob_knapsack_MN.pdf}
%   \caption{}
%   \label{fig:sfig9-12}
% \end{subfigure}
% \label{fig:fig9}
% \end{figure}

% \begin{figure}[!ht]
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_CE_employment_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_CE_employment_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_CE_employment_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_CE_employment_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_CE_employment_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_CE_employment_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \caption{Marginal distributions of XGBoost feature importances on education and label over all models trained on cross-entropy loss predicting employment, with each model's performance on each state's test set representing a single datapoint.}
% \label{fig:xgb-ce-employment}

% \end{figure}

% \begin{figure}
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_SPO_employment_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_SPO_employment_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_SPO_employment_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_SPO_employment_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_SPO_employment_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_SPO_employment_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \caption{Marginal distributions of XGBoost feature importances on education and label over all models trained on SPO loss predicting employment, with each model's performance on each state's test set representing a single datapoint.}
% \label{fig:xgb-spo-employment}

% \end{figure}

% \begin{figure}
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_CE_income_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_CE_income_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_CE_income_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_CE_income_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_CE_income_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_CE_income_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \caption{Marginal distributions of XGBoost feature importances on education and label over all models trained on cross-entropy loss predicting income, with each model's performance on each state's test set representing a single datapoint.}
% \label{fig:xgb-ce-income}

% \end{figure}

% \begin{figure}
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_SPO_income_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_SPO_income_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/edu_SPO_income_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_SPO_income_acc.png}
%   \caption{}
%   \label{fig:sfig1}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_SPO_income_skim.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}%
% \begin{subfigure}{.3333\linewidth}
%   \centering
%   \includegraphics[width=\linewidth]{xgb_hists/label_SPO_income_knapsack.png}
%   \caption{}
%   \label{fig:sfig2}
% \end{subfigure}

% \caption{Marginal distributions of XGBoost feature importances on education and label over all models trained on SPO loss predicting income, with each model's performance on each state's test set representing a single datapoint.}
% \label{fig:xgb-spo-income}

% \end{figure}

% \clearpage

% \subsection{Link To Full Histograms of XGBoost Importance Scores Over All Features}

% Please visit the following Google Drive folder to view all histograms of XGBoost importance scores, over each of the features in our datasets: \href{https://drive.google.com/drive/folders/1Q9Mch3dG8Tf19L-LS-RzZe7mXUqd2Ggf?usp=sharing}{Link To Visualizations}

% \subsection{Link To Full Histograms of OLS Importance Scores Over All Features}

% Please visit the following Google Drive folder to view all histograms of OLS linear regression importance scores, over each of the features in our datasets: \href{https://drive.google.com/drive/folders/1dyAy4Ikj9UjH4XaYbZolxHIF3IGEtjU7?usp=sharing}{Link To Visualizations}

% \subsection{Link To Full Empirically Sampled Joint Distributions}

% Please visit the following Google Drive folder to view all sampled joint distributions of each converged worst-case distribution for all (model, state, loss function) combinations: \href{https://drive.google.com/drive/folders/1e1gnieZkkS4dfPTmRYKotFFLJZqBdPuG?usp=sharing}{Link To Visualizations}

% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%         \toprule % from booktabs package
%         \bfseries Dataset & \bfseries Result\\
%         \midrule % from booktabs package
%         Data1 & 0.12345\\
%         Data2 & 0.67890\\
%         Data3 & 0.54321\\
%         Data4 & 0.09876\\
%         \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%     F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


\end{document}
