\documentclass[accepted]{uai2023} % [accepted]
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsection*{References}}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{physics}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{xfrac}
\usepackage{setspace}
\usepackage{pgf}
\usepackage[ruled]{algorithm2e}
\usepackage{xcolor}
\usepackage[labelsep=period]{caption}
\usepackage{subcaption}
\usepackage{placeins}
\usepackage{wrapfig}
\usepackage{bm}
\usepackage{soul}
\usepackage[ragged]{sidecap}
\usepackage{multirow}

% behold Andrew's magic
\ifdefined\nohyperref\else\ifdefined\hypersetup
  \definecolor{mydarkblue}{rgb}{0,0.08,0.45}
  \hypersetup{ %
    pdftitle={},
    pdfsubject={},
    pdfkeywords={},
    pdfborder=0 0 0,
    pdfpagemode=UseNone,
    colorlinks=true,
    linkcolor=mydarkblue,
    citecolor=mydarkblue,
    filecolor=mydarkblue,
    urlcolor=mydarkblue,
    }
  \fi
\fi


\usepackage{amsthm}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}[theorem] % sub-counter of last theorem
\newtheorem{lemma}[theorem]{Lemma} % same counter as theorem
\newtheorem{proposition}[theorem]{Proposition}
\theoremstyle{definition}
\newtheorem{definition}{Definition}%[section]
\newtheorem{assumption}{Assumption}
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\e}{\mathrm{e}}
\DeclareMathOperator{\diff}{\mathrm{d}\!}
\DeclareMathOperator{\KL}{\mathrm{D}_{KL}\!}

\title{Partial Identification of Dose Responses with Hidden Confounders (Supplementary Material)}
\author[1]{\href{mailto:myrlm@isi.edu}{Myrl~G.~Marmarelis}{}}
\author[2]{Elizabeth~Haddad}
\author[3]{Andrew~Jesson}
\author[2]{\\Neda~Jahanshad}
\author[1]{Aram~Galstyan}
\author[1]{Greg~{Ver Steeg}}

\affil[1]{USC Information Sciences Institute\\
  4676 Admiralty Way\\
  Marina del Rey, CA 90292}

\affil[2]{USC Stevens Neuroimaging and Informatics Institute\\
  4676 Admiralty Way\\
  Marina del Rey, CA 90292}

\affil[3]{University of Oxford, OATML\\
  14 Parks Road\\
  Oxford, UK OX1 3AQ}

\newcommand{\notindep}{\ensuremath{ \mathbin{\not\!\perp\!\!\!\perp} }}
\newcommand{\indep}{\ensuremath{ \mathbin{\perp\!\!\!\perp} }}

\usepackage{xr-hyper} % xr that works with hyperref? 
\externaldocument{marmarelis_249}

\begin{document}
\onecolumn

\maketitle

\appendix

\setcounter{equation}{9} % to follow from the main paper
\setcounter{figure}{7}
\setcounter{table}{2}



%% BEGIN GRUNGE

%\subsection*{One Step at a Time.}

\section{Completing the Derivations}%\label{sup:derivations}


Consider Equation~\ref{eq:decompose}.A:
\begin{multline}\label{eq:decompose-further}
  \int_0^1 w_t(\tau) \tilde p(y_t|\tau,x)p(\tau|x)\diff\tau
  = \underbrace{p(y_t|t,x)\int_0^1 w_t(\tau) p(\tau|x)\diff\tau}_{(A.0)} \\
  +\quad \underbrace{g_1(y_t|t,x)\int_0^1 w_t(\tau) (\tau-t) p(\tau|x) \diff\tau}_{(A.1)}
  \quad+\quad \underbrace{g_2(y_t|t,x)\int_0^1 w_t(\tau) \frac{(\tau-t)^2}{2}  p(\tau|x) \diff\tau}_{(A.2)},\\
  \textrm{where}\quad g_k(y_t|t,x)\coloneqq \partial^k_\tau p(y_t|\tau,x)|_{\tau=t}.
\end{multline}
Lightening the notation with a shorthand for the weighted expectations, $\langle \cdot \rangle_\tau \coloneqq \int_{\,0}^1 w_t(\tau) (\cdot) p(\tau|x)\diff\tau,$ it becomes apparent that we must grapple with the pseudo-moments $\langle 1 \rangle_\tau$, $\langle \tau-t \rangle_\tau$, and $\langle (\tau-t)^2 \rangle_\tau$. Note that $t$ should not be mistaken for a ``mean'' value.

Furthermore, we have yet to fully characterize $g_k(y_t|t,x)$. Observe that
\begin{align*}
  p(y_t|\tau,x) = \frac{p(\tau|y_t,x)p(y_t|x)}{p(\tau|x)} \quad&\iff\quad \partial_\tau p(y_t|\tau,x) = p(y_t|x)\cdot\frac{\partial}{\partial\tau} \frac{p(\tau|y_t,x)}{p(\tau|x)}.\\
\intertext{The $p(y_t|x)$ will be moved to the other side of the equation as needed; by Equation~\ref{eq:big-lambda},}
  \frac{\partial}{\partial\tau} \frac{p(\tau|y_t,x)}{p(\tau|x)}
  &= \frac{\partial}{\partial\tau} \Lambda(\tau|y_t,x).\\
\intertext{Expanding,}
  &= \frac{\partial}{\partial\tau}\exp{\int_0^\tau \gamma(\tau|y_t,x) \diff \tau}
  \quad=\quad \gamma(\tau|y_t,x) \exp{\int_0^\tau \gamma(\tau|y_t,x) \diff \tau}\\
  &= (\gamma\Lambda)(\tau|y_t,x).
\end{align*}

Appropriate bounds will be calculated for $g_2(y_t|t,x)$ next, utilizing the finding above as their main ingredient. % recipe
Let
\begin{equation*}
  \tilde g_k(y_t|t,x) \coloneqq p(y_t|x)^{-1}g_k(y_t|t,x)
  = \left.\left(\frac{\partial}{\partial\tau}\right)^{\!k} \frac{p(\tau|y_t,x)}{p(\tau|x)}\right|_{\tau=t}.
\end{equation*}

The second derivative may be calculated in terms of the ignorance quantities $\gamma, \Lambda$:
\begin{align*}
  \tilde g_2(y_t|t,x) =& \partial_\tau \gamma(\tau|y_t,x)\Lambda(\tau|y_t,x)\\
  =& \gamma(\tau|y_t,x)^2 \Lambda(\tau|y_t,x) + \dot\gamma(\tau|y_t,x)\Lambda(\tau|y_t,x)\\
  =& (\gamma^2 + \dot\gamma)\Lambda(\tau|y_t,x).
\end{align*}

And finally we address $\tilde p(y_t|x)$. Carrying over the components of Equation~\ref{eq:decompose-further} into Equation~\ref{eq:decompose},
\begin{equation}\begin{aligned}
  \tilde p(y_t|x) &= \frac{p(y_t|t,x)\langle1\rangle_\tau}{\langle\Lambda(\tau|y_t,x)\rangle_\tau
    - \tilde g_1(y_t|t,x)\langle\tau-t\rangle_\tau - \frac{1}{2}\tilde g_2(y_t|t,x)\langle(\tau-t)^2\rangle_\tau}\\
  %&= \frac{p(y_t|t,x)}{\E_\tau[\Lambda(\tau|y_t,x)]
  %    - \tilde g_1(y_t|t,x)\E_\tau[\tau-t] - \frac{1}{2}\tilde g_2(y_t|t,x)\E_\tau[(\tau-t)^2]}\\
  &= \frac{p(y_t|t,x)}{\E_\tau[\Lambda(\tau|y_t,x)]
      - (\gamma\Lambda)(t|y_t,x)\E_\tau[\tau-t] - \frac{1}{2}((\dot\gamma+\gamma^2)\Lambda)(t|y_t,x)\E_\tau[(\tau-t)^2]},
\end{aligned}\end{equation}
where these expectations $\E_\tau[\cdot]$ are with respect to the implicit distribution $q(\tau|t,x)\propto w_t(\tau)p(\tau|x).$ The first term in the denominator, $\E_\tau[\Lambda(\tau|y_t,x)]$, may be approximately bounded by the same Algorithm~\ref{alg:minimax}.

%% END GRUNGE

%% BEGIN THEOREM-PROOF

\section{How to Calibrate the Weighing Scheme}%\label{sup:theorem-proof} not allowed in camera ready

We present an argument based on the absolute error of the approximation in Equation~\ref{eq:approx}, specifically for Beta propensities. The following applies to both {\sf Beta} and {\sf Balanced Beta}, $0<t<1$.

% TODO make lemma/proposition about the integral proportionality

Suppose that the second derivative employed in the Taylor expansion is $Q$-Lipschitz, so that $\abs{\partial^3_\tau p(y_t|\tau,x)} \leq Q.$
Denote the remainder as $\rho(y_t|\tau,x).$ By Taylor's theorem,
\begin{equation*}
\abs{\rho(y_t|\tau,x)} \leq \frac{\abs{\tau-t}^3}{6} Q.
\end{equation*}
The approximated quantity (part A) in Equation~\ref{eq:decompose} is the following integral, which ends up becoming the numerator in Equation~\ref{eq:approx-frac}:
\begin{equation*}
  \int_{0}^1 w_t(\tau)\tilde p(y_t|\tau,x)p(\tau|x)\diff\tau\ =\ \int_{0}^1 w_t(\tau) \big[ p(y_t|\tau,x) + \rho(y_t|\tau,x)\big]p(\tau|x)\diff\tau.
\end{equation*}
The absolute error of this integral is therefore
\begin{equation*}
  \abs{\int_{0}^1 w_t(\tau) \rho(y_t|\tau,x)p(\tau|x)\diff\tau}\ \leq\ \frac{1}{6}Q\underbrace{\int_{0}^1 w_t(\tau)p(\tau|x)\abs{\tau-t}^3\diff\tau}_\text{$\coloneqq J$, which upper-bounds the error.}\quad\text{by the remainder theorem.}
\end{equation*}
Let $A=\alpha-1+rt$ and $B=\beta-1+r(1-t)$, where $(\alpha,\beta)$ parametrize the nominal propensity and $r$ is the precision of the Beta trust-weighing scheme.
The trust-propensity combination is
\begin{equation*}
  w_t(\tau)p(\tau|x) = \frac{\tau^A(1-\tau)^B}{c_t\,\mathbb{B}(\alpha,\beta)},\quad\text{where $c_t=t^{rt}(1-t)^{r(1-t)}$.}
\end{equation*}
Hence, the error bound reduces to
\begin{align*}
  J\ =&\ [c_t\,\mathbb{B}(\alpha,\beta)]^{-1} \int_{0}^1 \tau^A(1-\tau)^B\abs{\tau-t}^3\diff\tau\\
  =&\ [c_t\,\mathbb{B}(\alpha,\beta)]^{-1}\left[\ \underbrace{\frac{\Gamma(A+1)\Gamma(B+1)}{\Gamma(A+B+5)}U_3(A,B,t)}_\text{first term}\ \ +\ \ \underbrace{\frac{\Gamma(A+1)}{\Gamma(A+5)}12t^{A+4}(1-t)^{B+4}\,{_2}F_1(4, A+B+5, A+5;\, t)}_\text{second term}\ \right],
\end{align*}
where $U_3(A,B,t)$ is a cubic polynomial in $A$, $B$, and $t$. Notice that even though the quantity is symmetric about $(A,B,t)\mapsto(B,A,1-t)$, the form does not appear so. We shall focus on the relation of the error bound entirely with $A$ and $\alpha$, then justify the analogous conclusion for $B$ and $\beta$ by the underlying symmetry of the expression.

The Gaussian hypergeometric function in the second term can be expressed as
\begin{align*}
  \sum_{i=0}^\infty \frac{(4)_i(A+B+5)_i}{(A+5)_i} \frac{t^i}{i!}\ =&\ \sum_{i=0}^\infty (4)_i \underbrace{\left(\frac{A+B+5}{A+5}\right)\left(\frac{A+B+6}{A+6}\right)\cdots}_\text{length $i$} \frac{t^i}{i!}\\
  =&\ \sum_{i=0}^\infty \frac{(4)_i}{i!} \left(1+\frac{B}{A+5}\right)\left(1+\frac{B}{A+6}\right)\cdots t^i,\quad\text{where } \frac{(4)_i}{i!} = \frac{(i+1)(i+2)(i+3)}{3!},
\end{align*}
by using the definition of the Pochhammer symbol $(x)_i=x(x+1)\dots(x+i-1)$. In terms of $A\to\infty$, the whole second term in $J$ is $\mathcal{O}(A^{-4})$ due to the fraction of $\Gamma$ functions. The first term in $J$ is
\begin{equation*}
  \mathcal{O}(A^{-(B+4)}B^{-(A+4)})\cdot U_3(A,B,t)=\mathcal{O}(A^{-B-1}B^{-A-1})
\end{equation*}
by Stirling's approximation of $\Gamma(x)=\mathcal{O}(x^{x-\frac{1}{2}})$.
Clearly, a small $B>0$ might cause the first term in $J$ to explode with large $A$ due to the $\mathcal{O}(B^{-A-1})$ part. This could occur with high $\alpha$, low $\beta$, and low $r$---it is an instance of a high-precision propensity and low-precision weighing scheme destroying the upper error bound. Hence follows an argument for having $r$ match the propensity's precision, to avoid these cases.

As mentioned earlier, the same argument flows for large $B$ and small $A$, while swapping $t\mapsto (1-t).$


%% END THEOREM-PROOF

%% BEGIN OPTIMALITY

\section{Correctness of Algorithm~\ref{alg:minimax}}%\label{sup:algo}
The algorithm functions by incrementally reallocating mass (relative, in the weights) to the righthand side, from a cursor beginning on the lefthand side of the ``tape''.
\begin{proof}
  Firstly we characterize the indicator quantity $\Delta_j.$ Differentiate the quantity to be maximized with respect to $w_j$:
  \begin{align*}
    \frac{\partial}{\partial w_j} \frac{\sum_i w_i f_i}{\sum_i w_i} =& \frac{f_j}{\sum_i w_i} - \frac{\sum_i w_i f_i}{\left(\sum_i w_i\right)^2}\\
    =& \frac{f_j\sum_i w_i - \sum_i w_i f_i}{\left(\sum_i w_i\right)^2}\\
    \propto& \underbrace{\sum_i w_i (f_j-f_i)}_{ \coloneqq \Delta_j } \quad \textrm{up to some positive factor.}
  \end{align*}
  Hence, $\Delta_j$ captures the sign of the derivative.

  We shall proceed with induction. Begin with the first iteration, $j=1.$ No weights have been altered since initialization yet. Therefore we have
  \begin{equation*}
    \Delta_1 = \sum_i \overline{w}_i (f_1 - f_i).
  \end{equation*}
  Since $\forall i,\ f_1\leq f_i$ due to the prior sorting, $\Delta_1$ is either negative or zero. If zero, trivially terminate the procedure as all function values are identical.

  Now assume that by the time the algorithm reaches some $j>1$, all $w_k=\underline{w}_k$ for $1\leq k<j$. In other words,
  \begin{equation*}
    \Delta_j = \sum_{i<j}\underline{w}_i \underbrace{(f_j-f_i)}_{(+)} + \sum_{i>j}\overline{w}_i\underbrace{(f_j-f_i)}_{(-)}.
  \end{equation*}
  Per the algorithm, we would flip the weight $w_j\gets \underline{w}_j$ only if $\Delta_j<0.$ In that case,
  \begin{equation*}
    \sum_{i<j}\underline{w}_i (f_j-f_i) < \sum_{i>j}\overline{w}_i(f_i-f_j), \quad \textrm{where both sides are non-negative.}
  \end{equation*}
  Notice that the above is not affected by the current value of $w_j.$ This update can only increase the current estimate because the derivative remains negative and the weight at $j$ is non-increasing. We \emph{must} verify that the derivatives for the previous weights, indexed at $k<j$, remain negative. Otherwise, the procedure would need to backtrack to possibly flip some weights back up.

  More generally, with every decision for weight assignment, we seek to ensure that the condition detailed above is not violated for any weights that have been finalized. That includes the weights before $j$, and those after $j$ at the point of termination. Returning from this digression, at $k<j$ after updating $w_j$,
  \begin{equation*}
    \Delta_k = \sum_{i\leq j} \underline{w}_i (f_k-f_i) + \sum_{i>j}\overline{w}_i(f_k-f_i).
  \end{equation*}
  To glean the sign of this, we refer to a quantity that we know.
  \begin{align*}
    \sum_{i<j}\underline{w}_i (f_j-f_i) <& \sum_{i>j}\overline{w}_i(f_i-f_j)\\
    \iff \sum_{i\leq j}\underline{w}_i (f_k-f_i) <& \sum_{i>j}\overline{w}_i(f_i-f_j)+\sum_{i\leq j} \underline{w}_i(f_k-f_j)\\
    \iff \underbrace{\sum_{i\leq j}\underline{w}_i (f_k-f_i) + \sum_{i>j}\overline{w}_i(f_k-f_i)}_{\Delta_k} <& \underbrace{\sum_{i>j}\overline{w}_i(f_k-f_j)+\sum_{i\leq j} \underline{w}_i(f_k-f_j)}_{\textrm{negative.}}
  \end{align*}
  The remaining fact to be demonstrated is that upon termination, when $\Delta_j\geq 0,$ no other pseudo-derivatives $\Delta_{j'},\ j'>j$ are negative. This must be the case simply because $f_{j'}\geq f_j.$
\end{proof}

%% END OPTIMALITY

\twocolumn

%% BEGIN ILLUSTRATIVE-DETAILS

\section{On the Introductory Illustration}%\label{sup:ill-details}

\begin{figure}[!htb]\centering
  \includegraphics[width=\linewidth]{figures/taleb-2.pdf}
\caption{\label{fig:curve-flipping-more}
  Elaboration on the example in Figure~\ref{fig:curve-flipping}.
  Treatments were exponentially distributed, and the thresholds displayed in the grid controlled the center of the second sigmoid in $S^2$ due to \citet{ref:taleb}. Two different visible attributes demonstrate how the hidden bias depends on the interplay between propensity and outcome, via the hidden attribute. The blue curve is a little shorter, which allows the vulnerable subgroup's threshold change to be revealed in the data. Estimation minimized the empirical squared error. } % population -> record?
%When a confounder is distorting the assigned treatments in sub-populations, the overall population-level trend may appear flipped in comparison to each sub-population's dose response.
%We deal with the cases where the variables that separate the sub-populations are unobserved.
%Refer to \S\ref{sec:ill-details} for details on the illustration.}
\end{figure}

\begin{figure}[!hbt]\centering
  \scalebox{0.75}{
    \input{figures/curve-flipping.pgf}}\vspace{-1em}
\caption{\label{fig:curve-flipping-alt}
  A different example that shows the connection to Simpson's paradox more clearly~\citep{ref:yule,ref:simpson}.
  When a confounder is distorting the assigned treatments in sub-populations, the overall population-level trend may appear flipped in comparison to each sub-population's dose response.}
\end{figure}

%% END ILLUSTRATIVE-DETAILS

%% BEGIN RESULT-BENCHMARK-DETAILS

\section{Details on The Benchmark}%\label{sup:result-benchmark}
% quantile normalization: scaled rank orders
During each trial, 750 train and 250 test instances of (observed/hidden) confounders, treatment, and outcome were generated. The APO was computed on the test instances. 
Coverage of the dose-response curve was assessed on a treatment grid of $100$ evenly spaced points in $[0,1]$. The different violation factors $\Gamma$ that were tested were also from a $100$-sized grid in $[0,2.5]$.%from $0$ to $2.5$.
\begin{align*}
  \intertext{The data-generating process constructed vectors}
  V\coloneqq\langle \text{visible conf\dots, treatment, hidden conf\dots} \rangle\in\mathbb{R}^k
\end{align*}
where $k$ is the number of confounders plus one, for the treatment. Each of these variables is a projection of the original data with i.i.d.\ normal coefficients. We upscale the middle (i.e.\ treatment) entry by $(k-1)$ to keep the treatment effect strong enough. Then, we experiment with two functional forms of confounded dose-response curves:
\begin{itemize}
  \item (linear) mixing vector $\{M_i\}_{i=1}^k \sim \text{i.i.d.\ Normal}(0,1)$. Pre-activation outcome is $u\coloneqq M\cdot v$.
  \item (quadratic) matrix $\{M_{ij}\} \sim \text{i.i.d.\ Normal}(0,1)$. Pre-activation outcome is $u\coloneqq v^\text{T} M v$. Unlike a covariance, $M$ is not positive (semi-)definite. The fact that all entries are i.i.d.\ Gaussian implies that there are cases where the off-diagonal entries are much larger in magnitude than the on-diagonal entries, in such a way that cannot occur in a covariance matrix. This induces more confounding and strengthens our benchmark.
\end{itemize} % {(i,j)\in[k]\times[k]}

The actual outcome is Bernoulli with probability $u^\star\coloneqq\phi\big((u - m)/s\big)$, wherein $\phi$ is the standard normal CDF, location parameter $m$ is the sample median, and scale $s$ is the sample mean absolute deviation from the median. If $u$ were normal, $s$ would be expected to be a bit smaller than $\sigma$, by a factor of $\sqrt{2/\pi}$. Generally $u^\star$ is no longer uniformly distributed (on margin) because we use $s$, and instead it gravitates towards zero or one. Since the estimated outcome models use logistic sigmoid activations, there is already an intentional measure of model mismatch present in this setup. % gravitates -> concentrates

See Table~\ref{tab:benchmark-full} for results under all the settings considered.

The linear outcome and propensity predictors were estimated by maximum likelihood using the ADAM gradient-descent optimizer, with learning rate $10^1$, $4$ batches, and $50$ epochs throughout. For the outcome, we used a sigmoid activation stretched horizontally by $10^2$ for smooth training. For the propensity, similarly, we stretched a sigmoid horizontally and vertically, gating the output in order to yield Beta parameters within $(0,10^2)$.


\paragraph{Data sources.}
The datasets \texttt{brain} and \texttt{blood} both came from the UK Biobank, which is described in the case study of \S\ref{sec:result-workflow}. The two datasets are taken from disjoint subsets of all the available fields, one pertaining to parcelized brain volumes (via MRI) and the other to blood tests. The \texttt{pbmc} dataset came from single-cell RNA sequencing, a modality that is exploding in popularity for bioinformatics. PBMC data are a commonly used benchmark in the field~\citep{ref:kang}. Finally, the \texttt{mftc} dataset consisted of BERT embeddings for morally loaded tweets~\citep{ref:hoover, ref:mokhberian}.

\begin{table}[!ht]\centering
\begin{tabular}{l | r r}
Dataset & Sample Size & Dimension \\
\midrule
\texttt{brain} & 43,069 & 148 \\
\texttt{blood} & 31,811 & 42 \\
\texttt{pbmc} & 14,039 & 16 \\
\texttt{mftc} & 17,930 & 768 \\
\bottomrule
\end{tabular}
\caption{Characteristics of the various datasets employed in our experiments.}
\end{table}



Model mismatch varied with how approximately linear the true dose responses were. As expected, there was a significant negative correlation between model likelihood and divergence cost, so poorer fits had higher costs for coverage. %The applicability of the $\delta$MSM was tested over all these scenarios. % different extents

\begin{table*}[bt]\centering
  % single `multicolumn' overrides the vertical rules
  \begin{tabular}{l l l| r r | r r | r r | r r }
    \toprule
    \multicolumn{3}{l}{Benchmarks~{\large\textbackslash}~Scores} & \multicolumn{2}{c}{\tt brain} & \multicolumn{2}{c}{\tt blood} & \multicolumn{2}{c}{\tt pbmc} & \multicolumn{2}{c}{\tt mftc} \\
    %\cline{2-4}\cline{5-7}\cline{8-10}\cline{11-13}
    & & \multicolumn{1}{l}{} & mean & \multicolumn{1}{r}{median} & mean & \multicolumn{1}{r}{median} & mean & \multicolumn{1}{r}{median} & mean & \multicolumn{1}{r}{median} \\
    \midrule
    linear & 2 confounders & $\delta$MSM & $\bm{94}$ & $\bm{71}$ & $\bm{86}$ & $\bm{63}$ & $\bm{105}$ & $\bm{75}$ & $\bm{69}$ & $\bm{59}$ \\
    & & CMSM & $291$ & $253$ & $261$ & $228$ & $288$ & $259$ & $243$ & $204$ \\
    & & uniform & $116$ & $82$ & $104$ & $71$ & $128$ & $83$ & $78$ & $66$ \\
    & & binary MSM & $116$ & $90$ & $104$ & $73$ & $127$ & $94$ & $91$ & $73$  \\
    \midrule
    & 6 confounders & $\delta$MSM & $\bm{63}$ & $\bm{39}$ & $\bm{63}$ & $\bm{33}$ & $\bm{77}$ & $\bm{44}$ & $\bm{47}$ & $\bm{31}$ \\
    & & CMSM & $177$ & $111$ & $186$ & $117$ & $198$ & $136$ & $167$ & $105$ \\
    & & uniform & $68$ & $41$ & $68$ & $36$ & $83$ & $47$ & $51$ & $33$ \\
    & & binary MSM & $177$ & $176$ & $173$ & $163$ & $188$ & $195$ & $168$ & $160$ \\
    \midrule
    & 10 confounders & $\delta$MSM & $\bm{57}$ & $\bm{31}$ & $\bm{61}$ & $\bm{35}$ & $\bm{72}$ & $\bm{31}$ & $\bm{43}$ & $\bm{27}$ \\
    & & CMSM & $151$ & $81$ & $146$ & $84$ & $158$ & $84$ & $126$ & $74$ \\
    & & uniform & $58$ & $32$ & $63$ & $37$ & $73$ & $33$ & $45$ & $28$ \\
    & & binary MSM & $177$ & $181$ & $182$ & $190$ & $172$ & $170$ & $184$ & $191$ \\
    \midrule
    \ul{quadratic} & 2 confounders & $\delta$MSM & $\bm{170}$ & $\bm{151}$ & $\bm{160}$ & $\bm{139}$ & $\bm{180}$ & $\bm{160}$ & $\bm{159}$ & $\bm{144}$ \\
    & & CMSM & $301$ & $275$ & $283$ & $263$ & $299$ & $274$ & $270$ & $248$ \\
    & & uniform & $198$ & $180$ & $190$ & $166$ & $212$ & $188$ & $190$ & $167$ \\
    & & binary MSM & $205$ & $186$ & $192$ & $169$ & $217$ & $198$ & $190$ & $173$ \\
    \midrule
    & 6 confounders & $\delta$MSM & $\bm{138}$ & $\bm{103}$ & $\bm{145}$ & $\bm{120}$ & $\bm{155}$ & $\bm{134}$ & $\bm{140}$ & $\bm{112}$ \\
    & & CMSM & $216$ & $171$ & $220$ & $193$ & $239$ & $223$ & $222$ & $198$ \\
    & & uniform & $171$ & $118$ & $181$ & $149$ & $189$ & $158$ & $177$ & $132$ \\
    & & binary MSM & $217$ & $231$ & $227$ & $257$ & $230$ & $266$ & $224$ & $249$ \\
    \midrule
    & \ul{10 confounders} & $\delta$MSM & $\bm{138}$ & $\bm{101}$ & $\bm{141}$ & $\bm{100}$ & $\bm{138}$ & $\bm{104}$ & $\bm{144}$ & $\bm{117}$ \\
    & & CMSM & $186$ & $173$ & $188$ & $165$ & $205$ & $178$ & $182$ & $165$ \\
    & & uniform & $158$ & $116$ & $162$ & $108$ & $157$ & $117$ & $167$ & $140$ \\
    & & binary MSM & $211$ & $241$ & $213$ & $240$ & $222$ & $258$ & $214$ & $242$ \\
    \bottomrule
  \end{tabular}
  \caption{\label{tab:benchmark-full}The full array of experiments. Underlined settings are those shown in Table~\ref{tab:benchmark}.}
\end{table*} % <10^-3 for linear, <10^-5 for quadratic

% \ul is from the soul package, and has a prettier underline

%% END RESULT-BENCHMARK-DETAILS

%% BEGIN RESULT-WORKFLOW-DETAILS

\section{Details on The Biobank Study}%\label{sup:result-workflow} % application 11559
The application number used to access data from the UK Biobank will be mentioned in the de-anonymized manuscript. The measured outcomes were cortical thicknesses and subcortical volumes, the latter normalized by intracranial volume, obtained via structural Magnetic Resonance Imaging (MRI). The results in the main text (\S\ref{sec:result-workflow}) focused on the cortical thicknesses, for brevity. Input variables comprising the covariates and DQS treatments are listed in Table~\ref{tab:biobank-variables}. Inputs were normalized to the unit interval, and outputs were $z$-scored.

\paragraph{Training the models.}
The outcome predictors with 40 inputs and 48 outputs were implemented as multilayer perceptrons with three hidden layers of width 32, and single-skip connections. They used Swish activation functions and a unit dropout rate of $0.1$. The ADAM optimizer with learning rate $5\times 10^{-3}$ was run for $10^4$ epochs. The data were split into four non-overlapping test sets, with separate ensembles of 16 predictors trained for each split. Training sets were bootstrap-resampled for each estimator in the ensemble. The propensity was formulated as a linear model outputting Beta parameters within $(0,64)$, trained in a similar fashion. Finally, CAPOs were partially identified using the set of models from the train-test split for which the data instance belonged to the test set.

\paragraph{Additional figures.}
This exploratory study includes plots of relative effects on the various brain regions, shown in Figures~\ref{fig:biobank-sexdiff}~\&~\ref{fig:biobank-delta-bin}. We plan on studying the differential effects of diet on the brain further.

\begin{figure}[!ht]\centering
  \includegraphics[width=\linewidth]{figures/biobank-sexdiff-fig.png}
  \caption{\label{fig:biobank-sexdiff}Normalized effect differences between males and females for the overall average diet score and stratified by individual diet components. The lefthand columns depict individual effects across all cortical thickness parcellations and the righthand side shows subcortical regional volumes. Females show generally larger effects across most diet components.}
\end{figure}

\begin{figure*}[!ht]\centering
  \includegraphics[width=0.85\textwidth]{figures/biobank-delta-bin.png}
  \caption{\label{fig:biobank-delta-bin}Normalized effect differences comparing the $\delta$MSM against a shoehorned binary MSM (``$\delta$'' vs.\ ``B'') stratified by sex. Note differences in relative feature importance, where continuous modeling ranks vegetables and whole grains to be the most important compared to the binary model which emphasizes dairy, vegetable oils, refined grains (primarily for males) and fish.}
\end{figure*}



\begin{table*}[bt]
  \small
  \begin{tabular}{|p{1.9cm}|p{4.5cm}|p{6cm}|p{3cm}|}
  \hline
  \textbf{Variable} & \textbf{Features} & \textbf{Classifications} & \textbf{Data Field ID} \\
  \hline
  \multirow{3}{5em}{Demographics} & Age at scan  & - & 21003 \\\cline{2-4}
  & Sex & Male/Female & 31 \\\cline{2-4}
  & Townsend Deprivation Index & - & 189 \\\cline{2-4}
  & ApoE4 copies & 0, 1, 2 & - \\\cline{2-4}
  \hline
  Education& College/University & Yes/No &  6138\\
  \hline
  \multirow{3}{5em}{Physical Activity/ Body Composition} & American Heart Association (AHA) guidelines for weekly physical activity & Ideal ($\geq$150 min/week moderate or $\geq$75 min/wk vigorous or 150 min/week mixed); Intermediate (1--149 min/week moderate or 1--74 min/week vigorous or 1--149 min/week mixed); Poor (not performing any moderate or vigorous activity) & 884, 904,  894, 914 \\\cline{2-4}
  & Waist to Hip Ratio (WHR) & - & 48,49 \\\cline{2-4}
  & Normal WHR & Females: $\le$ 0.85; Males: $\le$ 0.90 & 48,49 \\\cline{2-4}
  & Body Mass Index (BMI) & - & 23104 \\\cline{2-4}
  & Body fat percentage & - & 23099 \\\cline{2-4}
  \hline
  \multirow{3}{5em}{Sleep} & Sleep 7-9 Hours a Night &  - & 1160 \\\cline{2-4}
  & Job Involves Night Shift Work & Never/Rarely & 3426 \\\cline{2-4}
  & Daytime Dozing/Sleeping & Never/Rarely & 1220 \\\cline{2-4}
  \hline
  \multirow{3}{5em}{Diet} & DQS 1 - Fruit & - & 1309, 1319 \\\cline{2-4}
  & DQS 2 - Vegetables & - & 1289, 1299 \\\cline{2-4}
  & DQS 3 - Whole Grains & - & 1438, 1448, 1458, 1468 \\\cline{2-4}
  & DQS 4 - Fish & - & 1329, 1339 \\\cline{2-4}
  & DQS 5 - Dairy & - & 1408, 1418 \\\cline{2-4}
  & DQS 6 - Vegetable Oil & - & 1428, 2654, 1438 \\\cline{2-4}
  & DQS 7 - Refined Grains & - & 1438, 1448, 1458, 1468 \\\cline{2-4}
  & DQS 8 - Processed Meats & - & 1349, 3680 \\\cline{2-4}
  & DQS 9 - Unprocessed Meats & - & 1369, 1379, 1389, 3680 \\\cline{2-4}
  & DQS 10 - Sugary Foods/Drinks & - & 6144 \\\cline{2-4}
  & Water intake & Glasses/day & 1528 \\\cline{2-4}
  & Tea intake & Cups/day & 1488 \\\cline{2-4}
  & Coffee intake & Cups/day & 1498 \\\cline{2-4}
  & Fish Oil Supplementation & Yes/No & 20084 \\\cline{2-4}
  & Vitamin/Mineral Supplementation & Multivitamin (with iron/ calcium/ multimineral)/ Vitamins A, B6, B12, C, D, or E/ Folic acid/ Chromium/ Magnesium/ Selenium/ Calcium/ Iron/ Zinc/ Other vitamin & 20084 \\\cline{2-4}
  & Variation in diet & Never/Rarely; Sometimes; Often & 1548 \\\cline{2-4}
  & Salt added to food & Never/Rarely; Sometimes; Usually; Always & 1478 \\\cline{2-4}
  \hline
  Smoking & Smoking status & Never; Previous; Current & 20116\\
  \hline
  Alcohol & Alcohol Frequency & Infrequent (1--3 times a month, special occasions only, or never); Occasional (1--2 a week or 3--4 times a week); Frequent (daily/almost daily and ICD conditions F10, G312, G621, I426, K292, K70, K860, T510) & 1558/ICD\\
  \hline
  \multirow{3}{5em}{Social Support} & Leisure/social activities & Sports club/gym;  pub/social; social/religious; social/adult education; other social group & 6160 \\\cline{2-4}
  & Frequency of Friends/Family Visits & Twice/week or more & 1031 \\\cline{2-4}
  & Able to Confide in Someone & Almost Daily & 2110 \\\cline{2-4}
  \hline
  \end{tabular}
  \caption{\label{tab:biobank-variables}Variables, features, classifications, and respective data fields used in the models. Diet quality scores (DQS) ranging from 0--10 for 10 components were computed using the same coding scheme as in \citet{ref:said, ref:zhuang}. Leisure/social activity classifications served as their own binary variables. Our results omitted DQS \#8 \& \#10 because they were not even approximately continuous, taking on only a few discrete values.}
\end{table*}

%% END RESULT-WORKFLOW-DETAILS

\section{Source-code Availability}
Please visit\ \ \url{https://github.com/marmarelis/TreatmentCurves.jl}.
Also in the \texttt{scripts/} subdirectory of the supplementary source, the \texttt{synthetic.jl} file recreates the semi-synthetic benchmarks, and the \texttt{biobank.jl} file sets up the case study.


\bibliography{refs}

\end{document}