\documentclass[accepted]{uai2023} % [accepted]
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsection*{References}}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{physics}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{xfrac}
\usepackage{setspace}
\usepackage{pgf}
\usepackage[ruled]{algorithm2e}
\usepackage{xcolor}
\usepackage[labelsep=period]{caption}
\usepackage{subcaption}
\usepackage{placeins}
\usepackage{wrapfig}
\usepackage{bm}
\usepackage{soul}
\usepackage[ragged]{sidecap}
\usepackage{multirow}

% behold Andrew's magic
\ifdefined\nohyperref\else\ifdefined\hypersetup
  \definecolor{mydarkblue}{rgb}{0,0.08,0.45}
  \hypersetup{ %
    pdftitle={},
    pdfsubject={},
    pdfkeywords={},
    pdfborder=0 0 0,
    pdfpagemode=UseNone,
    colorlinks=true,
    linkcolor=mydarkblue,
    citecolor=mydarkblue,
    filecolor=mydarkblue,
    urlcolor=mydarkblue,
    }
  \fi
\fi


\usepackage{amsthm}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}[theorem] % sub-counter of last theorem
\newtheorem{lemma}[theorem]{Lemma} % same counter as theorem
\newtheorem{proposition}[theorem]{Proposition}
\theoremstyle{definition}
\newtheorem{definition}{Definition}%[section]
\newtheorem{assumption}{Assumption}
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\e}{\mathrm{e}}
\DeclareMathOperator{\diff}{\mathrm{d}\!}
\DeclareMathOperator{\KL}{\mathrm{D}_{KL}\!}

\title{Partial Identification of Dose Responses with Hidden Confounders}
\author[1]{\href{mailto:myrlm@isi.edu}{Myrl~G.~Marmarelis}{}}
\author[2]{Elizabeth~Haddad}
\author[3]{Andrew~Jesson}
\author[2]{\\Neda~Jahanshad}
\author[1]{Aram~Galstyan}
\author[1]{Greg~{Ver Steeg}}

\affil[1]{USC Information Sciences Institute\\
  4676 Admiralty Way\\
  Marina del Rey, CA 90292}

\affil[2]{USC Stevens Neuroimaging and Informatics Institute\\
  4676 Admiralty Way\\
  Marina del Rey, CA 90292}

\affil[3]{University of Oxford, OATML\\
  14 Parks Road\\
  Oxford, UK OX1 3AQ}

\newcommand{\notindep}{\ensuremath{ \mathbin{\not\!\perp\!\!\!\perp} }}
\newcommand{\indep}{\ensuremath{ \mathbin{\perp\!\!\!\perp} }}

\begin{document}
\maketitle


\begin{abstract}
  Inferring causal effects of continuous-valued treatments from observational data is a crucial task promising to better inform policy- and decision-makers. 
  A critical assumption needed to identify these effects is that all confounding variables---causal parents of both the treatment and the outcome---are included as covariates. 
  Unfortunately, given observational data alone, we cannot know with certainty that this criterion is satisfied.
  Sensitivity analyses provide principled ways to give bounds on causal estimates when confounding variables are hidden.
  While much attention is focused on sensitivity analyses for discrete-valued treatments, much less is paid to continuous-valued treatments. 
  We present novel methodology to bound both average and conditional average continuous-valued treatment-effect estimates when they cannot be point identified due to hidden confounding.
  A semi-synthetic benchmark on multiple datasets shows our method giving tighter coverage of the true dose-response curve than a recently proposed continuous sensitivity model and baselines.
  Finally, we apply our method to a real-world observational case study to demonstrate the value of identifying dose-dependent causal effects.
\end{abstract}



%% BEGIN INTRO

\section{Introduction}\label{sec:intro}
% spurious correlation is NOT confounding!
% ... so that the conclusions drawn resemble those from a ...
% ... comprises the family of techniques
% I've noticed that I have the tendency to rope in idea after idea and weave them together along one "narrative", which is cool, but sometimes I need to err on the side of introducing just one concept at a time.

%  entails various data corrections
Causal inference on observational studies~\citep{ref:hill, ref:athey19} attempts to predict conclusions of alternate versions of those studies, as if they were actually properly randomized experiments.
The causal aspect is unique among inference tasks in that the goal is not prediction per se, as causal inference deals with \emph{counterfactuals}, the problem of predicting unobservables: for example, what would have been a particular patient's health outcome had she taken some medication, versus not, while keeping all else equal (\emph{ceteris paribus})?
There is quite often no way to validate the results without bringing in additional domain knowledge.
A set of putative treatments $\mathcal{T}$, often binary with a treated/untreated dichotomy, induces \emph{potential outcomes} $Y_{t\in\mathcal{T}}$. These can depend on covariates $X$ as with heterogeneous treatment effects $\E[Y_1-Y_0 \mid X]$ in the binary case. Only one outcome is ever observed: that at the assigned treatment $T$. Potential biases arise from the incomplete observation. This problem is exacerbated with more than two treatment values, especially when there are infinitely many possibilities, as in a continuum, e.g.\ $\mathcal{T}=[0,1]$. Unfortunately, many consequential decisions in life involve this kind of treatment: What dose of drug should I take? How much of \rule{1.25em}{0.5pt} should I eat/drink? How much exercise do I really need?

%It is only ever one branch of an individual's inherent \emph{potential outcomes}, induced by different treatments or lack thereof, that is realized and observed.
% realized -> actualized ? 
%The problem is similar to that of domain generalization ... where other domains are never observed or something like that

%% note: point to how our structural assumption for the hidden confounder is that it follows the observed confounders?


In an observational study, the direct causal link between assigned treatment $T$ and observed outcome $Y$ (also denoted as $Y_T$) can be influenced by indirect links modulated by \emph{confounding} variables. For instance, wealth is often a confounder in an individual's health outcome from diet, medication, or exercise. Wealth affects access to each of these ``treatments,'' and it also affects health through numerous other paths. Including the confounders as covariates in $X$ allows estimators to condition on them and disentangle the influences~\citep{ref:yao}.

It can be challenging to collect sufficient data, in terms of quality and quantity, on confounders in order to adjust a causal estimation to them.
% spasmodic!
Case in point, noisy observations of e.g.\ lifestyle confounders lead researchers to vacillate on the health implications of coffee \citep{ref:atroszko2019}, alcohol \citep{ref:ystrom2022}, and cheese \citep{ref:godos2020}.

% Double machine learning has that style of outcome and propensity \citep{ref:chernozhukov17}

% We propose a sensitivity analysis for the ... arbitrary bias due to hidden confounders.
% we consider connected sets for treatment!

For consequential real-world causal inference, it is only prudent to allow margins for some amount of hidden confounding. A major impediment to such analysis is that it is impossible to know how a hidden confounder would bias the causal effect. The role of any causal \emph{sensitivity model}~\citep{ref:cornfield,ref:rosenbaum83} is to make reasonable structural assumptions~\citep{ref:manski} about different levels of hidden confounding.
Most sensitivity analyses to hidden confounding require the treatment categories to be binary or at least discrete. This weakens empirical studies that are better specified by dose-response curves~\citep{ref:calabrese,ref:bonvini} from a continuous treatment variable. Estimated dose-response functions are indeed vulnerable in the presence of hidden confounders. Figure~\ref{fig:curve-flipping} highlights the danger of skewed observational studies that lead to biased estimates of personal toxic thresholds of treatment dosages. %demonstrates the danger of these arbitrary biases.
%This is like Simpson's paradox~\citep{ref:yule}. %ref:simpson, but yule did it first
% this is not unlike?

\begin{figure}[!htb]\centering
  \includegraphics[width=0.95\linewidth]{figures/taleb-3.pdf}\vspace{-0.5em}
\caption{\label{fig:curve-flipping}
Dose-response curves in medicine~\citep[e.g.][]{ref:taleb} can be viewed as expected potential outcomes from continuous treatments.
In this simulation (with details in \S D), there is one unobserved confounder. The empirical estimate of the population-level dose responses massively overshoots the maximum effective dosage, and would suggest treatments that were actually toxic to the population. This phenomenon persists even when the vulnerable hidden subgroup occurs more often in the population. }
\end{figure}%\vspace{-0.5em}



%ref:tan invented the MSM as we know it

\subsection{Related works}\label{sec:related-works}%\vspace{-0.5em}
There is growing interest in causal methodology for continuous treatments (or exposures, interventions), especially in the fields of econometrics \citep[e.g.][]{ref:huang, ref:tubbicke}, health sciences \citep{ref:vegetabile}, and machine learning \citep{ref:chernozhukov, ref:ghassami, ref:colangelo, ref:kallus-weighting}. So far, most scrutiny on partial identification of potential outcomes has focused on the case of discrete treatments~\citep[e.g.][]{ref:rosenbaum83, ref:louizos, ref:lim}. A number of creative approaches recently made strides in the discrete setting. Most rely on a \emph{sensitivity model} for assessing the susceptibility of causal estimands to hidden-confounding bias. A sensitivity model allows hidden confounders but restricts their possible influence on the data, with an adjustable parameter that controls the overall tightness of that restriction. % garnishes, furnishes, endows

The common discrete-treatment sensitivity models are incompatible with continuous treatments, which are needed for estimating dose-response curves.
Still, some recent attempts have been made to handle hidden confounding under more general treatment domains~\citep{ref:chernozhukov}. \citet{ref:padh2022, ref:hu2021} optimize generative models to reflect bounds on the treatment effect due to ignorance, inducing an implicit sensitivity model through functional constraints. Instrumental variables are also helpful when they are available~\citep{ref:kilbertus2020}. The CMSM~\citep{ref:jesson22} was developed in parallel to this work, and now serves as a baseline.
% the conceptualization of

For binary treatments, the Marginal Sensitivity Model (MSM) due to~\citet{ref:tan} has found widespread usage~\citep{ref:zhao, ref:veitch, ref:yin, ref:kallus, ref:jesson21}. Variations thereof include Rosenbaum's earlier sensitivity model~[\citeyear{ref:rosenbaum}] that enjoys ties to regression coefficients~\citep{ref:yadlowski}.
Alternatives to sensitivity models leverage generative modeling~\citep{ref:meresht} and robust optimization~\citep{ref:guo}.
Other perspectives require additional structure to the data-generating (\emph{observed outcome, treatment, covariates}) process. Proximal causal learning~\citep{ref:tchetgen, ref:mastouri} requires observation of proxy variables. \citet{ref:chen} rely on a large number of background variables to help filter out hidden confounding from apparent causal influences.

\subsection{Contributions}%\vspace{-0.5em}
We propose a novel sensitivity model for continuous treatments in \S\ref{sec:sensitivity}. Next, we derive general formulas (\S\ref{sec:deets}) and solve closed forms for three versions (\S\ref{sec:beta-weights}) of partially identified dose responses---for Beta, Gamma, and Gaussian treatment variables. We devise an efficient sampling algorithm (\S\ref{sec:optim}), and validate our results empirically using a semi-synthetic benchmark (\S\ref{sec:result-benchmark}) and realistic case study (\S\ref{sec:result-workflow}). % over three treatment suports

\subsection{Problem Statement}%\vspace{-0.5em} % In this work, --- lame?
Our goal is the partial identification of causal dose responses under a bounded level of possible hidden confounding. We consider any setup that grants access to two predictors~\citep{ref:chernozhukov17} that can be learned empirically and are assumed to output correct conditional distributions. These are (1) a predictor of outcomes conditioned on covariates and the assigned treatment, and (2) a predictor of the propensity of treatment assignments, taking the form of a probability density, conditioned on the covariates. % on the basis of...
The latter measures (non-)uniformity in treatment assignment for different parts of the population. % many individuals can fall under the same covariates, technically! difference between CATE and ITE
%The latter measures covariate shift: the non-uniformity in treatment assignment for different covariates. could lead to ... sources of bias?
%
%The broad goal is to measure a treatment's effect on an individual, marked by a set of covariates, while accounting for all the confounding between the covariates and the treatment variable. Effects could manifest heterogenously across individuals. In non-interventional settings, observed covariates may not entirely overlap across treatment regimens. % THIS WAS EXPLAINED EARLIER
The observed data come from a joint distribution of outcome, continuous treatment, and covariates that include any observed confounders.

\paragraph{Potential outcomes.}
Causal inference is often cast in the nomenclature of potential outcomes, due to \citet{ref:rubin}.
% meditate on the notion of potential outcomes for a bit longer, first
Our first assumption, common to Rubin's framework, is that observation tuples of outcome, assigned treatment, and covariates, $\{(y^{(i)},t^{(i)},x^{(i)})\}_{i=1}^n,$ are \emph{i.i.d.}\ draws from a single joint distribution. This subsumes the Stable Unit Treatment Value Assumption (SUTVA), where units/individuals cannot depend on one another, since they are \emph{i.i.d.}
% paragraph separation here?
The second assumption is overlap/positivity, that all treatments have a chance of assignment for every individual in the data: ${p_{T\mid X}(t\mid x)>0}$ for every ${(t,x)\in \mathcal{T}\times \mathcal{X}}$.

% make this shit clearer
The third and most challenging fundamental assumption is that of \emph{ignorability}/sufficiency:
% introduce ignorability first
${\{(Y_t)_{t\in\mathcal{T}} \indep T\} \mid X}.$
Clearly the outcome should depend on the assigned treatment, but \emph{potential outcomes} ought not to be affected by the assignment, after blocking out paths through covariates.

Our study focuses on dealing with limited violations of ignorability.
The situation is expressed formally as % from hypothetical treatment?
$\{(Y_t)_{t\in\mathcal{T}} \notindep T\} \mid X$, but more specifically, we shall introduce a sensitivity model that governs the shape and extent of that violation. % degree


% by maximum likelihood or other typical means
Let $p(y_t|x)$ denote the probability density function of \emph{potential} outcome $Y_t=y_t$ from a treatment $t\in \mathcal{T}$, given covariates $X=x$. This is what we seek to infer, while observing realized outcomes that allow us to learn the density $p(y_t|\,x,\,T=t)$. If the ignorability condition held, then $p(y_t|\,x,\,T=t)=p(y_t|x)$ due to the conditional independence. However, without ignorability, one has to marginalize over treatment assignment, requiring $p(y_t|\,x,\,T\neq t)$ because\vspace{-1em}
\begin{equation}\label{eq:main-integral} % motivation: if you just use Bayes' rule, this becomes a tautology.
  p(y_t|x) = \int_\mathcal{T} p(y_t|\,\tau,x)p(\tau|x)\diff\tau,
\end{equation}
where $p(y_t|\tau,x)$ is the distribution of potential outcomes conditioned on actual treatment $T=\tau\in \mathcal{T}$ that may differ from the potential outcome's index $t$. The density $p(\tau|x)$ is termed the nominal propensity, defining the distribution of treatment assignments for different covariate values.

% for quantifying treatment effects of the general form $\E[f(Y_t)|X]$ <-- our actual goal?

\paragraph{On notation.}
Throughout this study, $y_t$ will indicate the value of the potential outcome at treatment $t$; to disambiguate it from the \emph{assigned} treatment, $\tau$ will be used for events where $T=\tau$. For instance, we may care about the counterfactual of a smoker's $(\tau=1)$ health outcome had they not smoked $(y_{t=0}),$ where $T=0$ signifies no smoking and $T=1$ is ``full'' smoking. %We aim to develop more intuition before introducing the novelties.
We will use the shorthand $p(\cdots)$ with lowercase variables whenever working with probability densities of the corresponding variables in uppercase: % stochastic
\begin{align*}
  %p(\tau|x) \ &\textrm{ means } \ \frac{\partial}{\partial\tau}\mathbb{P}[\,T\leq \tau\mid X=x\,],\quad\textrm{and}\\
  p(y_t|\tau,x) \ &\textrm{ means } \ \frac{\partial}{\partial u}\mathbb{P}[\,Y_t\leq u\mid T=\tau,\ X=x\,]\Big\rvert_{u=y_t}.
\end{align*}

%\begin{figure}[!htb]\centering
%  \includegraphics[width=0.90\linewidth]{figures/graph-inspired.pdf}
%  \caption{\label{fig:graph}Illustration of $Z$ encompassing all confounders and variables determining potential outcomes $Y_{t\in\mathcal{T}=[0,1]}$, with observable covariate $X$ and treatment $T$. The density $p(y_t|\tau,x)$, found in the integrand of Equation~\ref{eq:main-integral}, diverges from $p(y_t|x)$ when the covariates are inadequate to block all links between assigned treatment and potential outcomes. }
%\end{figure}

\paragraph{Quantities of interest.} We attempt to impart intuition on the conditional probability densities that may be confusing.
\begin{itemize}
  \item $p(y_t|\,x)$~~[conditional potential outcome].~~A person's outcome from a treatment, disentangled from the selection bias of treatment assignment in the population. We seek to characterize this in order to (partially) identify the Conditional Average Potential Outcome (CAPO) and the Average Potential Outcome (APO):
  \begin{equation*}
    \text{CAPO}(t, x) = \E[Y_t\mid X=x];\quad \text{APO}(t) = \E[Y_t].
  \end{equation*}
  \item $p(y_t|\,\tau,x)$~~[counterfactual].~~What is the potential outcome of a person in the population characterized by $x$ and assigned treatment $\tau$? The answer changes with $\tau$ only when $x$ is inadequate to block all backdoor paths through confounders. We can estimate this for $t=\tau$. % BACKDOOR PATHS?
  \item $p(\tau|\,y_t, x)$~~[complete propensity]~~is related to the above by Bayes' rule. We distinguish it from the nominal propensity $p(\tau|x)$ because the unobservable $y_t$ possibly confers more information about the individual, again if $x$ is inadequate. The complete propensity cannot be estimated, \emph{even for} $t=\tau$; hence, this is the target of our sensitivity model.
\end{itemize}
\iffalse % I like this one, but it might not fit
Out of all the stochasticity in a potential outcome, some of it could be hidden confounding and the rest of it noise. How much of each depends on the ontology~\citep{ref:sarvet} established, whether intentionally or not. Sometimes error-prone observations of the covariates could lead to leakage of hidden confounding.
\fi

%\vspace{-0.5em}
\begin{SCfigure}[50][ht]\centering
  \includegraphics[width=0.40\linewidth]{figures/diagram.pdf}
  \caption{\label{fig:diagram}In this example, $Z$ encompasses all hidden confounders. Counterfactual $p(y_t|\,\tau,x)$ diverges from $p(y_t|\,x)$ because of the red path from $T$ to $Y_t$ through $Z$.
  %${\{(Y_t)_{t\in\mathcal{T}} \indep T\} \mid X, Z}.$
  }%$p(y_t|\tau,x)=\E_Z[p(y_t|x,z)p(\tau|z)].$}
\end{SCfigure}


%% new in camera-ready, thanks to one of the reviewers
A backdoor path between potential outcomes and treatment can manifest in several ways. Figure~\ref{fig:diagram} shows the barebones setting for hidden confounding to take place. Simply noisy observations of the confounders could leak a backdoor path. %error-prone?
It is important to understand the ontology~\citep{ref:sarvet} of the problem in order to ascribe hidden confounding to the stochasticity inherent to a potential outcome. % inherent, not intrinsic


\iffalse
\paragraph{Interpretation.}
The counterfactual $p(y_t|\tau,x)$ can be confusing. %The potential-outcomes vector $(Y_t)_{t\in[0,1]}$ of infinite dimensionality is \emph{intrinsic} to each individual with true confounder $Z$, for which $X$ is a noisy proxy.
It is important to recognize that a potential outcome $Y_t$ is \emph{intrinsic} to an individual, and does not change based on which outcome is actually realized by the assigned treatment $T$.
%The partitioning of a population's attributes between confounders and treatment can only be achieved with an ontology~\citep{ref:sarvet}. This helps for being intentional with however much of the stochasticity in the outcomes is due to dependency on the treatment, other variables, or just noise. The latter two categories must be distinguished when considering hidden confounders.
A potential outcome could have different layers of stochasticity. Some of it could be hidden confounding and the rest of it noise. How much of each depends on the ontology~\citep{ref:sarvet} established in a particular study, whether intentionally or not. Sometimes error-prone observations of the covariates could lead to incomplete accounting of all confounders, and hence some hidden confounding leaking through.
%By ``true'' confounder we refer to any set of variables that suffice to block all backdoor paths between $Y_t$ and $T$. The potential-outcomes vector would only change from knowledge of assigned treatment $T=\tau$ if it betrayed additional information about $Z$, absent in $X$, that further informed any $Y_t$. We may express $p(y_t|\tau,x)$ explicitly in terms of hypothetical true confounders as $\int p(y_t|z) p(z|\tau,x)\diff z$ because $z$ subsumes both $x$ and $\tau$. This way, $p(y_t|z)$ is the true potential outcome and $p(z|\tau,x)$ acts as a filter for how parts of the true confounder mix together into the proxy $x$ and the assigned treatment $\tau$.

\paragraph{Propensities.} The probability density $p(\tau|x)$ is termed the \emph{nominal propensity}.
A quantity often examined is the \emph{complete propensity}, specifically referring to $p(\tau|\,y_t,x)$ in our realm. The complete propensity can differ from $p(\tau|x)$ when there exists a backdoor dependency between assigned treatment $T$ and potential outcome $Y_t$, specifically through violation of ignorability. In that instance, conditioning on $y_t$ modulates the distribution. By connection through Bayes' rule, conditioning the potential outcomes $p(y_t|x)$ on assigned treatment $\tau$ modulates their distributions. Absent any hidden confounding, $p(y_t|\tau,x)=p(y_t|x)$ and Equation~\ref{eq:main-integral} trivializes. %See Figure~\ref{fig:graph} for a graphical illustration on the runaway influence of $\tau$ on the potential outcomes.

%% old words:
%It is of interest to study the impact of a bounded amount of these discrepancies onto our ignorance of $p(y_t|x)$, and the consequences on $\E[f(Y_t)|X]$ for various functions $f$. % (s,t) rather than (tau,t) ? ... An oft-examined quantity ! ... ---just like $p(y_t|\tau,x)$ from $p(y_t|x)$---
\fi

\paragraph{Sensitivity.}
Explored by \citet{ref:tan} followed by \citet{ref:kallus}, \citet{ref:jesson21}, among many others, the Marginal Sensitivity Model (MSM) serves to bound the extent of (putative) hidden confounding in the regime of binary treatments $T'\in\{0,1\}$. The MSM limits the discrepancy between the odds of treatment under the nominal propensity and the odds of treatment under the complete propensity.

\begin{definition}[The Marginal Sensitivity Model]\label{def:msm}
  For binary treatment $t'\in\{0,1\}$ and violation factor $\Gamma\geq1$, the following ratio is bounded:
  \begin{equation*}
    \Gamma^{-1}\leq\left[\frac{p(t'|x)}{1-p(t'|x)}\right]^{\,-1}\left[\frac{p(t'|\,y_{t'},x)}{1-p(t'|\,y_{t'},x)}\right]\leq\Gamma.
  \end{equation*}
\end{definition}

The confines of a binary treatment afford a number of conveniences. For instance, one probability value is sufficient to describe the whole propensity landscape on a set of conditions, $p(1-t'|\cdots)=1-p(t'|\cdots)$. As we transfer to the separate context of treatment continua, we must contend with infinitely many treatments and infinitely many potential outcomes. %upon which to possibly condition.


% towards...
\section{Continuous Sensitivity Model}\label{sec:sensitivity}
\iffalse
%We require a constraint on the fundamentally unobservable quantity $p(\tau|\,y_t,x)$ across treatment assignments $T=\tau$ and potential outcomes $Y_t=y_t$, with contrary $\tau\neq t.$
As with the MSM, our target is to associate the unlearnable complete propensity $p(\tau|\,y_t,x)$ to the learnable nominal propensity $p(\tau|x).$
% say something that we must be smooth in t for knowledge in y_t so that conditioning on a single one confers any knowledge
In other words, we seek to constrain the knowledge conferred on propensity by a single potential outcome value $y_t$. % `information` invokes information theory
%The potential-outcome variables $(Y_t)_{t\in\mathcal{T}}$ are treated as entries in an infinitely long vector.
% smooth in $t$ to some extent, if anything so that an infinitesimal slice $y_t$ of that vector carries any information.
%It is reasonable to impose that the propensity densities $p(\tau|\dots)$ are at least once differentiable in $\tau$.
What sort of analogue exists for the notion of ``odds'' in the MSM? % impose -> require?

Contrast treatment $\tau$ versus $\tau+\delta$ locally, for some infinitesimal $\delta$, at any part of the curve. A translation of the MSM might use this ratio of odds:
$\left[\frac{p(\tau+\delta|x)}{p(\tau|x)}\right]^{\,-1}\left[\frac{p(\tau+\delta|\,y_t,x)}{p(\tau|\,y_t,x)}\right]$.
 % todo? think about and establish links to statistical score functions (d-log-p)
Let us peer into one of those ratios. In logarithms, \vspace{-0.25em}
\begin{align*}
  \delta^{-1}\log\frac{p(\tau+\delta|x)}{p(\tau|x)} &= \frac{\log p(\tau+\delta|x) - \log p(\tau|x)}{\delta}
  \\ &\xrightarrow[\delta\to 0]{} \ \frac{\partial \log p(\tau|x)}{\partial \tau}
  \ \coloneqq\ \partial_\tau \log p(\tau|x),\\
\intertext{assuming the propensity densities are at least once differentiable. We arrive at the notion of an infinitesimal MSM ($\delta$MSM), tying the logarithmic derivatives of the nominal and complete propensity densities.}%$\partial_\tau \log p(\tau|\,y_t,x)$ to $\partial_\tau \log p(\tau|x)$.}
\end{align*}
\vspace{-4em}
\fi

% "Radon-Nikodym derivative or the ratio of densities with respect to a baseline measure" -- Tan
%  Assuming that $P(y_t|\,\tau+\delta,x)$ is absolutely continuous with respect to $P(y_t|\,\tau,x)$
The counterfactuals required for Equation~\ref{eq:main-integral} are almost entirely unobservable. We look to the Radon--Nikodym derivative $\omega_\delta$ of a counterfactual with respect to another~\citep{ref:tan}, quantifying their divergence between nearby treatment assignments (assuming mutual absolute continuity):
\begin{align*} % for small $\delta\neq0$
  \omega_\delta(y_t|\,\tau,x) \coloneqq&\ \frac{p(y_t|\,\tau+\delta,x)}{p(y_t|\,\tau,x)}\ =\ \ \stackrel{\text{(Bayes' rule)}}{\frac{p(\tau+\delta|\,y_t, x)p(\tau|x)}{p(\tau|\,y_t, x)p(\tau+\delta|x)}}\\
  &= \left[\frac{p(\tau+\delta|x)}{p(\tau|x)}\right]^{\,-1}\left[\frac{p(\tau+\delta|\,y_t,x)}{p(\tau|\,y_t,x)}\right].
\end{align*}
As with the MSM, we encounter a ratio of odds, here contrasting $\tau$ versus $\tau+\delta$ in the assigned-treatment continuum. Assuming the densities are at least once differentiable,
\begin{equation*}
\lim_{\delta\to 0} \delta^{-1}\log\omega_\delta(y_t|\,\tau,x) = \partial_\tau [ \log p(\tau|\,y_t,x) - \log p(\tau|x)].
\end{equation*}
By constraining $\omega_\delta$ to be close to unity, via bounds above and below, we tie the logarithmic derivatives of the nominal- and complete-propensity densities.

\begin{definition}[The Infinitesimal Marginal Sensitivity Model]\label{def:lsm}
  For treatments $t\in\mathcal{T}\subseteq\mathbb{R}$, where $\mathcal{T}$ is connected, and violation-of-ignorability factor $\Gamma\geq 1$, the $\delta$MSM requires
  \begin{equation*}
    \abs{\frac{\partial}{\partial\tau} \log \frac{p(\tau|\,y_t,x)}{p(\tau|x)}} \leq \log\Gamma
  \end{equation*} % not \partial_\tau ?
  everywhere, for all $\tau$, $t$, and $x$ combinations. This differs from the CMSM due to \citet{ref:jesson22} that considers only $t=\tau,$ and which bounds the density ratios directly.
\end{definition}
%% interesting but unnecessary?
%We crafted the $\delta$MSM with the intention of functionally mirroring the MSM---locally, on a treatment continuum. Whereas Definition~\ref{def:lsm} is stated in logarithms, Definition~\ref{def:msm} is not; the difference is merely cosmetic and hyperparameter $\Gamma$ plays an equivalent role in both structures. Nevertheless, the emergent properties are vastly different. % behavior?

%% END INTRO

%% BEGIN DETAILS

\subsection{The Complete Framework}\label{sec:deets}

%We list the core assumptions surrounding our problem.
\begin{assumption}[Bounded Hidden Confounding]\label{ass:lsm}
  Invoking Definition~\ref{def:lsm}, the violation of ignorability is constrained by a $\delta$MSM with some $\Gamma\geq 1$. % governed, controlled
\end{assumption}
\begin{assumption}[Anchor Point]\label{ass:zero}
  A special treatment value designated as zero is not informed by potential outcomes: $p(\tau=0 \mid y_t,x)=p(\tau=0 \mid x)$ for all $x$, $t$, and $y_t$.
\end{assumption} % interpreted as a lack of treatment

At this point we state the core sensitivity assumptions. %Assumption~\ref{ass:zero} states that we look for sensitivity to hidden confounders outside a control group at $T=0$. The restriction is modeled after situations where we expect a dramatically lessened vulnerability to hidden confounders for a relatively more plentiful control group.
In addition to the $\delta$MSM, we require an anchor point at $T=0$, which may be considered a lack of treatment. Strictly, we assume that hidden confounding does not affect the propensity density precisely at the anchor point. A broader interpretation is that the strength of causal effect, hence vulnerability to hidden confounders, roughly increases with $\abs{T}$. Assumption~\ref{ass:zero} is necessary to make closed-form solutions feasible. % monotonic in $\abs{T}$ is too much
%%It should be interpreted as a blind spot in the sensitivity model rather than a requirement for the underlying process.
We discuss ramifications and a relaxation in \S\ref{sec:beta-weights}.
%%There is no additional constraint, besides the $\delta$MSM itself, on how much the complete propensity function may fluctuate around any $T>0$. We motivate and validate this assumption in the real world with \S\ref{sec:results}. % no matter how close to zero

%Next, we proceed with derivations. The key to cracking open Equation~\ref{eq:main-integral} is to carve out a region inside the domain of integration where an approximation can be trusted. This will extrapolate from the infinitesimal point $T=t$ where estimation is feasible, as is shown in Figure~\ref{fig:math-outline}. % not "singular" to be confused with singularity

The unobservability of almost all counterfactuals is unique to the case of continuous treatments, since the discrete analogue would be a discrete sum with an observable term. Figure~\ref{fig:math-outline} explains our approach to solving Equation~\ref{eq:main-integral}.

% we are making a continuity argument

\begin{figure}[hb]\centering
  \includegraphics[width=0.95\linewidth]{figures/math-outline-2.pdf}
  \caption{\label{fig:math-outline}%To predict a potential outcome $Y_t$, we must integrate over the possible treatment assignments $T$ (Equation~\ref{eq:main-integral}). A difficulty arises with continuous treatments because the observable (blue) regime has no mass. We propose to inflate the knowable regime by a localized approximation. } % leveraging
  In the binary case, the red part is unobservable, but the MSM condition helps to bound that quantity. In the continuous case the integrand (Equation~\ref{eq:main-integral}) is unobservable \emph{almost everywhere} in the space of assigned treatments, except for the infinitesimal point $T=t$. In order to divide the integral into two parts (observable and unobservable) like with the binary sum, we must draw an approximation where assigned treatment and potential-outcome index are close enough. We use a soft window (yellow) to mark the validity of the approximation. Our continuous version of the MSM, the $\delta$MSM, allows us to bound the red part as well as reason about the yellow part. Covariates $X$ are omitted for brevity.}
\end{figure}


\subsection{A Partial Approximation}\label{sec:approx} % dealing with an unreliable approximation
We expand $p(y_t|\tau,x)$ around $\tau=t,$ where $p(y_t|t,x)=p(y|t,x)$ is learnable from data. Suppose that $p(y_t|\tau,x)$ is twice differentiable in $\tau$. Construct a Taylor expansion
\begin{multline}\label{eq:approx}
  p(y_t|\tau,x) = p(y_t|t,x) + (\tau-t)\partial_\tau p(y_t|\tau,x)|_{\tau=t} \\+ \frac{(\tau-t)^2}{2}\partial^2_\tau p(y_t|\tau,x)|_{\tau=t} + \mathcal{O}\big((\tau-t)^3\big).
\end{multline}
Denote with $\tilde p(y_t|\tau,x)$ an approximation of second order as laid out above. One could have stopped at lower orders but the difference in complexity is not that large. The intractable derivatives like $\partial_\tau p(y_t|\tau,x)|_{\tau=t}$ will be bounded using the $\delta$MSM machinery. Let us quantify the reliability of this approximation by a trust-weighing scheme $0\leq w_t(\tau)\leq 1,$ where typically $w_t(t)=1.$ This corresponds to the yellow part in Figure~\ref{fig:math-outline}. We argue that $w_t(\tau)$ should be narrower with lower-entropy (narrower) propensities (\S B). The possible forms of $w_t(\tau)$ are elaborated in \S\ref{sec:beta-weights}.

%% I tried putting figure here, too

Splitting Equation~\ref{eq:main-integral} along the trusted regime marked by $w_t(\tau)$, and then applying the approximation of Equation~\ref{eq:approx}, % dissect, splinter, decompose
\begin{equation}\begin{aligned}\label{eq:decompose}
  p(y_t|x) =& \int_\mathcal{T} \underbrace{w_t(\tau) p(y_t|\tau,x)p(\tau|x)\diff\tau}_\text{``observable'' (Fig.~\ref{fig:math-outline})}
  \\&+ \int_\mathcal{T} \underbrace{[1-w_t(\tau)] p(y_t|\tau,x)p(\tau|x)\diff\tau}_\text{``unobservable'' (Fig.~\ref{fig:math-outline})}\\  
  \textcolor{red!90!black}{\approx}& \int_\mathcal{T}\, \underbrace{ w_t(\tau) \textcolor{red!90!black}{\tilde p}(y_t|\tau,x)p(\tau|x)\diff\tau }_{(A)\text{ the approximated quantity}}
  \\&+ \int_\mathcal{T}\, \underbrace{ [1-w_t(\tau)] p(\tau|y_t,x)p(y_t|x)\diff\tau }_{(B)\text{ by Bayes' rule}}.
\end{aligned}\end{equation}
The intuition behind separating the integral into two parts is the following. By choosing the weights $w_t(\tau)$ so that they are close to one in the range where approximation Equation~\ref{eq:approx} is valid (yellow region in Figure~\ref{fig:math-outline}) and zero outside of this range, we can evaluate the first integral through the approximated counterfactuals. The second integral, which is effectively over the red region in Figure~\ref{fig:math-outline} and cannot be evaluated due to unobserved counterfactuals, will be bounded using the $\delta$MSM. Simplifying the second integral first,%\vspace{-0.5em}
%This separation into recoverable (A) and entirely unknown (B), demarcated by the weights, ensures that the inaccurate regimes of the approximation vanish (as $w_t(\tau)\to 0$ away from $t$) and are replaced with the ignorant quantity. We simplify part B of Equation~\ref{eq:decompose} first:\vspace{-0.5em}
\begin{multline*} % ensures -> insures?
  \int_\mathcal{T} [1-w_t(\tau)] p(\tau|\,y_t,x)p(y_t|x)\diff\tau \\= p(y_t|x)\left[1 - \int_\mathcal{T} w_t(\tau) p(\tau|\,y_t,x)\diff\tau \right].
\end{multline*} % entails..
By algebraic manipulation, we witness already that $p(y_t|x)$ shall take the form of
\begin{equation}\label{eq:approx-frac}
  p(y_t|x) \approx \frac{\int_\mathcal{T} w_t(\tau) \tilde p(y_t|\,\tau,x)p(\tau|x)\diff\tau}{\int_\mathcal{T} w_t(\tau) p(\tau|\,y_t,x)\diff\tau}.
\end{equation}
Reflecting on Assumptions \ref{ass:lsm}~\&~\ref{ass:zero}, the divergence between $p(\tau|\,y_t,x)$ and $p(\tau|x)$ is bounded, allowing characterization of the denominator in terms of the learnable $p(\tau|x)$. Similarly the derivatives in Equation~\ref{eq:approx} can be bounded. These results would be sufficient to partially identify the numerator. Without loss of generality, consider the unknown quantity $\gamma$ that can be a function of $\tau$, $y_t$, and $x$, such that%\vspace{-0.5em} % repurposed? recycled?
\begin{multline}\label{eq:small-gamma}
  \partial_\tau \log p(\tau|y_t,x) = \partial_\tau \log p(\tau|x) + \gamma(\tau|y_t,x), \\ \text{where } \abs{\gamma(\tau|y_t,x)} \leq \log \Gamma\text{ using the $\delta$MSM.}
\end{multline}
We may attempt to integrate both sides;%\vspace{-0.5em}
\begin{align}
   \int_0^{\,s}\partial_\tau \log p(\tau|\,y_t,x)\diff\tau
     =& \int_0^{\,s}\partial_\tau \log p(\tau|x)\diff\tau \notag\\
     &\ \ + \underbrace{\int_0^{\,s}\gamma(\tau|y_t,x)\diff\tau}_{\coloneqq\lambda(s|y_t,x)}. \notag\\
  \therefore \log p(\tau=s|\,y_t,x) - \log &\, p(\tau=0|\,y_t,x) \notag\\
  = \log p(\tau=s|\,x) -& \log p(\tau=0|\,x) + \lambda(s|y_t,x), \notag\\
  \therefore \log p(\tau|\,y_t,x) = \log p(\tau|&x) + \lambda(\tau|y_t,x). \notag\\
  \textrm{(by Assumption~\ref{ass:zero})}& \notag\\
\label{eq:big-lambda} % \textrm{Hence,}
   \therefore \quad p(\tau|\,y_t,x) = p(\tau|x)\Lambda&(\tau|y_t,x),\ \ \Lambda \coloneqq \exp{\lambda}.
\end{align}
One finds that $\abs{\lambda(\tau|y_t,x)} \leq \abs{\tau}\log\Gamma$ because $\lambda$ integrates $\gamma$, bounded by $\pm\log\Gamma,$ over an interval of length $\abs{\tau}$. Subsequently, $\Lambda$ is bounded by $\Gamma^{\pm\abs{\tau}}.$ These are the requisite tools for bounding $p(y_t|x)$---or an approximation thereof, erring on the side of ignorance via the trusted regime marked by $w_t(\tau).$ The derivation is completed in \S A by framing the unknown quantities in terms of $\gamma$ and $\Lambda$, culminating in Equation~\ref{eq:expectation}. % caution?. the side of..
% properly, tightly bounding...

\paragraph{Predicting potential outcomes.}
The recovery of a fully normalized probability density $\tilde p(y_t|x)$ via Equation~\ref{eq:approx-frac} is laid out below. It may be approximated with Monte Carlo or solved in closed form with specific formulations for the weights and propensity. Concretely, it takes on the form $\tilde p(y_t|x) = d(t|y_t,x)^{-1}p(y_t|t,x),$ where
\begin{multline}\label{eq:expectation}
  d(t|y_t,x) \coloneqq \E_\tau[\Lambda(\tau|y_t,x)]
    - [\gamma\Lambda](t|y_t,x)\,\E_\tau[\tau-t] \\
    - \frac{1}{2}[(\dot\gamma+\gamma^2)\Lambda](t|y_t,x)\,\E_\tau[(\tau-t)^2],
\end{multline}
and said expectations, $\E_\tau[\cdot],$ are with respect to the implicit distribution $q(\tau|t,x)\propto w_t(\tau)p(\tau|x).$ The notation $\dot\gamma$ denotes a derivative in the first argument of $\gamma(t|y_t,x).$

\begin{assumption}[Second-order Simplification]\label{ass:second-order}
  The quantity $\dot\gamma(\tau|y_t,x)$ cannot be characterized as-is. We grant that $\gamma^2$ dominates it, and consequently%\vspace{-0.25em}
  \begin{equation*}
    %(\dot\gamma+\gamma^2)\Lambda\approx \gamma^2\Lambda.
    \abs{(\dot\gamma+\gamma^2)\Lambda} \leq \abs{\gamma^2\Lambda} + \varepsilon\quad\textrm{for small }\varepsilon \geq 0.
  \end{equation*}
  %In our approximation we take $\varepsilon\to 0$.
\end{assumption}


%% this table was taken from beta-specifics.tex and moved a bit up

\renewcommand{\arraystretch}{1.1}
\begin{table*}[bt]\centering
  \begin{tabular}{l c c c c }
    \toprule
    Parametrization & Support $(\mathcal{T})$ & Params. & Precision ($r$) & Bounds for $\E_\tau[\Lambda(\tau|y_t,x)]$ \\
    \midrule
    \textsf{Beta} & $[0,1]$ & $\alpha, \beta$ & $\alpha+\beta-2$ & ${}_1F_1(\bm\alpha+1;\ \bm\alpha+\bm\beta+2;\ \pm\log\Gamma)$ \\
    & & & & where $\bm\alpha\coloneqq \bar\alpha+\alpha-2,\ \bm\beta\coloneqq \bar\beta+\beta-2$\vspace{1em}\\
    \textsf{Balanced Beta} & $[0,1]$ & $\alpha, \beta$ & $\alpha+\beta-2$ & $t\cdot\langle\textrm{the \textsf{Beta} above}\rangle + (1-t)\cdot\langle\textrm{\textsf{Beta}, mirrored}\rangle$\vspace{1em}\\
    \textsf{Gamma} & $[0,+\infty)$ & $\alpha, \beta$ & $\alpha/\beta^2$ & $[1 - (\pm\log\Gamma)/\bm\beta]^{-\bm\alpha}$ \\
    & & & & where $\bm\alpha\coloneqq \bar\alpha+\alpha-1,\ \bm\beta\coloneqq \bar\beta+\beta$\vspace{1em}\\
    \textsf{Gaussian} & $(-\infty,+\infty)$ & $\mu, \sigma$ & $1/\sigma$ & $\frac{1}{2}\exp{\bm\sigma^2(\log\Gamma)^2/2}\left(\begin{aligned}
      \Gamma^{\pm\bm\mu}[1+\erf&(\textstyle\frac{\bm\mu\pm\bm\sigma^2\log\Gamma}{\sqrt{2}\bm\sigma})] \\
      +& \\
      \Gamma^{\mp\bm\mu}[1-\erf&(\textstyle\frac{\bm\mu\mp\bm\sigma^2\log\Gamma}{\sqrt{2}\bm\sigma})]
      \end{aligned}\right)$ \vspace{0.4em}\\
      & & & & where $\bm\mu\coloneqq\frac{\mu\bar\sigma^2+\bar\mu\sigma^2}{\bar\sigma^2+\sigma^2},\ \bm\sigma^2\coloneqq\frac{\bar\sigma^2\sigma^2}{\bar\sigma^2+\sigma^2}$ \\
    \bottomrule
  \end{tabular}%\vspace{-0.50em}
  \caption{\label{tab:trusts}Candidates for propensity and trust-weighing combinations. Each row specifies the distribution---beta, beta, gamma, and Gaussian respectively---of the propensity model $p(\tau|x).$ %The trust-weighing scheme $w_t(\tau)$ takes the same form, but renormalized such that it reaches $1$ at its mode, which is set to $t$.
  The last column lists solutions for the first term of Equation~\ref{eq:expectation}~/~\ref{eq:expectation-simplified}.
  The expectation is taken over the normalized product of the propensity and the weighing scheme, which have similar forms (see \citet{ref:bromiley} for the \textsf{Gaussian} case).
  We distinguish the replicated parameters between propensity and weight by placing a bar over the propensity parameters. So if the propensity is $x\mapsto(\bar\alpha, \bar\beta)$, then the weighing scheme has $t\mapsto(\alpha, \beta).$ The bold parameters are of the compound density, with respect to which the first and second moments are computed in Equation~\ref{eq:expectation}~/~\ref{eq:expectation-simplified}.}
\end{table*} % ... really functions of $x$ ...
\renewcommand{\arraystretch}{1}





 
To make use of the formula in Equation~\ref{eq:expectation}, one first obtains the set of admissible $d(t|y_t,x)\in\big[\,\underline{d}(t|y_t,x), \overline{d}(t|y_t,x)\,\big]$ that violate ignorability up to a factor $\Gamma$ according to the $\delta$MSM. With the negative side of the $\pm$ corresponding to $\underline{d}$ and the positive side to $\overline{d}$, the bounds are expressible as%\vspace{-0.5em}
\begin{multline}\label{eq:expectation-simplified}
  \Big(\underline{d},\,\overline{d}\Big)=\int_\mathcal{T} \Gamma^{\pm\abs{\tau}}q(\tau|t,x)\diff\tau \quad \text{\textcolor{red!90!black}{$\longrightarrow\E_\tau[\Lambda(\tau|y_t,x)]$}} \\
    + (\pm\log\Gamma)\Gamma^\abs{t} \,\abs{\int_\mathcal{T} (\tau-t)q(\tau|t,x)\diff\tau} \\
    + \frac{1}{2}\Big(0,\ \log^2\Gamma\Big)\Gamma^\abs{t}\,\int_\mathcal{T} (\tau-t)^2 q(\tau|t,x)\diff\tau.
\end{multline}
The $\Gamma^{\pm\abs{\tau}}$ in the first integral, as well as the alternating sign of the other two terms taken together, reveal that $\underline{d}\leq 1 \leq \overline{d}$ with equality at $\Gamma=1$. This is noteworthy because it implies that $p(y|t,x)$ is admissible for the partially identified $\tilde p(y_t|x).$ We cannot describe $p(y_t|x)$ once $\underline{d}$ crosses zero.


\paragraph{Ensembles.}
%One should exploit the parcelization of $\tilde p(y_t|x)$ into a predictor $p(y_t|t,x)$ as the numerator and a nominal propensity model $p(\tau|x)$ in the denominator.
To quantify empirical uncertainties~\citep{ref:jesson20} alongside our sensitivity, the predictors could be learned as ensembles, with $\tilde p(y_t|x)$ computed as (bootstrap resampled~\citep{ref:lo}) expectations over them. %for confidence intervals via its ensembled components.


%% END DETAILS

%% BEGIN BETA-SPECIFICS





\subsection{Propensity-Trust Combinations}\label{sec:beta-weights} % Tractable
In addition to developing the general framework above, we derive analytical forms for a myriad of parametrizations that span the relevant supports $\mathcal{T}$ for continuous treatments: the unit interval $[0,1]$, the nonnegative reals $[0,+\infty)$, and the real number line $(-\infty,+\infty).$ For some nominal propensity distributions $p(\tau|x),$ we propose trust-weighing schemes $w_t(\tau)$ with shared form so that the expectations in Equation~\ref{eq:expectation-simplified} are solvable.

For instance, consider the parametrization $(T\mid X=x) \sim \textrm{Beta}(\alpha(x), \beta(x))$. We select a Beta-like weighing scheme, rescaled and translated, $w^\text{beta}_t(\tau) = c_t \tau^{a_t-1} (1-\tau)^{b_t-1}$. Two constraints are imposed on every $w_t(\tau)$ studied herein:
\begin{itemize}\vspace{-0.5em}
  \item \emph{(the mode)} that $w_t(\tau)$ peaks at $\tau=t$, and $w_t(t)=1$.
  \item \emph{(the precision)} that some $r>0$ defines a narrowness of the form, and can be set a priori.
\end{itemize} 

For the beta version we chose $a_t+b_t=r+2.$ These constraints imply that $a_t\coloneqq rt+1$, $b_t\coloneqq r(1-t)+1$, and $c^{-1}_t\coloneqq t^{rt}(1-t)^{r(1-t)}$.
%Constraining a more complex dispersion statistic like variance is much more difficult.

%\vspace{-1em}

\begin{figure}[!ht]\centering
  \scalebox{0.85}{
    \input{figures/weights.pgf}}\vspace{-1em}
  \caption{\label{fig:weights}Beta parametrizations for $w_t(\tau)$ in the unit square, plotted for $t=0.125, 0.25, 0.5$. Trust declines with $r$.}
  % Shapes are symmetrical about $t=0.5$.
\end{figure}


\paragraph{The choices.}
We present solutions for propensity-trust combinations in Table~\ref{tab:trusts}. \textsf{Balanced Beta} stands out by not strictly obeying Assumption~\ref{ass:zero}. Rather, it adheres to a symmetrized mixture that is more adaptable to realistic situations. % stratification

%\begin{center} % has more spacing than \centering
%\framebox[0.97\linewidth]{
%\begin{minipage}{0.93\linewidth} % framebox cannot accept newlines otherwise
%  {\sf Balanced Beta} --- A Note \vspace{0.67em}
\paragraph{\textsf{\textbf{Balanced Beta}}.}
  Formally, for all $t$, $y_t$, and $x$, we balance the {\sf Beta} parametrization by replacing Assumption~\ref{ass:zero} with

  $\begin{cases}
    \quad p(\,\tau=0 \mid y_t,x)=p(\,\tau=0 \mid x) &\quad \text{w.p.}\quad t,\\
    \quad p(\,\tau=1 \mid y_t,x)=p(\,\tau=1 \mid x) &\quad \text{w.p.}\quad 1-t.
  \end{cases}$

  This special parametrization deserves further justification.
  The premise is that distant treatments are decoupled; treatment assignment $\tau$ shares less information with a distal potential outcome $y_t$ than a proximal one. 
  If that were the case, then the above linear interpolation favors the less informative anchor points for a given $t$. This is helpful because the sensitivity analysis is vulnerable to the anchor points.
  % Suppose that one were assessing ...
  Stratifying the anchor points eventually leads to an arithmetic mixture of $d(t|y_t,x)$ in Equation~\ref{eq:expectation} with its mirrored version about $t\mapsto1-t,$ and $(\alpha,\beta)\mapsto (\beta,\alpha).$
% can simply talking about things involving y_t's help people digest the meaning of potential outcomes? are there positive externalities to this text, in other words?
%\end{minipage}
%}\end{center}

\paragraph{Controlling trust.}
The absolute error of the approximation in Equation~\ref{eq:decompose}.A is bounded above by a form that could grow with narrower propensities (see \S B), in the Beta parametrization.
%Noting that
%\begin{enumerate}\vspace{-0.75em}
%  \itemsep0em
%  \item narrower \emph{complete} propensities in $\tau$ worsen the approximation of Equation~\ref{eq:approx}, and
%  \item the complete propensity is coupled to the nominal propensity, namely by the $\delta$MSM,
%\end{enumerate}\vspace{-0.75em}
Intuitively, the error also depends on the smoothness of the complete propensity (the Taylor residual). For that reason we used the heuristic of setting the trust-weighing precision $r$ to the nominal propensity precision.

%% END BETA-SPECIFICS

%% BEGIN ANALYSIS

\section{Estimating The Intervals}\label{sec:optim}
We seek to bound partially identified expectations with respect to the true potential-outcome densities, which are constrained according to Equation~\ref{eq:expectation}~/~\ref{eq:expectation-simplified}. The quantities of interest are the Average Potential Outcome (APO), $\E[f(Y_t)]$, and Conditional Average Potential Outcome (CAPO), $\E[f(Y_t)|X=x]$, for any task-specific $f(y)$. We use Monte Carlo over $m$ realizations $y_i$ drawn from proposal density $g(y)$, and covariates from a subsample of instances: % Monte Carlo!
\begin{multline}\label{eq:importance-sampling}
\tilde \E[f(Y_t)\mid X\in\{x^{(j)}\}_{j\in J}] = \\ \frac{\sum_{i=1}^m \sum_{j\in J} f(y_i)\, \tilde p(y_t=y_i\mid x^{(j)})/g(y_i)}{\sum_{i=1}^m \sum_{j\in J} \tilde p(y_t=y_i\mid x^{(j)})/g(y_i)},
\end{multline}
where $J\subseteq \{1\dots n\}$ indexes a subset of the finite instances. $\abs{J}=1$ recovers the formula for the CAPO, and $\abs{J}=n$ for the APO. % is the size of the whole sample.
%Even though $\tilde p(y_t|x)$ is a normalized probability density, it contains partially identified quantities. It is untenable to constrain a search along the candidate values for each $d(t|y_t=y_i,x)$ to even approximately ensure $\int_\mathcal{Y}\tilde p(y_t=y|x)\diff y=1$.
The partially identified $\tilde p(y_t|x)$ really encompasses a set of probability densities that includes $p(y|t,x)$ and smooth deviations from it. Our importance sampler ensures normalization~\citep{ref:tokdar}, but is overly conservative~\citep{ref:dorn22}.
%For this reason the bias of an estimator without the corrective denominator of Equation~\ref{eq:importance-sampling} would be uncontrollable~\citep{ref:tokdar}.
For current purposes, a greedy algorithm may be deployed to maximize (or minimize) Equation~\ref{eq:importance-sampling} by optimizing the weights $w_i$ attached to each $f(y_i)$, within the range
\begin{equation*}
  \underline{w}_i \coloneqq \frac{p(y_i|t,x)}{\overline{d}(t|y_i,x)g(y_i)},\qquad \overline{w}_i \coloneqq \frac{p(y_i|t,x)}{\underline{d}(t|y_i,x)g(y_i)}.
\end{equation*}
Our Algorithm~\ref{alg:minimax} adapts the method of \citet{ref:jesson21, ref:kallus} to heterogeneous weight bounds $[\underline{w}_i, \overline{w}_i]$ per draw $i$. View a proof of correctness in \S C.

Others have framed the APO as the averaged CAPOs, and left the min/max optimizations on the CAPO level~\citep{ref:jesson22}. We optimize the APO directly, but have not studied the impact of one choice versus the other.

\begin{algorithm}
  \SetKwInOut{Input}{Input}\SetKwInOut{Output}{Output}
  \caption{The expectation maximizer, with $\mathcal{O}(n)$ runtime if intermediate $\Delta_j$ results are memoized. \label{alg:minimax}}
  \Input{$\{(\underline{w}_i, \overline{w}_i, f_i)\}_{i=1}^n$ ordered by ascending $f_i$.}
  \Output{$\max_{w}\E[f(X)]$ estimated by importance sampling with $n$ draws.}
  \vspace{0.4em}
  Initialize $w_i \gets \overline{w}_i$ for all $i=1,2,\dots n$\;
  \For{$j=1,2,\dots n$}{
    Compute $\Delta_j\coloneqq \sum_{i=1}^n w_i(f_j-f_i)$\;
    \eIf{$\Delta_j<0$}{
      $w_j\gets \underline{w}_j$\;
    }{
      break\;
    }
  }
  Return $\sum_i w_i f_i / \sum_i w_i$
\end{algorithm}

%% END ANALYSIS

%% BEGIN RESULT-BENCHMARK

\begin{table*}[bt]\centering
  % single `multicolumn' overrides the vertical rules
  \begin{tabular}{l| c c | c c | c c | c c | r r }
    \toprule
    Benchmarks & \multicolumn{2}{c}{\tt brain} & \multicolumn{2}{c}{\tt blood} & \multicolumn{2}{c}{\tt pbmc} & \multicolumn{2}{c|}{\tt mftc} & & \multicolumn{1}{c}{ratio} \\

    %\multicolumn{1}{l}{} & mean (std.) & \multicolumn{1}{c}{\!median} & mean (std.) & \multicolumn{1}{c}{\!median} & mean (std.) & \multicolumn{1}{c}{\!median} & mean (std.) & \multicolumn{1}{c|}{\!median} & best \\
    & mean & \multicolumn{1}{c}{(std.)} & mean & \multicolumn{1}{c}{(std.)} & mean & \multicolumn{1}{c}{(std.)} & mean & \multicolumn{1}{c|}{(std.)} & \% best & \multicolumn{1}{c}{to best} \\
    \midrule
    $\delta$MSM (ours) & $\bm{138}$ & $(120)$ & $\bm{141}$ & $(129)$ & $\bm{138}$ & $(121)$ & $\bm{144}$ & $(124)$ & $\bm{78.4}$ & $\bm{1.03}~(0.08)$ \\
    CMSM & $186$ & $(153)$ & $188$ & $(156)$ & $205$ & $(169)$ & $182$ & $(145)$ & $7.8$ & $1.81~(2.15)$ \\
    uniform & $158$ & $(137)$ & $162$ & $(146)$ & $157$ & $(136)$ & $167$ & $(141)$ & $4.8$ & $1.20~(0.10)$ \\
    binary MSM & $211$ & $(128)$ & $213$ & $(131)$ & $222$ & $(127)$ & $214$ & $(127)$ & $9.0$ & $2.57~(2.34)$ \\
    \bottomrule
  \end{tabular}
  \caption{\label{tab:benchmark}Semi-synthetic benchmark: divergence \underline{costs} of 90\% coverage of the Average Potential Outcome (APO), multiplied by $1000$. The four \texttt{datasets} are listed on top. We report averages over 500 trials per experiment. A paired $t$-test and sign test, roughly corresponding to the mean and median, showed significant improvement by the $\delta$MSM over the others with all $P < 10^{-5}.$
  ``\% best'' reports the percentage of trials in which each method outperformed the rest, and ``ratio to best'' is the average ratio of each method's cost to that of the best method in each trial---closer to one is better.}
\end{table*}


\section{A Semi-synthetic Benchmark}\label{sec:result-benchmark}
It is common practice to test causal methods, especially under novel settings, with real datasets but synthetic outcomes~\citep{ref:curth, ref:cristali}. We adopted four exceedingly diverse datasets spanning health, bioinformatics, and social-science sources. Our variable-generating process preserved the statistical idiosyncrasies of each dataset. Confounders and treatment were random projections of the data, which were quantile-normalized for uniform marginals in the unit interval.
Half the confounders were observed as covariates and the other half were hidden.
The outcome was Bernoulli with random linear or quadratic forms mixing the variables before passing through a normal CDF activation function. % link?
Outcome and propensity models were linear and estimated by maximum likelihood.
See \S E.

\paragraph{Selecting the baselines.}
The $\delta$MSM with \textsf{Balanced Beta} was benchmarked against three relevant baselines.
\begin{itemize}%\vspace{-0.5em}
  \itemsep0.25em
  \item (CMSM)~~Use the recent model by \citet{ref:jesson22}, where $\underline{d}\coloneqq \Gamma^{-1}p(\tau|x),\ \overline{d}\coloneqq \Gamma^{+1}p(\tau|x)$.
  \item (uniform)~~Suppose $\underline{d}\coloneqq \Gamma^{-1},\ \overline{d}\coloneqq \Gamma^{+1}$, as if the propensity were uniform and constant.
  \item (binary MSM)~~Shoehorn the propensity into the classic MSM~\citep{ref:tan} by considering the treatment as binary with indicator $\mathbb{I}[T>0.5].$ % threshold
\end{itemize} % "uniform" or "uninformed"? latter is meaner
% note that Jesson has a different way of computing the APO ignorance intervals, simply by averaging the CAPOs.
Note that the CMSM becomes equivalent to the ``uniform'' baseline above when CAPOs are concerned (Equation~\ref{eq:importance-sampling} with $\abs{J}=1$), which are not studied in this benchmark.

\begin{figure}[!ht]\centering%\vspace{-0.75em}
  \scalebox{0.75}{
    \input{figures/divergence-cost.pgf}}\vspace{-1em}
  \caption{\label{fig:divergence-cost}Divergence cost measures the size of the ignorance intervals (blue), weighted by the badness of each estimate (red). The black line is the true APO. Coverage is the portion of treatments contained in the blue shaded region, between A and B in this example. We target 90\% of the unit interval in our benchmark with Beta-distributed treatments. }
\end{figure}


\paragraph{Scoring the coverages.}%\label{sec:coverage-cost}
A reasonable goal would be to achieve a certain amount of coverage~\citep{ref:mccandless} of the true APOs, like having 90\% of the curve be contained in the ignorance intervals. % engulfed
Since violation factor $\Gamma$ is not entirely interpretable, nor commensurable across sensitivity models, we measure the size of an ignorance interval via a cost incurred in terms of actionable inference. For each point $t$ of the dose-response curve, we integrated the KL divergence of the actual APO (which defines the $Y_t$ Bernoulli parameter) against the predicted APO uniformly between the bounds. This way, each additional unit of ignorance interval is weighed by its information-theoretic approximation cost. This score is a \emph{divergence cost} of a target coverage. % $(\underline{y}_t, \overline{y}_t)$


\paragraph{Analysis.}
The main results are displayed in Table~\ref{tab:benchmark}. There were ten confounders and the true dose-response curve was a random quadratic form in the treatment and confounders. Other settings are shown in Supplementary~Table~4.
Each trial exhibited completely new projections and outcome function. There were different levels and types of confounding as well as varying model fits. Still, clear patterns are evident in Table~\ref{tab:benchmark}, like the rate at which the $\delta$MSM provided the lowest divergence cost against the baselines. %The total amount of confounding could be approximated by the Mutual Information (MI) between $T$ and $(X,Z)$, where $Z$ is the hidden confounders, since the outcome is known to be a function of all of them. Under the simplest experimental setting (2 confounders, linear outcome) where MI estimation was most feasible, we found a Pearson correlation between divergence cost and confounding MI of $-0.11,\ (-0.15, -0.06)$ for the $\delta$MSM

%% LINEAR
%2 confounders, $\delta$MSM $-0.41$ $(-0.44, -0.37)$, CMSM $-0.08$ $(-0.13, -0.04)$
%6 confounders, $\delta$MSM $-0.36$ $(-0.39, -0.32)$, CMSM $-0.37$ $(-0.41, -0.33)$
%10 confounders, $\delta$MSM $-0.33$ $(-0.37, -0.29)$, CMSM $-0.45$ $(-0.48, -0.41)$

%2 confounders cross correlation $0.36$ $(0.32, 0.40)$,
%6 confounders cross correlation $0.56$ $(0.53, 0.59)$,
%10 confounders cross correlation $0.63$ $(0.60, 0.66)$


\begin{figure}[!ht]\centering % used to be SCfigure
  \scalebox{0.75}{
    \input{figures/coverage.pgf}}\vspace{-1em}
    % being the side caption a little higher
  \caption{\label{fig:coverage}Performance for different coverages. Black line: rate of $\delta$MSM achieving lowest divergence cost compared to baselines. Dashed line: expected rate if the chance of any one method outperforming another were identical.}
\end{figure}

%% END RESULT-BENCHMARK

%% BEGIN RESULT-WORKFLOW

\section{A Real-world Exemplar}\label{sec:result-workflow}

The UK Biobank~\citep{ref:bycroft} is a large, densely phenotyped epidemiological study with brain imaging. We preprocessed 40 attributes, eight of which were continuous diet quality scores (DQSs)~\citep{ref:said, ref:zhuang} valued 0--10 and serving as treatments, on 42,032 people. The outcome was thicknesses of 34 cortical brain regions. A poor DQS could translate to noticeable atrophy in the brain of some older individuals, depending on their attributes~\citep{ref:gu,ref:melo}.

Continuous treatments enable the (Conditional) Average Causal Derivative, {(C)ACD $\coloneqq \partial\E[Y_t|X]\,/\,\partial t$}. The CACD informs investigators on the incremental change in outcome due to a small change in an individual's given treatment. For instance, it may be useful to identify the individuals who would benefit the most from an incremental improvement in diet. We plotted the age distributions of the top 1\% individuals by CACD (diet $\to$ cortical thickness) in Figure~\ref{fig:biobank-age}.

\begin{figure}[!hb]%\centering\vspace{-0.50em}
  \scalebox{0.75}{
    \input{figures/biobank-age.pgf}}\vspace{-1em}
  \caption{\label{fig:biobank-age}When we apply the $\delta$MSM $(\Gamma>1)$ for partial identification, the individuals with the top 1\% causal derivatives of cortical thickness with respect to DQSs skew even older. This is expected logically because older people have more years during which they could have revised their diets. Red dotted line corresponds to the entire population.}
\end{figure}

We also compared the $\delta$MSM to an equivalent binary MSM where CACDs are computed in the latter case by thresholding the binary propensity at $t$. Each model's violation factor $\Gamma$ was set for an equivalent amount ($\sim$30\%) of nonzero CACDs. Under the $\delta$MSM, the DQSs with strongest average marginal benefit ranked as vegetables, whole grains, and then meat, for both females and males. They differed under the binary MSM, with meat, then whole grains as the top for females and dairy, then refined grains as the top for males.


%Table~\ref{tab:diets} lists the DQSs with the strongest CACDs on average, for the two sensitivity models and two sexes. The top three DQSs appear more consistent under the $\delta$MSM.

%\begin{table}[!ht]\centering
%  \begin{tabular}{l l| r r r r}
%    \toprule
%    Model & Sex & \#1 & \#2 & \#3 \\
%    \midrule
%    $\delta$MSM & female & veg. & whole gr. & meat \\
%    & male & veg. & whole gr. & meat \\
%    binary MSM & female & meat & whole gr. & dairy \\
%     & male & dairy & refined gr. & whole gr. \\
%    \bottomrule
%  \end{tabular}\vspace{-0.75em}
%  \caption{\label{tab:diets}Ranked DQSs by strongest average CACD, stratified by sensitivity model and sex.\\Abbreviations: ``veg.'' -- vegetables, ``gr.'' -- grains.}
%\end{table}\vspace{-0.50em}


%% END RESULT-WORKFLOW

%% BEGIN CONCLUSION

\section{Discussion}\label{sec:discussion}

%We hereby introduce a general, systematic framework under which to perform sensitivity analyses of dose responses to hidden confounders. 

%\paragraph{Ethical implications.}
Sensitivity analyses for hidden confounders can help to guard against erroneous conclusions from observational studies. We generalized the practice to causal dose-response curves, thereby increasing its practical applicability. However, there is no replacement for an actual interventional study, and researchers must be careful to maintain a healthy degree of skepticism towards observational results even after properly calibrating the partially identified effects.

Specifically for Average Potential Outcomes (APOs) via the sample-based algorithm, we demonstrated widespread applicability of the $\delta$MSM in \S\ref{sec:result-benchmark} by showing that it provided tighter ignorance intervals than the recent CMSM and other models for $78\%$ of all trials, notwithstanding the wide variation in scenarios tested. When we ablate the approximation in Equation~\ref{eq:approx} by dropping the quadratic term, that percentage falls slightly to $74\%$. Going further and keeping just the constant term results in a large drop to $7\%$. This result suggests that the proposed Taylor expansion (Equation~\ref{eq:approx}) is useful, and that terms of higher order would likely give little additional value.
% in our sample-based algorithm specifically?

% highly realistic?
We showcased sensible behaviors of the $\delta$MSM in a real observational case study (\S\ref{sec:result-workflow}), e.g.\ how older people would be more impacted by (retroactive) changes to their reported diets. Additionally, the top effectual DQSs appeared more consistent across sexes under the $\delta$MSM than under the binary MSM.

\paragraph{Contrasting the CMSM.}
Another recently proposed sensitivity model for continuous-valued treatments is the CMSM~\citep{ref:jesson22}, which was included in our benchmark, \S\ref{sec:result-benchmark}. Unlike the $\delta$MSM, the CMSM does not always guarantee $\underline{d}\leq 1 \leq \overline{d}$ and therefore $p(y|t,x)$ need not be admissible for $\tilde p(y_t|x)$. For partial identification of the CAPO with importance sampling, the propensity density factors out and does not affect outcome sensitivity under the CMSM. For that implementation it happens that $p(y|t,x)$ is indeed admissible. However, we believe that the nominal propensity should play a role in the CAPO's sensitivity to hidden confounders, as both the CMSM and the $\delta$MSM couple the hidden confounding (via the complete propensity) to the nominal propensity. Equations~\ref{eq:expectation}~\&~\ref{eq:expectation-simplified} make it clear that the propensity plays a key role in outcome sensitivity under the $\delta$MSM for both CAPO and APO. We remind the reader of the original MSM that bounds a ratio of complete and nominal propensity odds. The $\delta$MSM takes that structure to the infinitesimal limit and maintains the original desirable property of $p(y|t,x)$ admissibility for $\tilde p(y_t|x)$.


\paragraph{Looking ahead.}
Alternatives to the sampling-based Algorithm~\ref{alg:minimax} deserve further investigation, and not only for computing ignorance intervals on expectations. Our analytical solutions bound the density function $p(y_t|x)$ of conditional potential outcomes, which can generate other quantities of interest~\citep{ref:kallus22} or play a role in larger pipelines. Further, an open challenge with the $\delta$MSM would be to find a pragmatic solution to sharp partial identification. Recent work has introduced sharpness to binary-treatment sensitivity analysis~\citep{ref:oprescu23}.

%Our analytical bounds for the potential-outcome density $p(y_t|x)$ could be harnessed in multiple other ways, perhaps incorporating sharpness guarantees~\citep{ref:dorn}.

%We wish also to consider the implications of the $\delta$MSM, perhaps under new general frameworks~\citep{ref:chernozhukov}.
%We shall also investigate the superficial similarity of the $\delta$MSM to the Riesz representers for the ACD under the general framework of~\citet{ref:chernozhukov}.


\section{Conclusion}
We recommend the novel $\delta$MSM for causal sensitivity analyses with continuous-valued treatments. The simple and practical Monte Carlo estimator for the APO and CAPO (Algorithm~\ref{alg:minimax}) gives tighter ignorance intervals with the $\delta$MSM than with alternatives. We believe that the partial identification of the potential-outcome density shown in Equation~\ref{eq:expectation-simplified}, in conjunction with the parametric formulas of Table~\ref{tab:trusts}, is of general applicability for causal inference in real-world problems. The variety of settings presented in that table allows a domain-informed selection of realistic sensitivity assumptions. For instance, when estimating the effect of a real-valued variable's deviations from some base value, like a region's current temperature compared to its historical average, the \textsf{Gaussian} scheme could be used. \textsf{Gamma} is ideal for one-sided or unidirectional deviations. Finally, \textsf{Balanced Beta} is recommended for measurements in an interval where neither of the endpoints is special.

%% END CONCLUSION


\begin{acknowledgements}
  This work was funded in part by the Defense Advanced Research Projects Agency (DARPA) and the Army Research Office (ARO) under Contract No.\ W911NF-21-C-0002.
\end{acknowledgements}


\bibliography{refs}


\end{document}
