\section{BACKGROUND}
\label{sec:background}
% \vk{copied from NeurIPS22, needs editing}
\paragraph{Notation}
Formally, we are given an observational dataset $\mathcal{D}=\{(x^{(i)},y^{(i)},t^{(i)})\}_{i=1}^n$ consisting of $n$ units, each characterized by features $x^{(i)} \in \mathcal{X} \subseteq \mathbb{R}^d$, a binary treatment $t^{(i)} \in \{0,1\}$, and a scalar outcome $y^{(i)} \in \mathcal{Y} \subseteq \mathbb{R}$. 
We assume $\mathcal{D}$ consists of i.i.d.~realizations of random variables $(X, Y, T) \sim P$ drawn from a data distribution $P$.
Although we assume binary treatments and scalar outcomes, our approach naturally extends beyond this setting.
The feature space $\mathcal{X}$ can be any continuous or discrete set.

\subsection{Causal Effect Estimation Using Propensity Scoring}

% \vk{copied from NeurIPS22, needs editing}

% We also use $z^{(i)} \in \mathbb{R}^p$ to model latent confounding factors that influence both the treatment and the outcome \citep{louizos2017causal}. 
% We assume that $(x^{(i)},y^{(i)},t^{(i)}, z^{(i)})$ represent i.i.d.~realizations of random variables $X, Y, T, Z \sim \mathbb{P}$ from a data distribution $\mathbb P$ that decomposes as $\mathbb{P}(X|Z)\mathbb{P}(Y|Z,T)\mathbb{P}(T|Z)\mathbb{P}(Z)$ (thus $Z$ is indeed a confounder). 
We seek to estimate the true effect of the treatment $T$ in terms of its average treatment effect (ATE):
\begin{align}
Y[x,t] &= \mathbb{E}[Y \mid X = x, \text{do}(T=t)], \\
\text{ATE} &= \mathbb{E}[Y[X,1] - Y[X,0]],
\end{align}
where $\text{do}(\cdot)$ denotes an intervention~\citep{pearl2000models}. We assume strong ignorability, i.e., $(Y(0), Y(1)) \perp T \mid X$ and $0 < P(T=t \mid X=x) < 1$ for all $x \in \mathcal{X}$, $t \in \{0, 1\}$, where $Y(0)$ and $Y(1)$ denote potential outcomes. We also make the stable unit treatment value assumption (SUTVA), which states that there is a unique value of the outcome $Y^{(i)}(t)$ corresponding to unit $i$ with input $x^{(i)}$ and treatment $t$~\citep{Rosenbaum1983-rp}. Under these assumptions, the propensity score, defined as $e(X) = P(T=1 \mid X)$, satisfies the conditional independence $(Y(0), Y(1)) \perp T \mid e(X)$~\citep{Rosenbaum1983-rp}. The propensity score also acts as a balancing score, i.e., $X \perp T \mid e(X)$.
Thus, the ATE can be expressed as $\tau = \mathbb{E} \bigg(\frac{TY}{e(X)} - \frac{(1-T)Y}{1-e(X)}\bigg).$
The Inverse Probability of Treatment Weighting (IPTW) estimator uses an approximate model $Q(T=1|X)$ of $P(T=1|X)$ to produce an estimate $\hat{\tau}_1$ of the ATE, which is computed as
\[
\hat{\tau}_1 = \frac{1}{n}\sum_{i=1}^n \bigg( \frac{t^{(i)}y^{(i)}}{Q(T=1|x^{(i)})} - \frac{(1-t^{(i)})y^{(i)}}{1-Q(T=1|x^{(i)})}\bigg).
\]
%We also define the Augmented Inverse Propensity Score Weight (AIPW) estimator in Appendix~\ref{apdx:estimators}. 
The Augmented Inverse Probability Weighting (AIPW) estimator additionally uses an outcome model $f(X, T)$ to approximate the potential outcome $Y[X, T]$, thus computing the ATE as
$\hat{\tau}_2 = \hat{\tau}_1 + \frac{1}{n}\sum_{i=1}^n \big[f(x^{(i)}, T=1)\big(1-\frac{t^{(i)}}{Q(T=1|x^{(i)})}\big)  - f(x^{(i)}, T=0)\big(1-\frac{1-t^{(i)}}{1-Q(T=1|x^{(i)})}\big)\big].$ This doubly robust estimator can produce accurate ATE estimates when either the propensity model or the outcome model is correctly specified~\citep{Robins1994estimation}.
%\vk{$t_i$ vs $t^{(i)}$}\vk{I think it's ok to use on display eqn here}
% Propensity scores are widely applied to high-dimensional, unstructured covariates~\citep{pryzant2017predicting,veitch2019using,veitch2020adapting}; however, when covariates are high-dimensional, a neural approximator for $e(X) = P(T|X)$ may output volatile and miscalibrated probabilities close to $\{0,1\}$, leading to unreliable effect estimates~\citep{kallus2020deepmatch}.


% \begin{itemize}
    
%     \item Need to define propensity scoring in detail. Check the papers on propensity scoring from David Blei's group cited in NeuriPS22 for details.
%     \item Potential outcomes framework, SUTVA, ignorability and positivity \vk{briefly?}
%     \item Except for the most standard estimator, it's okay to delegate details to the appendix.
% \end{itemize}

\subsection{Calibrated and Conformal Prediction}

This paper seeks to evaluate and improve the uncertainty of propensity scores.
% Propensity score methods require estimating probabilistic models $Q(T|X)$ of $P(T|X)$; this paper explores the evaluation and the creation of such models.
% \vk{copied from UAI2023, needs editing}
A standard tool for evaluating predictive uncertainties is a proper loss (or proper scoring rule) $L : \Delta_\mathcal{Y} \times \mathcal{Y} \to \mathbb{R}$, defined over the set of distributions $\Delta_\mathcal{Y}$ over $\mathcal{Y}$ and a realized outcome 
$y \in \mathcal{Y}$. Examples of proper losses include the L2 loss and the log-loss.
% Given another $G \in \Delta_\mathcal{Y}$, we use the notation $L(F,G)$ for the expected score
% $
% L(F,G) = \mathbb{E}_{y \sim G} L(F, y).
% $
%
% \paragraph{Calibration and Sharpness.}
% What are the qualities of a good probabilistic prediction, as measured by a proper scoring rule?
It can be shown that a proper loss decomposes into the following terms~\citep{gneiting2007probabilistic}:
$\text{proper loss} = \text{calibration} 
% \underbrace{
- \text{sharpness} + \text{irreducible term}.
%} _\text{refinement term}.
$
% Thus, there are precisely two qualities that define an ideal forecast: calibration and sharpness.
% These represent precisely the two desiderata that matter for evaluating uncertainties. 
%We examine each of them next.



\paragraph{Calibration.}
Intuitively, calibration means that a 90\% confidence interval contains the outcome about 90\% of the time.
Sharpness means that confidence intervals should be tight. Maximally tight and calibrated confidence intervals are Bayes optimal.
%
In the context of propensity scoring methods for binary treatments, we say that a propensity score model $Q$ is calibrated if the true probability of $T=1$ conditioned on predicting a probability $p$ matches the predicted probability:
\begin{equation}
    P(T=1 \mid Q(T=1 \mid X) = p) = p \quad \text{for all } p \in [0,1].
\end{equation}
% Calibration by itself is not sufficient to produce a useful forecast. For example, predicting marginal data probability $P(T = 1)$ for $T=1$ yields calibrated predictions; however it does not use the features $X$. In order to be useful, forecasts must also be {\em sharp}: intuitively, predicted confidence intervals should be as tight as possible.
% More formally, we want the entropy of the predicted probabilities to be small. 
%This is captured by proper scoring rules as part of a refinement term (see Table \ref{tbl:properlosses}), which equals an irreducible term minus a sharpness term \citep{murphy1973vector,brocker2009decomposition}. The latter is maximized when we minimize the scoring rule.


% We say that $L$ is {\em proper scoring rule} or a {\em proper loss} if it is minimized by $G$ when $G$ is the true distribution for $y$:
% $
% L(F,G) \geq L(G,G) \text{ for all $F$}.
% $
% %When $S$ is proper, we also refer to it as a proper loss.
% One example is the log-likelihood $L(F,y) = \log f(y)$. 
% %where $f$ is the probability density or probability mass function of $F$. 
% Another example is the check score for $\tau \in [0,1]$:
% \begin{equation}
% \rho_\tau(F, y) = \tau (y-F^{-1}(\tau)) \text{ if $y \geq f$ else $(1-\tau)(F^{-1}(\tau)-y)$}
% \end{equation}


\paragraph{Calibrated and Conformal Prediction.} Out of the box, most models $Q$ are not calibrated. Calibrated and conformal prediction yield calibrated forecasts by comparing observed and predicted frequencies on a held-out dataset~\citep{shafer2007tutorial,kuleshov2018accurate,angelopoulos2021gentle,vovk2005defensive}. %\vk{need refs in neurips format []}