\section{Theoretical Analysis}
\label{apdx:theoretical}
\subsection{Notation}
\sd{to be edited}
Formally, we are given an observational dataset $\mathcal{D}=\{(x^{(i)},y^{(i)},t^{(i)})\}_{i=1}^n$ consisting of $n$ units, each characterized by features $x^{(i)} \in \mathcal{X} \subseteq \mathbb{R}^d$, a binary treatment $t^{(i)} \in \{0,1\}$, and a scalar outcome $y^{(i)} \in \mathcal{Y} \subseteq \mathbb{R}$. 
We assume $\mathcal{D}$ consists of i.i.d.~realizations of random variables $X, Y, T \sim P$ from a data distribution $P$.
Although we assume binary treatments and scalar outcomes, our approach naturally extends beyond this setting.
The feature space $\mathcal{X}$ can be any continuous or discrete set.

\subsection{Calibration: a Necessary Condition for Propensity Scoring Models}
\label{apdx:calibration-necessary}
\begin{theorem}
When $Q(T|X)$ is not calibrated, an IPTW estimator based on $Q$ yields an incorrect estimate of the true causal effect almost surely.
% For each uncalibrated model $Q(T|X)$, the set of data distributions $P$ for which an IPTW estimator yields incorrect  probabilities has measure one.
\end{theorem}
\begin{proof}[Example]
Consider a toy binary setting where $\mathcal{X} = \mathcal{T} = \{0,1\}, \mathcal{Y} = \{0,1\}^2$.
% An IPTW estimator has the form 
% $
% \frac{1}{n}\sum_{i=1}^n \left( \frac{y^{(i)} \cdot t^{(i)}}{Q(t=1|x^{(i)})} + \frac{y^{(i)} \cdot (1-t^{(i)})}{Q(t=0|x^{(i)})}\right),
% $
We set $Y = (X \land T, \bar{X} \land \bar{T})$, $P(T=1|X=0)=p_0$, $P(T=1|X=1)=p_1$, and $P(X=1)=0.5$, where $\land$ is logical `AND' and $\bar{V}$ denotes the logical negation of a binary variable $V$. We see that the true ATE is $\tau=(0.5, -0.5)$. Let us assume that $Q(T=1|X=0) = q_0$ and $Q(T=1|X=1)=q_1$. Thus, with the IPTW estimator based on $Q$, we estimate $\tau' = \mathbb{E} \bigg(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\bigg) = \bigg(\frac{0.5\, p_1}{q_1}, -\frac{0.5(1-p_0)}{1-q_0}\bigg).$ The treatment effect $\tau'=\tau$ only when $q_0=p_0$ and $q_1=p_1$, which is not true if $Q$ is not calibrated. 


\sd{write a general proof}


To prove: $P(\hat{\tau}=\tau)=0$ where the IPTW estimate of ATE using \textit{uncalibrated} model $Q(T=1|X)$ is $\hat{\tau} = \mathbb{E}\bigg(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)} \bigg)$ and $\tau$ is true ATE. 
\begin{proof}[Proof]
   

For discrete $X$, assume $\mathcal{Y} = \{0, 1\}^{|\mathcal{X}|}$ such that $Y_i=X_i$. 
 
\end{proof}



% % \vk{todo: maybe complete this with some working numbers; if no space, put in appendix}
\end{proof}

\subsection{Calibrated Uncertainties Improve Propensity Scoring Models}

\label{apdx:calibrated-uncertainties-improve-propensity}

We define $\pi_{y,t}(Q) = \sum_x P(y|x,t)\frac{P(t|x)}{Q(t|x)}P(x)$ to be the estimated probability of $y$ given $t$ with a propensity score model $Q$.
It is not hard to show that the true $Y[t] := \mathbb{E}_X Y[X,t] = \mathbb{E}_X \mathbb{E}[Y|X, \mathrm{do}(T=t)]$ can be written as $\sum_{y} y \pi_{y,t}(P)$ (\sd{explain/prove?}).
Similarly, the estimate of an IPTW estimator with propensity model $Q$ in the limit of infinite data tends to $\hat Y_Q[1] - \hat Y_Q[0]$, where $\hat Y_Q[t]:= \sum_{y} y \pi_{y,t}(Q)$. We may bound the expected L1 ATE error $|Y[1] - Y[0] - (\hat Y_Q[1] - \hat Y_Q[0])|$ by $\sum_t |Y[t] - \hat Y_Q[t]| \leq \sum_t \sum_y |y| \cdot |\pi_{y,t}(P) - \pi_{y,t}(Q)|$.
\subsubsection{Bounding the Error of Causal Effect Estimation Using Proper Losses}
\label{apdx:error-bound}

The expected error $|\pi_{y,t}(P) - \pi_{y,t}(Q)|$ induced by an IPTW estimator with propensity score model $Q$ is bounded as
$$|\pi_{y,t}(P) - \pi_{y,t}(Q)| \leq \mathbb{E}_{X \sim R_{y,t}}[ \ell_\chi(P,Q)^\frac{1}{2}], $$
where $R_{y,t} \propto P(Y=y | X, T=t) P(X)$ is a data distribution and $\ell_\chi(P,Q)= \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right)^2$ is the $\chi$-squared loss between the true propensity score and the model $Q$. 

\begin{proof}[Proof]
\sd{write full proof}
Note that
$
|\pi_{y,t}(P) - \pi_{y,t}(Q)|
 \leq \mathbb{E}_{X\sim R_{y,t}} \left| 1- \frac{P(T=t|X)}{Q(T=t|X)} \right| 
 \leq \mathbb{E}_{R_{y,t}} \ell_\chi(P,Q)^\frac{1}{2}
$
\end{proof}


Idea: use this bound on the calibration error.
First, we define
$$
\tau = P(Y=1 | do(T=1)) = \sum_X P(Y=1 | X, do(T=1)) P(X) = \sum_X P(Y=1 | X, T=1) P(X)
$$
as the true treatment effect. We define
$$
\tau_0 = P(Y=1 |T=1) = \sum_X P(Y=1 | X, T=1) P(X|T=1) \propto \sum_X P(Y=1 | X, T=1) P(T=1|X) P(X)
$$
to be the naive (biased) estimate. We define
$$
\hat\tau(Q) = \sum_X P(Y=1 | X, T=1) \frac{P(X|T=1)}{Q(X|T=1)} \propto \sum_X P(Y=1 | X, T=1) \frac{P(T=1|X)}{Q(T=1|X)} P(X)
$$
as being the propensity score estimate with model $Q$. Clearly, if $Q=P$ we get the correct value. If not, we can consider the error
$$E = (\hat\tau(P) - \hat\tau(Q))^2. $$
We can form a bound on $E$ as
\begin{align*}
E 
& = (\hat\tau(P) - \hat\tau(Q))^2 \\
& \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \left( 1- \frac{P(T=1|X)}{Q(T=1|X)} \right)^2, \\
& \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \ell_\chi(P,Q),
\end{align*}
which is a type of expected Chi-Squared divergence between $P, Q$, a type of proper score. Thus when $P = Q$, we get zero error, and otherwise we get a bound.

\subsubsection{Calibration Reduces Variance of Inverse Probability Estimators}
\label{apdx:variance-reduction}

\begin{theorem}
    Let $P$ be the data distribution, and suppose that $1 - \delta > P(T|X) > \delta$ for all $T, X$ and let $Q$ be a calibrated model relative to $P$. Then $1 - \delta > Q(T|X) > \delta$ for all $T, X$ as well.
\end{theorem}
\begin{proof}[Proof]
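The proof is by contradiction. Suppose $Q(T=1|x) = q$ for some $x$ and $q < \delta$. Then because $Q$ is calibrated, of the times when we predict $q$, we have $P(T=1|Q(T=1|X) = q) = q <\delta$, which is impossible since $P(T=1|x) > \delta$ for every $x$. Symmetrically, if $q > 1-\delta$, calibration gives $P(T=1|Q(T=1|X) = q) = q > 1-\delta$, contradicting $P(T=1|x) < 1-\delta$ for every $x$; the bounds for $T=0$ follow since $Q(T=0|x) = 1 - Q(T=1|x)$.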
\end{proof}

\subsubsection{Calibration Improves the Accuracy of Causal Effect Estimation}
\label{apdx:cal-improves-accuracy}
\begin{theorem}
% A model that is separable and also calibrated achieves accurate causal effect estimation
The error of an IPTW estimator with propensity model $Q$ tends to zero as $n \to \infty$ if:
% A model $Q$ achieves accurate causal effect estimation with the IPTW estimator if
\begin{enumerate}
    \item Separability holds, i.e., $\forall X_1, X_2 \in \mathcal{X}, P(T|X_1) \neq P(T|X_2) \implies Q(T|X_1) \neq Q(T|X_2)$
    \item The model $Q$ is calibrated, i.e., $\forall q \in (0, 1), P(T=1|Q(T=1|X)=q)=q$
\end{enumerate}
\end{theorem}
% Idea: use this bound on the calibration error.
% First, we define
% $$
% \tau = P(Y=1 | do(T=1)) = \sum_X P(Y=1 | X, do(T=1)) P(X) = \sum_X P(Y=1 | X, T=1) P(X)
% $$
% as the true treatment effect. We define
% $$
% \tau_0 = P(Y=1 |T=1) = \sum_X P(Y=1 | X, T=1) P(X|T=1) \propto \sum_X P(Y=1 | X, T=1) P(T=1|X) P(X)
% $$
% to be the naive (biased) estimate. We define
% $$
% \hat\tau(Q) = \sum_X P(Y=1 | X, T=1) \frac{P(X|T=1)}{Q(X|T=1)} \propto \sum_X P(Y=1 | X, T=1) \frac{P(T=1|X)}{Q(T=1|X)} P(X)
% $$
% as being the propensity score estimate with model $Q$. Clearly, if $Q=P$ we get the correct value. If not, we can consider the error
% $$E = (\hat\tau(P) - \hat\tau(Q))^2. $$
% We can form a bound on $E$ as
% \begin{align*}
% E 
% & = (\hat\tau(P) - \hat\tau(Q))^2 \\
% & \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \left( 1- \frac{P(T=1|X)}{Q(T=1|X)} \right)^2, \\
% & \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \ell(P,Q)
% \end{align*}
% which is a type of expected Chi-Squared divergence between $P, Q$, a type of proper score. Thus when $P = Q$, we get zero error, and otherwise we get a bound.



We have treatment $T \in \{0, 1\}$, observed covariates $X \in \mathcal{X}$ and scalar outcome $Y \in \mathbb{R}$. $P(T=1|X)$ is the true treatment assignment mechanism and we train a propensity score model $Q(T=1|X)$. 

For $X_1, X_2 \in \mathcal{X}$ such that $X_1 \neq X_2$, we assume that if $P(T=1|X_1) \neq P(T=1|X_2)$ then $Q(T=1|X_1) \neq Q(T=1|X_2)$. 

If our input space $\mathcal{X}$ is discrete, then we can partition it into countably many buckets $\{B_i\}_{i=1}^{N}$ such that we have

$$ \forall i, \forall X_1, X_2 \in B_i, P(T=1|X_1)=P(T=1|X_2),  Q(T=1|X_1)=Q(T=1|X_2). $$ 

Let us assume that for each bucket $B_i$, our model $Q(T=1|X)$ outputs a value $q_i$ and the true propensity $P(T=1|X)$ is $p_i$. 

Thus, the model $Q(T=1|X)$ outputs values $\{q_i\}_{i=1}^{N}$ and true propensity $P(T=1|X)$ takes values $\{p_i\}_{i=1}^{N}$.

Thus, we have 
$\forall i, P(T=1|Q(T=1|X)=q_i) = P(T=1| X \in B_i) = p_i.$

If $Q$ is calibrated, then by definition $p_i=q_i$. 

Now, we can write the expression for ATE $\tau$ as  
\begin{align*}
   {\tau} &= E(\frac{TY}{P(T=1|X)} - \frac{(1-T)Y}{(1-P(T=1|X))}) \\
   &=\sum_{i=1}^{N}
   P(X \in B_i) E_{X \in B_i}(\frac{TY}{P(T=1|X)} - \frac{(1-T)Y}{(1-P(T=1|X))}) \\
   &=\sum_{i=1}^{N}
   P(X \in B_i) E_{X \in B_i}(\frac{TY}{p_i} - \frac{(1-T)Y}{(1-p_i)}).
\end{align*}
Using our propensity score model, we estimate 
\begin{align*}
   {\tau}' &= E(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{(1-Q(T=1|X))}) \\
   &=\sum_{i=1}^{N}
   P(X \in B_i) E_{X \in B_i}(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{(1-Q(T=1|X))}) \\
   &=\sum_{i=1}^{N}
   P(X \in B_i) E_{X \in B_i}(\frac{TY}{q_i} - \frac{(1-T)Y}{(1-q_i)}).
\end{align*}



If our model $Q$ is calibrated, $p_i = q_i$ and hence, $\tau = {\tau}'$. 

Thus, if $\forall X_1 \neq X_2, P(T=1|X_1) \neq P(T=1|X_2) \implies Q(T=1|X_1) \neq Q(T=1|X_2)$ and our model $Q$ is perfectly calibrated, then we can obtain an unbiased estimate of the treatment effect using the IPTW estimator 

$$\hat{\tau} = \frac{1}{M} \sum_{i=1}^M \left(\frac{T_i Y_i}{Q(T=1|X_i)} - \frac{(1-T_i)Y_i}{1-Q(T=1|X_i)}\right).$$

% Other ideas
% \begin{enumerate}
%     \item Bounds on the ratio $\tau'/\tau$ when a subset of buckets results in the same outcome $Q(T=1|X)$ (calibrated but not sharp for a subset of buckets)
%     \item When X is continuous, we might still derive the above when the number of buckets is countable. When buckets are uncountable, can we redo this proof using integral over buckets? Might need to add some conditions
%     \item Bounding $\tau'/\tau$ using upper bound on calibration error $\sum_{i=1}^N w_i|p_i - q_i|^2 = \sum_{i=1}^N P(X_i \in B_i)|p_i - q_i|^2 $
%     \item Calibration error as function of dataset size and relating that with $\tau'/\tau$
% \end{enumerate}
\subsection{Algorithms for Calibrated Propensity Scoring}
\label{apdx:algorithms-calibrated}
\subsubsection{Asymptotic Calibration Guarantee}
\label{apdx:asymptotic-calibration}

\begin{theorem}%[Calibration]
The model $R \circ Q$ is asymptotically calibrated and
the calibration error $\mathbb{E}[L_c(R \circ Q,S)] < \delta$ for $\delta = o(m^{-k}), k>0$ w.h.p.
% $\mathcal{L}_C \leq \epsilon$ when $T \to \infty$.
%\vk{todo: copy statement and proof to appendix}
\end{theorem}

\subsubsection{No-Regret Calibration}
\label{apdx:no-regret}
\begin{theorem}
\label{lem:loss}
The recalibrated model has asymptotically vanishing regret relative to the base model: $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(Q,T)] + \delta,$ where $\delta >0, \delta=o(1)$ as $m \to \infty$. % is a bound that decreases with $m$. 
% \vk{todo: proof to appendix}
\end{theorem}

\begin{proof}[Proof]
%The claim holds by empirical risk minimization. Since $R \circ H$ minimizes $L$, but is more expressive than $H$ and $R$ can represent the identity map (by Assumption \ref{ass:density}).
Solving Task~\ref{ass:density} implies $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(B \circ Q,T)] + \delta \leq \mathbb{E}[L(Q,T)] + \delta$; the second inequality holds because a Bayes-optimal $B$ has lower loss than an identity mapping.
\end{proof}


