
% \newpage
\section{THEORETICAL ANALYSIS}

\label{apdx:theoretical}
\subsection{Notation}
As described in Section~\ref{sec:background}, we are given an observational dataset $\mathcal{D}=\{(x^{(i)},y^{(i)},t^{(i)})\}_{i=1}^n$ consisting of $n$ units, each characterized by features $x^{(i)} \in \mathcal{X} \subseteq \mathbb{R}^d$, a binary treatment $t^{(i)} \in \{0,1\}$, and a scalar outcome $y^{(i)} \in \mathcal{Y} \subseteq \mathbb{R}$. 
We assume $\mathcal{D}$ consists of i.i.d.~realizations of random variables $X, Y, T \sim P$ from a data distribution $P$.
Although we assume binary treatments and scalar outcomes, our approach naturally extends beyond this setting.
The feature space $\mathcal{X}$ can be any continuous or discrete set.

\subsection{Calibration: a Necessary Condition for Propensity Scoring Models}
\label{apdx:calibration-necessary}
\begin{theorem}
 When $Q(T|X)$ is not calibrated, there exists an outcome function such that an IPTW estimator based on $Q$ yields an incorrect estimate of the true causal effect almost surely.
% For each uncalibrated model $Q(T|X)$, the set of data distributions $P$ for which an IPTW estimator yields incorrect  probabilities has measure one.
\end{theorem}
% \sd{We don't want to show that when $Q(T|X)$ is calibrated then IPTW based on $Q$ yields correct estimates..}
\begin{proof}[Example]
Consider a toy binary setting where $\mathcal{X} = \mathcal{T} = \{0,1\}, \mathcal{Y} = \{0,1\}^2$.

We set $Y = (X \wedge T, \bar{X} \wedge \bar{T}) $, $ P(T=1|X=0)=p_0,  P(T=1|X=1)=p_1$ and $P(X=1)=0.5$, where $\wedge$ is logical `AND' and $\bar{V}$ denotes logical negation of binary variable $V$. We see that the true ATE is $\tau=(0.5, -0.5)$. Let us assume that $Q(T=1|X=0) = q_0$ and $Q(T=1|X=1)=q_1$. Thus, with the IPTW estimator based on $Q$, we estimate $\tau' = \mathbb{E} \bigg(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\bigg) = \left(\frac{0.5 \cdot p_1}{q_1}, -\frac{0.5(1-p_0)}{1-q_0}\right).$ The treatment effect $\tau'=\tau$ only when $q_0=p_0$ and $q_1=p_1$, which is not true if $Q$ is not calibrated. Although this example allows multidimensional outcomes, this shows that we can pick an outcome function such that an uncalibrated model $Q$ produces inaccurate treatment effect estimates using the IPTW estimator.
\end{proof}





\begin{proof} 
Let $\mathcal{P}$ be a space of valid probability distributions on $\mathcal{Y}$. We would like to prove that
$ \exists P'(Y|X=x, T=t) \in \mathcal{P}$ such that $$\lim_{n \rightarrow \infty} \text{Probability}(\hat{\tau}_n=\tau)=0$$ where 
\begin{itemize}
  
    \item $\tau$ is the true ATE
    \item $\hat{\tau}_n$ is the ATE estimated using IPTW estimator such that we have $n$ individuals and propensity score model is $Q(T=1|X)$
    \item The probability is taken over all propensity models $Q(T=1|X)$ such that $\exists q \in [0, 1], P(T=1|Q(T=1|X)=q) \neq q$, and all data-generating distributions $P'(Y, T, X) = P'(Y|X, T).P(T, X)$. 
\end{itemize}

Let $S_Q = \{q | \exists X \in \mathcal{X},  Q(T=1|X) = q\}$.
We partition $\mathcal{X}$ into buckets $\{B_q\}_{q \in S_Q}$ such that $B_q = \{X | Q(T=1|X)=q\}$. 

Let $\hat{\tau}(Q) = \lim_{n \rightarrow \infty} \hat{\tau}_n$. 
Thus, for discrete $\mathcal{X},$ we could write 
\begin{align*}
    \hat{\tau}(Q) &= \mathbb{E}_{Y \sim P'(.|T, X); T, X \sim P}\left[\left(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\right)\right]\\
    &\text{Computing expectation over $X$}\\
    &=  \sum_{X \in \mathcal{X}}\mathbb{E}_{Y \sim P'(.|T, X); T \sim P(.|X)}\left[\left(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\right) P(X)\right]\\
    &\text{Computing expectation over $T$}\\
    &=\sum_{X \in \mathcal{X}}\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right) P(X)\right] + \sum_{X \in \mathcal{X}}\mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(- \frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right) P(X)\right]\\
    &=\sum_{X \in \mathcal{X}} \left(\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right)\right] - \mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(\frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right)\right] \right)P(X)\\
    &\text{Expressing the summation over $X$ differently}\\
    &=\sum_{q \in S_Q}\sum_{X \in {B_q}} \left(\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right)\right] - \mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(\frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right) \right]\right)P(X)\\
\end{align*}

Since $Q(T=1|X)$ is not calibrated, we know that $\exists q \in [0, 1], P(T=1|Q(T=1|X)=q) \neq q.$ Let us pick $q' \in S_Q$ such that $ P(T=1|Q(T=1|X)=q') \neq q'$.

We could design $P'(Y|X, T) = \mathbb{I}(Y=T.\mathbb{I}(X \in B_{q'}))/ P(X \in B_{q'})$. 

Now, we can write

\begin{align*}
    \hat{\tau}(Q)
    &=\sum_{q \in S_Q}\sum_{X \in {B_q}} \left(\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right)\right] - \mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(\frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right) \right]\right)P(X)\\
    & \text{ (Since $Y=0$ when $T=0$ or $X \notin B_{q'}$)} \\
    &= \sum_{X \in {B_{q'}}} \left(\left(\frac{P(T=1|X)P(X)}{Q(T=1|X) P(X \in B_{q'})}\right)\right) \\
    &= \sum_{X \in {B_{q'}}} \left(\left(\frac{P(T=1|X)P(X)}{q'P(X \in B_{q'})}\right)\right) \\
    &= \frac{P(T=1|X \in {B_{q'}})}{q'}
\end{align*}
Also, for the above data-generation process, 
\begin{align*}
    \tau = \hat{\tau}(P) &= \sum_{X\in \mathcal{X}} (\mathbb{E}_{Y \sim P'(Y|X, do(T=1))}[Y] - \mathbb{E}_{Y \sim P'(Y|X, do(T=0))}[Y]) .P(X)\\
    &=\sum_{q \in S_Q}\sum_{X \in {B_q}} (\mathbb{E}_{Y \sim P'(Y|X, do(T=1))}[Y] - \mathbb{E}_{Y \sim P'(Y|X, do(T=0))}[Y]).P(X) \\
    &= \sum_{X \in {B_{q'}}} P(X) / P(X \in B_{q'}) \\
    &=  1 \\
\end{align*} Thus, 
\begin{align*}
  \lim_{n \rightarrow \infty}\text{Probability}(\hat{\tau}_n = \tau) &= \text{Probability}(\hat{\tau}(Q) = \tau) \\
  &= \text{Probability}\left(\frac{P(T=1|X \in {B_{q'}})}{q'} = 1\right)\\  
  &= \text{Probability}\left(P(T=1|X \in {B_{q'}}) = q'\right)\\  
  &= \text{Probability}\left(P(T=1|Q(T=1|X) = q') = q'\right)\\  
  &=0,
\end{align*}
since we began with the assumption that $ P(T=1|Q(T=1|X)=q') \neq q'$. 

Please note that we could have defined a set of outcome functions that produce $Y=0$ for $X \in B_{q'}$, thus, potentially letting us compute unbiased treatment effects despite working with a miscalibrated model. However, we want our IPTW estimator to provide unbiased ATE estimates over all possible outcome functions. Here, we can see that IPTW estimator for ATE that uses a miscalibrated propensity score model cannot obtain unbiased treatment effect estimates on all possible outcome functions. 
% \sd{We could have easily defined a set of outcome functions that produce $Y=0$ for $X \in B_{q'}$, thus letting us compute accurate treatment effects despite miscalibrated model $Q(T=1|X)$, assuming $\forall q \neq q', P(T=|Q(T=1|X)=q)=q$.}

% For discrete X, assume $\mathcal{Y} = \{0, 1\}^{|\mathcal{X}|}$ such that $Y_i=X_i$. 
 
\end{proof}



% % \vk{todo: maybe complete this with some working numbers; if no space, put in appendix}


\subsection{Calibrated Uncertainties Improve Propensity Scoring Models}

\label{apdx:calibrated-uncertainties-improve-propensity}

% See Appendix~\ref{apdx:cal-improves-accuracy} for the proof. 
We define the true ATE as 
\begin{align*}
\tau &= \mathbb{E}_{y \sim P(Y=y | do(T=1))}[y] - \mathbb{E}_{y \sim P(Y=y | do(T=0))}[y]\\
&=\sum_y y(\sum_X P(Y=y | X, do(T=1)) P(X) - \sum_X P(Y=y | X, do(T=0)) P(X))\\
&=\sum_y y(\sum_X P(Y=y | X, T=1) P(X) - \sum_X P(Y=y | X, T=0) P(X))
\end{align*}


% We define the naive (biased) estimate as
% \begin{align*}
% \hat{\tau} &= \mathbb{E}_{y \sim P(Y=y |T=1)}[y] -\mathbb{E}_{y \sim P(Y=y |T=0)}[y]\\
% &= \sum_y y[\sum_X P(Y=y | X, T=1) P(X|T=1) - \sum_X P(Y=y | X, T=0) P(X|T=0)] \\
% & \approx \sum_y y[\sum_X P(Y=y | X, T=1) \frac{P(T=1|X) P(X)P(T=1)}{P(T=1)} - \sum_X P(Y=y | X, T=0) \frac{P(T=0|X) P(X)P(T=0)}{P(T=0)}] \\
% &= \sum_y y[\sum_X P(Y=y | X, T=1) {P(T=1|X) P(X)} - \sum_X P(Y=y | X, T=0) {P(T=0|X) P(X)}].
% \end{align*}
% % where $k_t$ is constant w.r.t variables X and Y. 
% % \sd{I get this additional constant in my derivation. The proof idea present here originally could get rid of the constant as we only looked at $\pi_{y, t}$ for a fixed T=t at a time and not the difference $\pi_{y, 1} - \pi_{y, 0}$. I might be missing something simple here. }
% Here, in line 3, we apply a multiplicative factor $P(T=t)$ to each term because we divide each of the two terms by the total number of samples n instead of $\sum_n \mathbb{I}(T=t)$ when computing a finite-sample formula. The `correct' Monte-Carlo estimate should divide each term by $\sum_n \mathbb{I}(T=t)$, but the one we use will divide by $n$. 

Next, recall that the finite-sample Inverse Propensity of Treatment Weight (IPTW) estimator with a model $Q(T=1|X)$ of $P(T=1|X)$ produces an estimate $\hat{\tau}_n(Q)$ of the ATE, which is computed as $$ \hat{\tau}_n(Q) = \frac{1}{n}\sum_{i=1}^n \bigg( \frac{t^{(i)}y^{(i)}}{Q(T=1|x^{(i)})} - \frac{(1-t^{(i)})y^{(i)}}{1-Q(T=1|x^{(i)})}\bigg).$$
We define $\hat{\tau}(Q)$ as the limit $\lim_{n \rightarrow \infty}\hat{\tau}_n(Q)$ when the amount of data goes to infinity. Notice that we can write
$$\lim_{n \rightarrow \infty}(\hat{\tau}_n(Q)) = \hat{\tau}(Q) = \sum_y y[\pi_{y, 1}(Q) - \pi_{y, 0}(Q)]$$ where
% \sd{The statement below was taken from previous writeup but maybe there was a typo?}
% $$
% \pi_{y, t}(Q) = \sum_X P(Y=y | X, T=t) \frac{P(X|T=t)}{Q(X|T=t)} = {k'}_t. \sum_X P(Y=y | X, T=t) \frac{P(T=t|X)}{Q(T=t|X)} \frac{P(X)}{Q(X)}
% $$
% \sd{Cannot get rid of Q(X) in the denominator above? hence, I am defining this as}
% \sd{I am defining this as}
$$
\pi_{y, t}(Q) = P(T=t) \sum_X  P(Y=y | X, T=t) \frac{P(X|T=t)}{Q(T=t|X)} = \sum_X P(Y=y | X, T=t) \frac{P(T=t|X)}{Q(T=t|X)} {P(X)}
$$
We have a multiplicative term $P(T=t)$ in the above expression since we are dividing by $n$ in the finite-sample formula as opposed to $n_t$ (the number of samples with treatment $t$). In other words, in order for the finite-sample formula to be a valid Monte Carlo estimator with samples coming from $P(X|T=t)$, there needs to be an ``effective adjustment factor'' of $n_t / n$ (such that $(n_t / n) \cdot (1 / n_t) = (1 / n)$), and this term is $P(T=t)$ in the limit of infinite data.

Clearly, if $Q=P$ we have $\hat{\tau}(Q) = \hat{\tau}(P) = \tau$. 
If not, we can consider the error
$$E = |(\hat\tau(P) - \hat\tau(Q))|. $$

\subsubsection{Bounding the Error of Causal Effect Estimation Using Proper Losses}
\label{apdx:error-bound}
We can form a bound on $E$ as
\begin{align*}
E 
& = |[\hat\tau(P) - \hat\tau(Q)]| & \\
& = \left| \sum_y y[(\pi_{y, 1}(P) - \pi_{y, 0}(P)) - (\pi_{y, 1}(Q) - \pi_{y, 0}(Q))] \right| & \\
% & \leq  \mathbb{E}_y[|(\pi_{y, 1}(P) - \pi_{y, 0}(P)) - (\pi_{y, 1}(Q) - \pi_{y, 0}(Q))|] \\
& \leq  \sum_t \left|\sum_y y[\pi_{y, t}(P) - \pi_{y, t}(Q)]\right| & \\
& \leq  \sum_t\sum_y[|y| |\pi_{y, t}(P) - \pi_{y, t}(Q)|] & \\
& = \sum_t  \sum_y |y| [\left|\sum_X  P(Y=y | X, T=t) {P(X)} \left(1- \frac{P(T=t|X)}{Q(T=t|X)}\right) \right|] & \\
& \leq \sum_t  \sum_y |y| [\sum_X  P(Y=y | X, T=t) {P(X)} \left| 1- \frac{P(T=t|X)}{Q(T=t|X)} \right|] & \\
& = \sum_t  \sum_y |y|.[\sum_X  P(Y=y | X, T=t) P(X) \ell_X(P_t,Q_t)^{1/2}] & \text{where } \ell_X(P_t,Q_t)=\left(1- \frac{P(T=t|X)}{Q(T=t|X)}\right)^2\\
% & = \sum_t {k'}\sum_y |y|. \mathbb{E}_{X \sim k. P(Y=y | X, T=t) P(X)} [\ell_X(P,Q)^{1/2}] & \text{Assuming ${k'} = 1/k$}\\
& = \sum_t \sum_y |y|.\mathbb{E}_{X \sim R_{y, t}} [\ell_X(P_t,Q_t)^{1/2}]& \\
\end{align*}
where $R_{y, t} \propto P(Y=y | X, T=t) P(X)$ (i.e., $R_{y, t} = k \cdot P(Y=y | X, T=t) P(X)$ for a normalizing constant $k$) and $\ell_X(P_t,Q_t)$ is a type of expected Chi-Squared divergence between $P, Q$. It is a type of proper score. Thus when $P = Q$, we get zero error, and otherwise we get a bound.


% \sd{Question: Our constant $k_t = \frac{1}{P(T=t)}$ stays and changes the theorem statement in main paper slightly. Is that okay?}.


In the above derivation, we see that the expected error $|\pi_{y,t}(P) - \pi_{y,t}(Q)|$ induced by an IPTW estimator with propensity score model $Q$ is bounded as
$$|\pi_{y,t}(P) - \pi_{y,t}(Q)| \leq \mathbb{E}_{X \sim R_{y,t}}[ \ell_\chi(P_t,Q_t)^\frac{1}{2}]. $$


\subsubsection{Calibration Reduces Variance of Inverse Probability Estimators}
\label{apdx:variance-reduction}

\begin{theorem}
    Let $P$ be the data distribution, and suppose that $1 - \delta > P(T|X) > \delta$ for all $T, X$ and let $Q$ be a calibrated model relative to $P$. Then $1 - \delta > Q(T|X) > \delta$ for all $T, X$ as well.
\end{theorem}
\begin{proof}[Proof]
Suppose $Q(T=1|x) = q$ for some $x$ and $q < \delta$. 
Since $Q$ is calibrated, we have 
$P(T=1|Q(T=1|X) = q) = q <\delta$. 

However $P(T=1|x) > \delta$ for every $x$. Hence, $P(T=1|X \in A) > \delta$, for all sets $A \subseteq \mathcal{X}$. This implies that $P(T=1|Q(T=1|X) = q) > \delta$ for all $q \in [0, 1].$ 

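Thus, we have a contradiction. The case $Q(T=1|x) = q > 1 - \delta$ follows from the same argument applied to $T=0$, since then $Q(T=0|x) = 1 - q < \delta$.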
\end{proof}
% \sd{Above proof from main paper is already good?}
\subsubsection{Calibration Improves Error Bounds on Causal Effect Estimate} 
\label{apdx:iptw-error-bound}
%with Accurate Propensity Models


We show that calibration strictly improves our $\ell_\chi$ bound on the IPTW error.
\begin{theorem}
\label{apdx:thrm:iptw-error-bound-lx}
    Let $\ell_1$ be the expected bound on the error of an uncalibrated IPTW estimator $Q_1$ in Corollary~\ref{corollary}, and let $\ell_2$ be the bound for $Q_2$, the recalibrated version of $Q_1$ with $\ell_\chi^{1/2}$ as the choice of loss $L$ to train the recalibrator. Then as the size of the calibration set $n \to \infty$ we have $\ell_2 \leq \ell_1$ with equality iff $Q_1 = Q_2$.
\end{theorem}
\vspace{-4mm}
% \sd{The proof refers to contents from subsequent section; can move it to a different location}
\begin{proof}[Proof]

Corollary~\ref{corollary} states that the error of an IPTW estimator with propensity score model $Q$ is bounded by $2|\mathcal{Y}| K \max_{y,t} \mathbb{E}_{R_{y,t}} \ell_\chi(P,Q)^\frac{1}{2},$ where $|y| \leq K$ for all $y\in\mathcal{Y}$, $R_{y,t} \propto P(Y=y | X, T=t) P(X)$ is a data distribution and $\ell_\chi(P,Q)= \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right)^2$ is the $\chi$-squared loss between the true propensity score and the model $Q$.

Thus, $\ell_1 = 2|\mathcal{Y}| K \max_{y,t} \mathbb{E}_{R_{y,t}} \ell_\chi(P,Q_1)^\frac{1}{2}$ and $\ell_2 = 2|\mathcal{Y}| K \max_{y,t} \mathbb{E}_{R_{y,t}} \ell_\chi(P,Q_2)^\frac{1}{2}$. Clearly, the upper bound $\ell_i$ depends on $\ell_\chi(P,Q_i)$ where $i \in \{1, 2\}$. 
%The part of $\ell_1, \ell_2$ that depends on $Q \in \{Q_1, Q_2\}$ is $L(Q, T) = \mathbb{E}_X \mathbb{E}_{T|X} \ell_\chi(Q(T=1|X), T)^{1/2}$. 

When we use Algorithm~\ref{alg:recalibrate} to perform recalibration, we obtain $Q_2 = R \circ Q_1.$ Here, we can choose the loss function  $L(Q, T) = \mathbb{E}_X \mathbb{E}_{T|X} \ell_\chi(Q(T=1|X), T)^{1/2}$. From Theorem~\ref{lem:loss}, it follows that $L(Q_2, T) = L(R \circ Q_1, T) \leq L(Q_1, T) + o(n)$ for a recalibrator $R$. 

As $n \to \infty$, $R \to B$ (Bayes optimal recalibrator; see Task~\ref{ass:density}). 

If $Q_1 \neq Q_2$, then $L(Q_2, T) \neq L(Q_1, T)$ because $L$ is strictly proper. Conversely, when $Q_1=Q_2$ clearly $\ell_1 = \ell_2$. Hence, the claim follows.
\end{proof}
Theorem~\ref{apdx:thrm:error-bound-lx} in Appendix~\ref{apdx:doubly-robust} proves a similar result for the AIPW estimator when the outcome model is inaccurate.
\subsubsection{Calibration Improves the Accuracy of Causal Effect Estimation}
\label{apdx:cal-improves-accuracy1}
Unfortunately, calibration by itself is not sufficient to correctly estimate treatment effects. For example, consider defining $Q(T|X)$ as the marginal $P(T)$: this $Q$ is calibrated, but cannot accurately estimate treatment effects.
However, if the model $Q$ is sufficiently accurate (as might be the case with a powerful neural network), calibration becomes the missing piece for an accurate IPTW estimator.

% Here and in the next section, we describe settings in which a calibrated $Q$ yields accurate IPTW estimates of causal effect. 
% Specifically, we identify two conditions that are weaker than full Bayes optimality: 
% First, we show that a separable and calibrated model yields accurate IPTW estimates; in the next section, we show that a post-hoc recalibrated model $Q'$ of a base model $Q$ can only improve the calibration bound, i.e., $\ell_\chi(P,Q') \leq \ell_\chi(P,Q) + o(n)$.

Specifically, we define
% Our first condition, 
separability, a condition which states that when $P(T|X_1) \neq P(T|X_2)$ for $X_1, X_2 \in \mathcal{X}$, then the model $Q$ satisfies $Q(T|X_1) \neq Q(T|X_2)$. Intuitively, the model $Q$ is able to discriminate between various $T$---something that might be achievable with an expressive neural $Q$ that has high classification accuracy. We show that a model that is separable and also calibrated achieves accurate causal effect estimation.
% \sd{Edit this section}

\begin{theorem}
% A model that is separable and also calibrated achieves accurate causal effect estimation
The error of an IPTW estimator with propensity model $Q$ tends to zero as $n \to \infty$ if:
% A model $Q$ achieves accurate causal effect estimation with the IPTW estimator if
\begin{enumerate}
    \item Separability holds, i.e., $\forall X_1, X_2 \in \mathcal{X}, P(T|X_1) \neq P(T|X_2) \implies Q(T|X_1) \neq Q(T|X_2)$
    \item The model $Q$ is calibrated, i.e., $\forall q \in (0, 1), P(T=1|Q(T=1|X)=q)=q$
\end{enumerate}
\end{theorem}
% Idea: use this bound on the calibration error.
% First, we define
% $$
% \tau = P(Y=1 | do(T=1)) = \sum_X P(Y=1 | X, do(T=1)) P(X) = \sum_X P(Y=1 | X, T=1) P(X)
% $$
% as the true treatment effect. We define
% $$
% \tau_0 = P(Y=1 |T=1) = \sum_X P(Y=1 | X, T=1) P(X|T=1) \propto \sum_X P(Y=1 | X, T=1) P(T=1|X) P(X)
% $$
% to be the naive (biased) estimate. We define
% $$
% \hat\tau(Q) = \sum_X P(Y=1 | X, T=1) \frac{P(X|T=1)}{Q(X|T=1)} \propto \sum_X P(Y=1 | X, T=1) \frac{P(T=1|X)}{Q(T=1|X)} P(X)
% $$
% as being the propensity score estimate with model $Q$. Clearly, if $Q=P$ we get the correct value. If not, we can consider the error
% $$E = (\hat\tau(P) - \hat\tau(Q))^2. $$
% We can form a bound on $E$ as
% \begin{align*}
% E 
% & = (\hat\tau(P) - \hat\tau(Q))^2 \\
% & \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \left( 1- \frac{P(T=1|X)}{Q(T=1|X)} \right)^2, \\
% & \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \ell(P,Q)
% \end{align*}
% which is a type of expected Chi-Squared divergence between $P, Q$, a type of proper score. Thus when $P = Q$, we get zero error, and otherwise we get a bound.



% We have treatment $T \in \{0, 1\}$, observed covariates $X \in \mathcal{X}$ and scalar outcome $Y \in \mathcal{R}$. $P(T=1|X)$ is the true treatment assignment mechanism and we train a propensity score model $Q(T=1|X)$. 

% For $X_1, X_2 \in \mathcal{X}$ such that $X_1 \neq X_2$, we assume that if $P(T=1|X_1) \neq P(T=1|X_2)$ then $Q(T=1|X_1) \neq Q(T=1|X_2)$. 
% \newpage
\begin{proof}
We prove this for discrete inputs at first and then prove it for continuous inputs.

{\textbf{Discrete Input Space.}}    

If our input space $\mathcal{X}$ is discrete, then the number of distinct values that $Q(T=1|X)$ can take is countable. Let us assume that $Q(T=1|X)$ takes values $\{q_i\}_{i=1}^M$. Thus, we can partition $\mathcal{X}$ into buckets $\{B_i\}_{i=1}^{M}$ such that $B_i = \{X| Q(T=1|X) = q_i\}$. Due to separability, we have $\forall X_1, X_2 \in \mathcal{X}, Q(T|X_1) = Q(T|X_2) \implies P(T|X_1) = P(T|X_2)$. Thus, we have $\forall i, \forall X_1, X_2 \in B_i, Q(T=1|X_1)=Q(T=1|X_2),$ and $  P(T=1|X_1)=P(T=1|X_2).$

Let us assume that for each bucket $B_i$, our true propensity $P(T=1|X)$ is $p_i$, i.e, if $X \in B_i$ then $Q(T=1|X)=q_i$ and $P(T=1|X)=p_i$.


% Thus, the model $Q(T=1|X)$ outputs values $\{q_i\}_{i=1}^{M}$ and true propensity $P(T=1|X)$ takes values $\{p_i\}_{i=1}^{M}.$ 

Assuming positivity, $0 < p_i < 1$.

Now, for all $i$, we can write 
\begin{align*}P(T=1|Q(T=1|X)=q_i) &= P(T=1| X \in B_i) \\ &= p_i.
\end{align*}

If $Q$ is calibrated, then by definition $p_i=q_i$. 

% \sd{Below is not necessary if we directly claim that P(T|X) and Q(T|X) are identical}

Now, we can write the expression for ATE $\tau$ as  
\begin{align*}
   {\tau} = \hat{\tau}(P) &= \mathbb{E}_{Y, T, X}[\frac{TY}{P(T=1|X)} - \frac{(1-T)Y}{(1-P(T=1|X))}] \\
   &=\sum_{i=1}^{M}
   P(X \in B_i) \mathbb{E}_{Y, T \mid X \in B_i} \left(\frac{TY}{p_i} - \frac{(1-T)Y}{(1-p_i)}\right) \\
   % &=\sum_{i=1}^{N}
   % P(X \in B_i) E_{T, Y | X \in B_i}\left(\frac{TY}{p_i} - \frac{(1-T)Y}{(1-p_i)}\right) \\
\end{align*}
Using our propensity score model $Q(T=1|X)$, we estimate $\hat{\tau}$ as  
\begin{align*}
   \hat{\tau}(Q) &= \mathbb{E}_{Y, T, X}[\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{(1-Q(T=1|X))}] \\
   &=\sum_{i=1}^{M}
   P(X \in B_i) \mathbb{E}_{Y, T \mid X \in B_i} \left(\frac{TY}{q_i} - \frac{(1-T)Y}{(1-q_i)}\right) \\
   % &=\sum_{i=1}^{N}
   % P(X \in B_i) E_{X \in B_i}(\frac{TY}{q_i} - \frac{(1-T)Y}{(1-q_i)}) \\
\end{align*}



If our model $Q$ is calibrated, then $p_i = q_i$. Hence, $0 < q_i < 1$ and $\hat{\tau}$ is well-defined. Also, $\tau = \hat{\tau}(P) = \hat{\tau}(Q)$. 

When our observational data contains $n$ units, the IPTW estimator based on model $Q(T=1|X)$ is $\hat{\tau}_n = \frac{1}{n} \sum_{i=1}^n \left(\frac{T^{(i)} Y^{(i)}}{Q(T=1|X^{(i)})} - \frac{(1-T^{(i)})Y^{(i)}}{1-Q(T=1|X^{(i)})}\right).$

Hence, $\lim_{n \rightarrow \infty} \hat{\tau}_n = \hat{\tau}(Q) = \hat{\tau}(P) = \tau. $ 


% Thus, for $\mathcal{X}$ is discrete, we have a 

%if $\forall X_1 \neq X_2, P(T=1|X_1) \neq P(T=1|X_2) \implies Q(T=1|X_1) \neq Q(T=1|X_2)$ and  our model $Q$ is perfectly calibrated, then we can obtain unbiased estimate of treatment effect using the IPTW estimator.
% \newline

% \newline
{\textbf{Continuous Input Space.}}    

% \sd{Can retain the proof just for continuous X to make it concise.}

When $X$ is continuous, the number of buckets can be uncountable. The buckets can now be formed as $B_q = \{X | Q(T=1|X)=q\}, \forall q \in [0, 1]$. It is easy to see that $\{B_q\}_{q \in [0, 1]}$ partitions $\mathcal{X}$. Note that $B_q$ can be empty if there exists no $X$ such that $Q(T=1|X)=q$.

Due to separability, $\forall X_1, X_2 \in \mathcal{X}, Q(T|X_1) = Q(T|X_2) \implies P(T|X_1) = P(T|X_2)$. 
Thus, for all $q$, $P(T=1|X)$ takes on a unique value for all $X \in B_q$, i.e., $\forall q \in [0, 1], P(T=1|X \in B_q) = f(q),$ where function $f: [0,1] \rightarrow [0,1]$.

Hence, we can write 
\begin{align*}
\forall q \in [0, 1], P(T=1| Q(T=1|X)=q) &= P(T=1| X \in B_q) \\
&=f(q).
\end{align*} 
When model $Q(T=1|X)$ is calibrated by our definition, then $\forall q \in [0, 1], q = f(q).$

Therefore, $\forall q \in [0,1], Q(T=1|X \in B_q) = q = f(q) = P(T=1| X \in B_q)$.

Since $\{B_q\}_{q \in [0, 1]}$ partitions $\mathcal{X}$, we have $\forall X \in \mathcal{X}, P(T=1|X) = Q(T=1|X)$. Thus, $\hat{\tau} (P) = \hat{\tau} (Q)$.

\end{proof}

 % \subsection{Weak Separability}
 
% $$\hat{\tau} = \frac{1}{M} \sum_{i=0}^M (\frac{T_i Y_i}{Q(T=1|X_i)} - \frac{(1-T_i)Y_i}{1-Q(T=1|X_i)})$$

% Other ideas
% \begin{enumerate}
%     \item Bounds on the ratio $\tau'/\tau$ when a subset of buckets results in the same outcome $Q(T=1|X)$ (calibrated but not sharp for a subset of buckets)
%     \item When X is continuous, we might still derive the above when the number of buckets is countable. When buckets are uncountable, can we redo this proof using integral over buckets? Might need to add some conditions
%     \item Bounding $\tau'/\tau$ using upper bound on calibration error $\sum_{i=1}^N w_i|p_i - q_i|^2 = \sum_{i=1}^N P(X_i \in B_i)|p_i - q_i|^2 $
%     \item Calibration error as function of dataset size and relating that with $\tau'/\tau$
% \end{enumerate}
\subsection{Doubly Robust Estimators and Error Bounds on Causal Effect Estimation}
\label{apdx:doubly-robust}

Given a dataset $\{x_i, t_i, y_i\}_{i=1}^n$, the doubly robust AIPW (Augmented Inverse Propensity Weight) estimator can be used to compute ATE estimate as
$$ \hat{\tau'}_n(Q, f) = \frac{1}{n}\sum_{i=1}^n \bigg( f(x_i, 1) - f(x_i, 0) + \frac{t^{(i)}(y^{(i)} - f(x_i, 1))}{Q(T=1|x^{(i)})}  - \frac{(1-t^{(i)})(y^{(i)} - f(x_i, 0))}{1-Q(T=1|x^{(i)})}\bigg).$$



The outcome model $f(X=x, T=t)$ can be learned from available data to predict potential outcome $Y[X=x, do(T=t)]$, where the input covariates are set to $x$ and the applied  intervention is $T=t$. Let us assume that $f(X=x, T=t)$ produces an error of $\epsilon(X=x, T=t)$, i.e. 
$f(X, T) = Y[X, do(T)] + \epsilon(X, T)$.

Thus, we can rewrite the causal effect estimate $\hat{\tau'}_n(Q, \epsilon)$ as

$$ \hat{\tau'}_n(Q, \epsilon) = \frac{1}{n}\sum_{i=1}^n \bigg( Y[x_i, do(t=1)] - Y[x_i, do(t=0)] + \epsilon(x_i, 1) - \epsilon(x_i, 0) - \frac{t^{(i)}(\epsilon(x_i, 1))}{Q(T=1|x^{(i)})}  + \frac{(1-t^{(i)})( \epsilon(x_i, 0))}{1-Q(T=1|x^{(i)})}\bigg).$$

When $n \rightarrow \infty$, we have 

$$ \lim_{n \rightarrow \infty}\hat{\tau'}_n(Q, \epsilon) = \hat{\tau'}(Q, \epsilon) = \hat{\tau'}(Q, 0) + \mathbb{E}_{X, T}\left[\epsilon(X, 1) \left(1 - \frac{T}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-T}{1-Q(T=1|X)}\right)\right],$$
%Similar to Corollary~\ref{corollary} for the IPTW estimator, 
where second equality is true due to doubly robust property. We state the following error bound for the AIPW estimator:
\begin{corollary}
\label{apdx:corollary}
Let $|\epsilon(X, T)| \leq \epsilon_{max}$ for all $X \in \mathcal{X}, T \in \{0, 1\}$.
The error of an AIPW estimator with propensity score model $Q$ and error in outcome model $\epsilon$ is bounded by $\epsilon_{max} \sum_{t} \mathbb{E}_{X}[l_{X} (P_t, Q_t)^{1/2}]$ where $P_t = P(T=t|X), Q_t = Q(T=t|X).$
\end{corollary}
Due to the doubly robust property, we know that the true ATE estimate $\tau = \hat{\tau'}(Q, 0) = \hat{\tau'}(P, \epsilon)$ for any propensity model $Q(T=1|X)$ and error function $\epsilon(X, T)$. 

The L1 error in our ATE estimate $\hat{\tau'}_n(Q, \epsilon)$ (after seeing infinite samples) can be expressed as 
$$E = |\hat{\tau'}(Q, \epsilon) - {\tau}| = | \hat{\tau'}(Q, \epsilon) - \hat{\tau'}(Q, 0)|$$
Thus, 
\begin{align*}
E 
& = 
\left|\mathbb{E}_{X, T}\left[\epsilon(X, 1) \left(1 - \frac{T}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-T}{1-Q(T=1|X)}\right)\right] \right| &\\
& = 
\left|\mathbb{E}_{X}\mathbb{E}_{T|X}\left[\epsilon(X, 1) \left(1 - \frac{T}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-T}{1-Q(T=1|X)}\right)\right] \right|&\\
& = 
\left|\mathbb{E}_{X}\left[\epsilon(X, 1) \left(1 - \frac{P(T=1|X)}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-P(T=1|X)}{1-Q(T=1|X)}\right)\right] \right|&\\
& \leq 
\mathbb{E}_{X}\left[\left|\epsilon(X, 1) \left(1 - \frac{P(T=1|X)}{Q(T=1|X)}\right)\right| + \left|\epsilon(X, 0) \left(1 - \frac{P(T=0|X)}{Q(T=0|X)}\right)\right|\right] &\\
& \leq 
\epsilon_{max} \mathbb{E}_{X}\left[\left|1 - \frac{P(T=1|X)}{Q(T=1|X)}\right| + \left| 1 - \frac{P(T=0|X)}{Q(T=0|X)}\right|\right] & \text{where } \epsilon_{max} = \max_{X, T} |\epsilon(X, T)|\\
& \leq 
\epsilon_{max} \sum_t \mathbb{E}_{X}[l_{X} (P_t, Q_t)^{1/2}] & \text{where } P_t = P(T=t|X), Q_t = Q(T=t|X)\\
\end{align*}
Thus, we have an error bound on the asymptotic ATE estimate that relates with the chi-squared divergence. Thus, given that the learned outcome model is inaccurate (due to possible mis-specification), training a recalibrator for the propensity score model with $l_X$ as loss function reduces the chi-squared divergence and improves the error bound. %(similar to Theorem~\ref{thrm:error-bound-lx} for IPTW estimator).

\begin{theorem}
\label{apdx:thrm:error-bound-lx}
    Let $\ell$ be the expected bound on the error of an uncalibrated AIPW estimator $Q$ in Corollary~\ref{apdx:corollary}, and let $\ell'$ be the bound for $Q'$, the recalibrated version of $Q$ obtained using Algorithm~\ref{alg:recalibrate} with $\ell_\chi^{1/2}$ as the choice of loss $L$. Then as the size of the calibration set $n \to \infty$ we have $\ell' \leq \ell$ with equality iff $Q = Q'$.
\end{theorem}
\vspace{-4mm}
% \sd{The proof refers to contents from subsequent section; can move it to a different location}
\begin{proof}[Proof]
    %Similar to the proof for Theorem~\ref{thrm:error-bound-lx}, 
    Corollary~\ref{apdx:corollary} states that the error of an AIPW estimator with propensity score model $Q$ and error in outcome model $\epsilon$ is bounded by $\epsilon_{max} \sum_{t} \mathbb{E}_{X}[l_{X} (P_t, Q_t)^{1/2}]$ where $|\epsilon(X, T)| \leq \epsilon_{max}$ for all $X \in \mathcal{X}, T \in \{0, 1\}$, $P_t = P(T=t|X), Q_t = Q(T=t|X)$ and $\ell_\chi(P_t,Q_t)= \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right)^2$ is the $\chi$-squared loss between the true propensity score and the model $Q$.

Thus, $\ell = \epsilon_{max} \sum_{t} \mathbb{E}_{X}[l_{X} (P_t, Q_t)^{1/2}]$ and $\ell' = \epsilon_{max} \sum_{t} \mathbb{E}_{X}[l_{X} (P_t, Q_t')^{1/2}]$. Clearly, the upper bound on $\ell$ and $\ell'$ depends on $\ell_\chi(P,Q)$ and $\ell_\chi(P,Q')$ respectively. 
%The part of $\ell_1, \ell_2$ that depends on $Q \in \{Q_1, Q_2\}$ is $L(Q, T) = \mathbb{E}_X \mathbb{E}_{T|X} \ell_\chi(Q(T=1|X), T)^{1/2}$. 

When we use Algorithm~\ref{alg:recalibrate} to perform recalibration, we obtain $Q' = R \circ Q.$ Here, we can choose the loss function  $L(Q, T) = \mathbb{E}_X \mathbb{E}_{T|X} \ell_\chi(Q(T=1|X), T)^{1/2}$. From Theorem~\ref{lem:loss}, it follows that $L(Q', T) = L(R \circ Q, T) \leq L(Q, T) + o(n)$ for a recalibrator $R$. 

As $n \to \infty$, $R \to B$ (Bayes optimal recalibrator; see Task~\ref{ass:density}). 

If $Q \neq Q'$, then $L(Q', T) \neq L(Q, T)$ because $L$ is strictly proper. Conversely, when $Q=Q'$ clearly $\ell = \ell'$. Hence, the claim follows.
\end{proof}



Now, we prove that calibration is a necessary condition for accurate causal effect estimation when the outcome model in AIPW estimator is inaccurate. 

\begin{theorem}
\label{apdx:thrm-calibration-necessary}
When propensity model $Q(T|X)$ is not calibrated and the outcome model $f(X, T)$ is inaccurate for $X \in \{X: Q(T=1|X)=q\} \subseteq \mathcal{X}$ such that $q \in (0, 1), P(T=1| Q(T=1|X)=q) \neq q$, then there exists a true outcome function such that the doubly robust AIPW estimator based on $Q$ and $f$ yields an incorrect estimate of true causal effects almost surely.
\end{theorem}
\begin{proof}
    

Following the setup in \ref{apdx:calibration-necessary}, we let $S_Q = \{q | \exists X \in \mathcal{X},  Q(T=1|X) = q\}$.
We partition $\mathcal{X}$ into buckets $\{B_q\}_{q \in S_Q}$ such that $B_q = \{X | Q(T=1|X)=q\}$. When $Q(T=1|X)$ is not calibrated, we know that $\exists q \in [0, 1], P(T=1|Q(T=1|X)=q) \neq q.$ 

We design the true outcome function $Y[X, do(T=t)]$ such that $Y[X, do(T=0)] = 0$. Since the learned outcome model $f(X, T) = Y[X, do(T=t)] + \epsilon(X, T)$ is inaccurate (possibly from learning a mis-specified model), let us define $\mathcal{X}_\epsilon \subseteq \mathcal{X}$ such that $\forall X \in \mathcal{X}_\epsilon,  \epsilon(X, T) \neq 0$ and  $\forall X \in \mathcal{X} \setminus \mathcal{X}_\epsilon,  \epsilon(X, T) = 0$. For the sake of simplicity, we assume that the outcome model $f(x, t)$ can learn the true outcome function whenever $T=0$, since the true outcome is $0$ whenever $T=0$. Thus, $\forall X \in \mathcal{X}, \epsilon(X, T=0) = 0$. 

Now, let us pick $q' \in S_Q$ such that $ P(T=1|Q(T=1|X)=q') \neq q'$ and $B_{q'} \cap \mathcal{X}_\epsilon \neq \emptyset$. We can always pick such a $q'$ as long as $Q$ is uncalibrated and $ \exists X \in B_{q'}$ such that $\epsilon (X, T=1) \neq 0$ (i.e., the learned outcome model $f(X, T)$ is inaccurate where the learned propensity model produces inaccurate uncertainties). 

With this, we can write the expression for PEHE (Precision in Estimation of Heterogeneous Treatment Effect) estimate with $n$ samples $F_n (Q, \epsilon)$ as

$ F_n (Q, \epsilon) = \frac{1}{n}\sum_{i=1}^n \bigg( Y[x_i, do(t=1)] - Y[x_i, do(t=0)] + \epsilon(x_i, 1) - \epsilon(x_i, 0) - \frac{t^{(i)}(\epsilon(x_i, 1))}{Q(T=1|x^{(i)})}  + \frac{(1-t^{(i)})( \epsilon(x_i, 0))}{1-Q(T=1|x^{(i)}) } - (Y[x_i, do(t=1)] - Y[x_i, do(t=0)])\bigg)^2 = \frac{1}{n}\sum_{i=1}^n \bigg(\epsilon(x_i, 1) - \epsilon(x_i, 0) - \frac{t^{(i)}(\epsilon(x_i, 1))}{Q(T=1|x^{(i)})}  + \frac{(1-t^{(i)})( \epsilon(x_i, 0))}{1-Q(T=1|x^{(i)})}\bigg)^2.$

Now, we will try to establish a lower bound on the error $F_n (Q, \epsilon)$ when $n \rightarrow \infty$. 
\begin{align*}
    F
    &= \lim_{n \rightarrow \infty} F_n (Q, \epsilon) & \\
    &= \mathbb{E}_{X, T}\left[\left(\epsilon(X, 1) \left(1 - \frac{T}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-T}{1-Q(T=1|X)}\right)\right)^2\right] \\
    &= \mathbb{E}_{X}\mathbb{E}_{T|X}\left[\left(\epsilon(X, 1) \left(1 - \frac{T}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-T}{1-Q(T=1|X)}\right)\right)^2\right] \\
    & \text{Following the setup in Section~\ref{apdx:calibration-necessary}, we expand the expectation over $X$} & \\
    & \text{(similar expression can be written with } \int_X \text{ if $X$ is continuous)} & \\
    & \text{and apply Jensen's inequality to the inner expectation over $T \mid X$,} & \\
    &\geq   \sum_{q \in S_Q}\sum_{X \in {B_q}} \left(\epsilon(X, 1) \left(1 - \frac{P(T=1|X)}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-P(T=1|X)}{1-Q(T=1|X)}\right)\right)^2 P(X)  & \\
    &=   \sum_{q \in S_Q}\sum_{X \in {B_q \cap \mathcal{X}_\epsilon}} \left(\epsilon(X, 1) \left(1 - \frac{P(T=1|X)}{Q(T=1|X)}\right) -\epsilon(X, 0) \left(1 - \frac{1-P(T=1|X)}{1-Q(T=1|X)}\right)\right)^2 P(X)  & \forall X \in \mathcal{X} \setminus \mathcal{X}_{\epsilon}, \epsilon(X, T) = 0\\
    &\text{Since we assume that }\forall x \in \mathcal{X}, \epsilon(x, 0) = 0, & \\
    &=   \sum_{q \in S_Q}\sum_{X \in {B_q \cap \mathcal{X}_\epsilon}} \left(\epsilon(X, 1) \left(1 - \frac{P(T=1|X)}{q}\right)\right)^2 P(X)  & \\
    & \geq  \sum_{X \in {B_{q'} \cap \mathcal{X}_\epsilon}} \left(\epsilon(X, 1) \left(1 - \frac{P(T=1|X)}{q'}\right)\right)^2 P(X)  & P(T=1|Q(T=1|X)=q') \neq q' \\
    & \geq  \epsilon_{min} \sum_{X \in {B_{q'} \cap \mathcal{X}_\epsilon}} \left(1 - \frac{P(T=1|X)}{q'}\right)^2 P(X)  & \epsilon_{min} = \min_{X \in {B_{q'} \cap \mathcal{X}_\epsilon}} \epsilon(X, 1)^2
\end{align*}
    
The above expression is non-zero since $P(T=1|Q(T=1|X)=q') \neq q'$ and $\epsilon_{min} \neq 0$ by design. Thus, when $Q(T|X)$ is not calibrated and the learned outcome model $f(X, T)$ is inaccurate over the regions where $P(T=1|Q(T=1|X)=q) \neq q$, there exists a true outcome function such that the AIPW estimator based on $Q$ and $f$ yields an incorrect estimate of the true causal effects almost surely.
\end{proof}
% where $R_{y,t} \propto P(Y=y | X, T=t) P(X)$ is a data distribution and $\ell_\chi(Q,P)= \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right)^2$ is the $chi$-squared loss between the true propensity score and the model $Q$. 

% \begin{proof}[Proof]

% \begin{align*}
%  R_{y,t} \sim k. P(Y=y | X, T=t) P(X), \\
% \end{align*}
% where k is constant w.r.t. variable X. 
% \begin{align*}
% |\pi_{y,t}(P) - \pi_{y,t}(Q)| &= \left| \sum_x P(y|x,t)\frac{P(t|x)}{P(t|x)}P(x) - \sum_x P(y|x,t)\frac{P(t|x)}{Q(t|x)}P(x) \right| \\
% &= \left|\sum_x (1-\frac{P(t|x)}{Q(t|x)})P(y|x,t) P(x)\right| \\
% & \leq \sum_x \left|(1-\frac{P(t|x)}{Q(t|x)})\right| P(y|x,t) P(x) \\
% & = \frac{1}{k}\sum_x [ \left| \left( 1- \frac{P(t|x)}{Q(t|x)} \right) \right|]{k. P(y | x, t) P(x)} \\
% & = \frac{1}{k} \mathbb{E}_{X \sim {k. P(y | x, t) P(x)}}[ \left| \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right) \right|] \\
% &=\frac{1}{k} \mathbb{E}_{X \sim R_{y,t}}[ \ell_\chi(P,Q)^\frac{1}{2}]
% \end{align*}



% \end{proof}



\subsection{Algorithms for Calibrated Propensity Scoring}
% \sd{I am taking the following from ICML2022 paper and have also cited it.}
\label{apdx:algorithms-calibrated}
\subsubsection{Asymptotic Calibration Guarantee}
\label{apdx:asymptotic-calibration}

\begin{theorem}%[Calibration]
The model $R \circ Q$ is asymptotically calibrated and
the calibration error $\mathbb{E}[L_c(R \circ Q,S)] < \delta(m)$ for $\delta(m) = o(m^{-k}), k>0$ w.h.p.
% $\mathcal{L}_C \leq \epsilon$ when $T \to \infty$.
%\vk{todo: copy statement and proof to appendix}
\end{theorem}
\begin{proof}
    Any proper loss can be decomposed as
    $\text{proper loss} = \text{calibration} - \text{sharpness} + \text{irreducible term}$~\citep{guo2017calibration}. The calibration term consists of the error $\mathbb{E}[L_c(R \circ Q,S)]$. The sharpness and irreducible terms can be represented together as the refinement term $\mathbb{E}(L_r(S))$. Table~\ref{tbl:properlosses} provides examples of some proper loss functions and the respective decompositions. The rest of our proof uses the techniques of \citet{pmlr-v162-kuleshov22a} in the context of propensity scores.
    
     \citet{kullflach2015novel} show that the refinement term can be further divided as $\mathbb{E}(L_r(S)) = \mathbb{E} (L_g(S, B \circ Q)) + \mathbb{E}(L(B \circ Q, T)).$ Here, $B$ is the Bayes optimal recalibrator $P(T=1|Q(T=1|X))$ and $S$ is $P(T=1|R \circ Q).$ 

    Recall that if we solve Task~\ref{ass:density}, we have, for $\delta(m)=o(1)$,
\begin{flalign*}
\mathbb{E}(L(B \circ Q, T)) & \leq \mathbb{E}(L(R \circ Q, T)) \leq \mathbb{E}(L(B \circ Q, T)) + \delta(m)&&\\
\text{Using \citet{gneiting2007probabilistic}}, & \text{~\citet{kullflach2015novel} we decompose $\mathbb{E}(L(R \circ Q, T))$} &&\\
\implies \mathbb{E}(L(B \circ Q, T)) &\leq \mathbb{E}(L_c(R \circ Q, S)) + \mathbb{E}(L_g(S, B \circ Q)) + \mathbb{E}(L(B \circ Q, T)) \leq \mathbb{E}(L(B \circ Q, T)) + \delta(m)&&\\
\implies \mathbb{E}(L_c(R \circ Q, S)) & + \mathbb{E}(L_g(S, B \circ Q)) \leq  \delta(m)&&\\
\implies \mathbb{E}(L_c(R \circ Q, S)) & \leq  \delta(m)&&\\
\end{flalign*}
\begin{table}
\begin{center}
\begin{tabular}{l|c|c|c}
\toprule
{\bf Proper Score} & {\bf Loss} & {\bf Calibration} & {\bf Refinement} \\
& $L(F,G)$ & $L_c(F,S)$ & $L_r(S)$ \\
\midrule
Logarithmic & $\mathbb{E}_{y\sim G}$ $\log f(y)$ & $\text{KL}(s||f)$ & $H(s)$ \\
CRPS & {\small $\mathbb{E}_{y\sim G}$ $(F(y) - G(y))^2$} & {\small $\int^{\infty}_{-\infty}(F(y) - S(y))^2\,dy$} & {\small $\int^{\infty}_{-\infty} S(y) (1 - S(y))\,dy$} \\
Quantile & {\small $\mathbb{E}^{\tau\in U[0,1]}_{y\sim G} \rho_\tau(y-F^{-1}(\tau))$} & {\small $\int_0^1 \int^{F^{-1}(\tau)}_{S^{-1}(\tau)}(S(y) - \tau)dyd\tau$} & {\small $\mathbb{E}^{\tau\in U[0,1]}_{y\sim S} \rho_\tau(y-S^{-1}(\tau))$} \\
\bottomrule
\end{tabular}
\end{center}
\caption{Proper loss functions. A proper loss is a function $L(F,G)$ over a forecast $F$ targeting a variable $y \in \mathcal{Y}$ whose true distribution is $G$ and for which $L(F,G) \geq L(G,G)$ for all $F$. Each $L(F,G)$ decomposes into the sum of a calibration loss term $L_c(F,S)$ (also known as reliability) and a refinement loss term $L_r(S)$ (which itself decomposes into sharpness and an uncertainty term). Here, $S(y)$ denotes the cumulative distribution function of the conditional distribution $\mathbb{P}(Y=y \mid F_X = F)$ of $Y$ given a forecast $F$, and $s(y), f(y)$ are the probability density functions of $S$ and $F$, respectively. We give three examples of proper losses: the log-loss, the continuous ranked probability score (CRPS), and the quantile loss.}\label{tbl:properlosses}
\end{table}
    Thus, solving Task~\ref{ass:density} allows us to obtain an asymptotically calibrated $R\circ Q$ such that the calibration error is bounded as $\mathbb{E}[L_c(R \circ Q,S)] < \delta(m)$.

\end{proof}
\subsubsection{No-Regret Calibration}
\label{apdx:no-regret}
\begin{theorem}
\label{lem:loss2}
The recalibrated model has asymptotically vanishing regret relative to the base model: $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(Q,T)] + \delta,$ where $\delta >0, \delta=o(m^{-k}), k>0$. % is a bound that decreases with $m$. 
% \vk{todo: proof to appendix}
\end{theorem}

\begin{proof}[Proof]
%The claim holds by empirical risk minimization. Since $R \circ H$ minimizes $L$, but is more expressive than $H$ and $R$ can represent the identity map (by Assumption \ref{ass:density}).
Solving Task~\ref{ass:density} implies $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(B \circ Q,T)] + \delta \leq \mathbb{E}[L(Q,T)] + \delta$. The first inequality comes from the definition of Task~\ref{ass:density}, and the second inequality holds because a Bayes-optimal $B$ has a loss no higher than that of an identity mapping~\citep{pmlr-v162-kuleshov22a}.

\end{proof}
% \sd{Above proof from main paper is already good?}


