\section{Theoretical Analysis}
\sd{REMOVE THIS SECTION AND RETAIN THE FOLLOWING ONE}
\label{apdx:theoretical}
\subsection{Notation}
As described in Section~\ref{sec:background}, we are given an observational dataset $\mathcal{D}=\{(x^{(i)},y^{(i)},t^{(i)})\}_{i=1}^n$ consisting of $n$ units, each characterized by features $x^{(i)} \in \mathcal{X} \subseteq \mathbb{R}^d$, a binary treatment $t^{(i)} \in \{0,1\}$, and a scalar outcome $y^{(i)} \in \mathcal{Y} \subseteq \mathbb{R}$. 
We assume $\mathcal{D}$ consists of i.i.d.~realizations of random variables $X, Y, T \sim P$ from a data distribution $P$.
Although we assume binary treatments and scalar outcomes, our approach naturally extends beyond this setting.
The feature space $\mathcal{X}$ can be any continuous or discrete set.

\subsection{Calibration: a Necessary Condition for Propensity Scoring Models}
\label{apdx:calibration-necessary}
\begin{theorem}
When $Q(T|X)$ is not calibrated, an IPTW estimator based on $Q$ yields an incorrect estimate of the true causal effect almost surely.
% For each uncalibrated model $Q(T|X)$, the set of data distributions $P$ for which an IPTW estimator yields incorrect  probabilities has measure one.
\end{theorem}
% \sd{We don't want to show that when $Q(T|X)$ is calibrated then IPTW based on $Q$ yields correct estimates..}
\begin{proof}[Example]
Consider a toy binary setting where $\mathcal{X} = \mathcal{T} = \{0,1\}, \mathcal{Y} = \{0,1\}^2$.

We set $Y = (X \wedge T, \bar{X} \wedge \bar{T})$, $P(T=1|X=0)=p_0$, $P(T=1|X=1)=p_1$, and $P(X=1)=0.5$, where $\wedge$ denotes logical `AND' and $\bar{V}$ denotes the logical negation of a binary variable $V$. We see that the true ATE is $\tau=(0.5, -0.5)$. Let us assume that $Q(T=1|X=0) = q_0$ and $Q(T=1|X=1)=q_1$. Thus, with the IPTW estimator based on $Q$, we estimate $\tau' = \mathbb{E} \bigg(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\bigg) = \bigg(\frac{0.5\,p_1}{q_1}, -\frac{0.5\,(1-p_0)}{1-q_0}\bigg).$ The estimate satisfies $\tau'=\tau$ only when $q_0=p_0$ and $q_1=p_1$, which is not true if $Q$ is not calibrated.
\end{proof}


\sd{We can always come up with a set of data-generating distributions P and uncalibrated Q(T|X) such that the estimated treatment effect is correct (The toy example for my other proof shows that if Q(t|x) is calibrated and separable, then true ATE is accurate). Hence, I wrote a slightly different proof. }
\begin{theorem}
 When $Q(T|X)$ is not calibrated, there exists an outcome function such that an IPTW estimator based on $Q$ yields an incorrect estimate of the true causal effect almost surely.
% For each uncalibrated model $Q(T|X)$, the set of data distributions $P$ for which an IPTW estimator yields incorrect  probabilities has measure one.
\end{theorem}



\begin{proof} 
Let $\mathcal{P}$ be a space of valid probability distributions on $\mathcal{Y}$. We would like to prove that
$ \exists P'(Y|X=x, T=t) \in \mathcal{P}$ such that $$\lim_{n \rightarrow \infty} \text{Probability}(\hat{\tau}_n=\tau)=0$$ where 
\begin{itemize}
  
    \item $\tau$ is true ATE
    \item $\hat{\tau}_n$ is the ATE estimated using IPTW estimator with $n$ individuals and propensity score model $Q(T=1|X)$
    \item The randomization is over all data-generating distributions $P'(Y|X, T).P(T, X)$ and all propensity models $Q(T=1|X)$ such that $\exists q \in [0, 1], P(T=1|Q(T=1|X)=q) \neq q.$
\end{itemize}

Let $S_Q = \{q | \exists X \in \mathcal{X},  Q(T=1|X) = q\}$.
We partition $\mathcal{X}$ into buckets $\{B_q\}_{q \in S_Q}$, where $B_q = \{X \in \mathcal{X} \mid Q(T=1|X) = q\}$.

Let $\hat{\tau}(Q)$ be the ATE estimated using propensity score model $Q(T=1|X)$. 
Thus, for discrete $\mathcal{X},$ we could write 
\begin{align*}
    \hat{\tau}(Q) &= \mathbb{E}_{Y \sim P'(.|T, X); T, X \sim P}\left[\left(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\right)\right]\\
    &\text{Computing expectation over $X$}\\
    &=  \sum_{X \in \mathcal{X}}\mathbb{E}_{Y \sim P'(.|T, X); T \sim P(.|X)}\left[\left(\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{1-Q(T=1|X)}\right) P(X)\right]\\
    &\text{Computing expectation over $T$}\\
    &=\sum_{X \in \mathcal{X}}\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right) P(X)\right] + \sum_{X \in \mathcal{X}}\mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(- \frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right) P(X)\right]\\
    &=\sum_{X \in \mathcal{X}} \left(\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right)\right] - \mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(\frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right)\right] \right)P(X)\\
    &\text{Expressing the summation over $X$ differently}\\
    &=\sum_{q \in S_Q}\sum_{X \in {B_q}} \left(\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right)\right] - \mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(\frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right) \right]\right)P(X)\\
\end{align*}

Since $Q(T=1|X)$ is not calibrated, we know that $\exists q \in [0, 1], P(T=1|Q(T=1|X)=q) \neq q.$ Let us pick $q' \in S_Q$ such that $ P(T=1|Q(T=1|X)=q') \neq q'$.

We could design $P'(Y|X, T) = \mathbb{I}(Y=T \cdot \mathbb{I}(X \in B_{q'}))$, i.e., the outcome is deterministically $Y = T \cdot \mathbb{I}(X \in B_{q'})$.

Now, we can write

\begin{align*}
    \hat{\tau}(Q)
    &=\sum_{q \in S_Q}\sum_{X \in {B_q}} \left(\mathbb{E}_{Y \sim P'(.|X, T=1)} \left[\left(\frac{P(T=1|X)Y}{Q(T=1|X)}\right)\right] - \mathbb{E}_{Y \sim P'(.|X, T=0)} \left[\left(\frac{(1-P(T=1|X))Y}{1-Q(T=1|X)}\right) \right]\right)P(X)\\
    &= \sum_{X \in {B_{q'}}} \left(\left(\frac{P(T=1|X)P(X)}{Q(T=1|X)}\right)\right) \\
    &= \sum_{X \in {B_{q'}}} \left(\left(\frac{P(T=1|X)P(X)}{q'}\right)\right) \\
    &= \frac{P(T=1|X \in {B_{q'}})P(X \in {B_{q'}})}{q'}
\end{align*}
Also, for the above data-generation process, 
\begin{align*}
    \tau = \hat{\tau}(P) &= \sum_{X\in \mathcal{X}} (\mathbb{E}_{Y \sim P'(Y|X, do(T=1))}[Y] - \mathbb{E}_{Y \sim P'(Y|X, do(T=0))}[Y]) .P(X)\\
    &=\sum_{q \in S_Q}\sum_{X \in {B_q}} (\mathbb{E}_{Y \sim P'(Y|X, do(T=1))}[Y] - \mathbb{E}_{Y \sim P'(Y|X, do(T=0))}[Y]).P(X) \\
    &= \sum_{X \in {B_{q'}}} P(X) \\
    &=  P(X \in {B_{q'}}) \\
\end{align*} Thus, 
\begin{align*}
  \lim_{n \rightarrow \infty}\text{Probability}(\hat{\tau}_n = \tau) &= \text{Probability}(\hat{\tau}(Q) = \tau) \\
  &= \text{Probability}\left(\frac{P(T=1|X \in {B_{q'}})P(X \in {B_{q'}})}{q'} = P(X \in {B_{q'}})\right)\\  
  &= \text{Probability}\left(P(T=1|X \in {B_{q'}}) = q'\right)\\  
  &= \text{Probability}\left(P(T=1|Q(T=1|X) = q') = q'\right)\\  
  &=0,
\end{align*}
since we began with the assumption that $ P(T=1|Q(T=1|X)=q') \neq q'$. 

\sd{We could have easily defined a set of outcome functions that produce $Y=0$ for $X \in B_{q'}$, thus letting us compute accurate treatment effects despite a miscalibrated model $Q(T=1|X)$, assuming $\forall q \neq q', P(T=1|Q(T=1|X)=q)=q$.}

% For discrete X, assume $\mathcal{Y} = \{0, 1\}^{|\mathcal{X}|}$ such that $Y_i=X_i$. 
 
\end{proof}



% % \vk{todo: maybe complete this with some working numbers; if no space, put in appendix}


\subsection{Calibrated Uncertainties Improve Propensity Scoring Models}

\label{apdx:calibrated-uncertainties-improve-propensity}

We define the true ATE as 
\begin{align*}
\tau &= \mathbb{E}_{y \sim P(Y=y | do(T=1))}[y] - \mathbb{E}_{y \sim P(Y=y | do(T=0))}[y]\\
&=\sum_y y(\sum_X P(Y=y | X, do(T=1)) P(X) - \sum_X P(Y=y | X, do(T=0)) P(X))\\
&=\sum_y y(\sum_X P(Y=y | X, T=1) P(X) - \sum_X P(Y=y | X, T=0) P(X))
\end{align*}


We define the naive (biased) estimate as
\begin{align*}
\hat{\tau} &= \mathbb{E}_{y \sim P(Y=y |T=1)}[y] -\mathbb{E}_{y \sim P(Y=y |T=0)}[y]\\
&= \sum_y y[\sum_X P(Y=y | X, T=1) P(X|T=1) - \sum_X P(Y=y | X, T=0) P(X|T=0)] \\
&= \sum_y y[\sum_X P(Y=y | X, T=1) \frac{P(T=1|X) P(X)}{P(T=1)} - \sum_X P(Y=y | X, T=0) \frac{P(T=0|X) P(X)}{P(T=0)}] \\
&= \sum_y y[k_1. \sum_X P(Y=y | X, T=1) {P(T=1|X) P(X)} - k_0\sum_X P(Y=y | X, T=0) {P(T=0|X) P(X)}],
\end{align*}
where $k_t = 1/P(T=t)$ is constant w.r.t.\ the variables $X$ and $Y$. \sd{I get this additional constant in my derivation. The proof idea present here originally could get rid of the constant as we only looked at $\pi_{y, t}$ for a fixed T=t at a time and not the difference $\pi_{y, 1} - \pi_{y, 0}$. I might be missing something simple here. }

We define the IPTW estimate based on model Q as $\hat{\tau}(Q) = \sum_y y[\pi_{y, 1}(Q) - \pi_{y, 0}(Q)]$ where
% \sd{The statement below was taken from previous writeup but maybe there was a typo?}
% $$
% \pi_{y, t}(Q) = \sum_X P(Y=y | X, T=t) \frac{P(X|T=t)}{Q(X|T=t)} = {k'}_t. \sum_X P(Y=y | X, T=t) \frac{P(T=t|X)}{Q(T=t|X)} \frac{P(X)}{Q(X)}
% $$
% \sd{Cannot get rid of Q(X) in the denominator above? hence, I am defining this as}
\sd{I am defining this as}
$$
\pi_{y, t}(Q) = \sum_X P(Y=y | X, T=t) \frac{P(X|T=t)}{Q(T=t|X)} = {k}_t. \sum_X P(Y=y | X, T=t) \frac{P(T=t|X)}{Q(T=t|X)} {P(X)}
$$

Clearly, if $Q=P$ we have $\hat{\tau}(Q) = \hat{\tau}(P) = \tau$. 

If not, we can consider the error
$$E = |(\hat\tau(P) - \hat\tau(Q))|. $$

\subsubsection{Bounding the Error of Causal Effect Estimation Using Proper Losses}
\label{apdx:error-bound}
We can form a bound on $E$ as
\begin{align*}
E 
& = |[\hat\tau(P) - \hat\tau(Q)]| & \\
& = \left| \sum_y y[(\pi_{y, 1}(P) - \pi_{y, 0}(P)) - (\pi_{y, 1}(Q) - \pi_{y, 0}(Q))] \right| & \\
% & \leq  \mathbb{E}_y[|(\pi_{y, 1}(P) - \pi_{y, 0}(P)) - (\pi_{y, 1}(Q) - \pi_{y, 0}(Q))|] \\
& \leq  \sum_t \left|\sum_y y[\pi_{y, t}(P) - \pi_{y, t}(Q)]\right| & \\
& \leq  \sum_t\sum_y[|y| |\pi_{y, t}(P) - \pi_{y, t}(Q)|] & \\
& = \sum_t  \sum_y |y| [\left|\sum_X {k}_t P(Y=y | X, T=t) {P(X)} \left(1- \frac{P(T=t|X)}{Q(T=t|X)}\right) \right|] & \\
& \leq \sum_t  \sum_y |y| [\sum_X {k}_t P(Y=y | X, T=t) {P(X)} \left| 1- \frac{P(T=t|X)}{Q(T=t|X)} \right|] & \\
& = \sum_t  \sum_y |y|.[\sum_X {k}_t. P(Y=y | X, T=t) P(X) \ell_X(P,Q)^{1/2}] & \text{where } \ell_X(P,Q)=\left(1- \frac{P(T=t|X)}{Q(T=t|X)}\right)^2\\
& = \sum_t {k'}_{t}\sum_y |y|. \mathbb{E}_{X \sim k. P(Y=y | X, T=t) P(X)} [\ell_X(P,Q)^{1/2}] & \text{Assuming ${k'}_t = k_t/k$}\\
& = \sum_t {k'}_{t}. \sum_y |y|.\mathbb{E}_{X \sim R_{y, t}} [\ell_X(P,Q)^{1/2}]& \\
\end{align*}
where $R_{y, t} \propto P(Y=y | X, T=t) P(X)$ (i.e., $R_{y, t}(X) = k \cdot P(Y=y | X, T=t) P(X)$, where $k$ is constant w.r.t.\ the variable $X$) and $\ell_X(P,Q)$ is a type of expected Chi-Squared divergence between $P, Q$, a type of proper score. Thus when $P = Q$, we get zero error, and otherwise we get a bound.


\sd{Question: Our constant $k_t = \frac{1}{P(T=t)}$ stays and changes the theorem statement in main paper slightly. Is that okay?}.


In the above derivation, we see that the expected error $|\pi_{y,t}(P) - \pi_{y,t}(Q)|$ induced by an IPTW estimator with propensity score model $Q$ is bounded as
$$|\pi_{y,t}(P) - \pi_{y,t}(Q)| \leq {k'}_t.\mathbb{E}_{X \sim R_{y,t}}[ \ell_X(P,Q)^\frac{1}{2}]. $$

% where $R_{y,t} \propto P(Y=y | X, T=t) P(X)$ is a data distribution and $\ell_\chi(Q,P)= \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right)^2$ is the $chi$-squared loss between the true propensity score and the model $Q$. 

% \begin{proof}[Proof]

% \begin{align*}
%  R_{y,t} \sim k. P(Y=y | X, T=t) P(X), \\
% \end{align*}
% where k is constant w.r.t. variable X. 
% \begin{align*}
% |\pi_{y,t}(P) - \pi_{y,t}(Q)| &= \left| \sum_x P(y|x,t)\frac{P(t|x)}{P(t|x)}P(x) - \sum_x P(y|x,t)\frac{P(t|x)}{Q(t|x)}P(x) \right| \\
% &= \left|\sum_x (1-\frac{P(t|x)}{Q(t|x)})P(y|x,t) P(x)\right| \\
% & \leq \sum_x \left|(1-\frac{P(t|x)}{Q(t|x)})\right| P(y|x,t) P(x) \\
% & = \frac{1}{k}\sum_x [ \left| \left( 1- \frac{P(t|x)}{Q(t|x)} \right) \right|]{k. P(y | x, t) P(x)} \\
% & = \frac{1}{k} \mathbb{E}_{X \sim {k. P(y | x, t) P(x)}}[ \left| \left( 1- \frac{P(T=t|X)}{Q(T=t|X)} \right) \right|] \\
% &=\frac{1}{k} \mathbb{E}_{X \sim R_{y,t}}[ \ell_\chi(P,Q)^\frac{1}{2}]
% \end{align*}



% \end{proof}




\subsubsection{Calibration Reduces Variance of Inverse Probability Estimators}
\label{apdx:variance-reduction}

\begin{theorem}
    Let $P$ be the data distribution, and suppose that $1 - \delta > P(T|X) > \delta$ for all $T, X$ and let $Q$ be a calibrated model relative to $P$. Then $1 - \delta > Q(T|X) > \delta$ for all $T, X$ as well.
\end{theorem}
\begin{proof}[Proof]
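The proof is by contradiction. Suppose $Q(T=1|x) = q$ for some $x$ and $q \leq \delta$. Because $Q$ is calibrated, we have $P(T=1|Q(T=1|X) = q) = q \leq \delta$. However, $P(T=1|Q(T=1|X) = q)$ is an average of $P(T=1|X)$ over the inputs $X$ with $Q(T=1|X) = q$, and $P(T=1|X) > \delta$ for every $X$, so this average must exceed $\delta$, which is a contradiction. The case $q \geq 1-\delta$ is symmetric, and the bounds for $T=0$ follow since $Q(T=0|X) = 1 - Q(T=1|X)$.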
\end{proof}
\sd{Above proof from main paper is already good?}

\subsubsection{Calibration Improves the Accuracy of Causal Effect Estimation}
\label{apdx:cal-improves-accuracy}
\begin{theorem}
% A model that is separable and also calibrated achieves accurate causal effect estimation
The error of an IPTW estimator with propensity model $Q$ tends to zero as $n \to \infty$ if:
% A model $Q$ achieves accurate causal effect estimation with the IPTW estimator if
\begin{enumerate}
    \item Separability holds, i.e., $\forall X_1, X_2 \in \mathcal{X}, P(T|X_1) \neq P(T|X_2) \implies Q(T|X_1) \neq Q(T|X_2)$
    \item The model $Q$ is calibrated, i.e., $\forall q \in (0, 1), P(T=1|Q(T=1|X)=q)=q$
\end{enumerate}
\end{theorem}
% Idea: use this bound on the calibration error.
% First, we define
% $$
% \tau = P(Y=1 | do(T=1)) = \sum_X P(Y=1 | X, do(T=1)) P(X) = \sum_X P(Y=1 | X, T=1) P(X)
% $$
% as the true treatment effect. We define
% $$
% \tau_0 = P(Y=1 |T=1) = \sum_X P(Y=1 | X, T=1) P(X|T=1) \propto \sum_X P(Y=1 | X, T=1) P(T=1|X) P(X)
% $$
% to be the naive (biased) estimate. We define
% $$
% \hat\tau(Q) = \sum_X P(Y=1 | X, T=1) \frac{P(X|T=1)}{Q(X|T=1)} \propto \sum_X P(Y=1 | X, T=1) \frac{P(T=1|X)}{Q(T=1|X)} P(X)
% $$
% as being the propensity score estimate with model $Q$. Clearly, if $Q=P$ we get the correct value. If not, we can consider the error
% $$E = (\hat\tau(P) - \hat\tau(Q))^2. $$
% We can form a bound on $E$ as
% \begin{align*}
% E 
% & = (\hat\tau(P) - \hat\tau(Q))^2 \\
% & \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \left( 1- \frac{P(T=1|X)}{Q(T=1|X)} \right)^2, \\
% & \leq \approx \sum_X P(Y=1 | X, T=1) P(X) \ell(P,Q)
% \end{align*}
% which is a type of expected Chi-Squared divergence between $P, Q$, a type of proper score. Thus when $P = Q$, we get zero error, and otherwise we get a bound.



% We have treatment $T \in \{0, 1\}$, observed covariates $X \in \mathcal{X}$ and scalar outcome $Y \in \mathcal{R}$. $P(T=1|X)$ is the true treatment assignment mechanism and we train a propensity score model $Q(T=1|X)$. 

% For $X_1, X_2 \in \mathcal{X}$ such that $X_1 \neq X_2$, we assume that if $P(T=1|X_1) \neq P(T=1|X_2)$ then $Q(T=1|X_1) \neq Q(T=1|X_2)$. 
\begin{proof}
We prove this for discrete inputs at first and then prove it for continuous inputs.

{\textbf{Discrete Input Space.}}    

If our input space $\mathcal{X}$ is discrete, then the number of distinct values that $Q(T=1|X)$ can take is countable. Let us assume that $Q(T=1|X)$ takes values $\{q_i\}_{i=1}^M$. Thus, we can partition $\mathcal{X}$ into buckets $\{B_i\}_{i=1}^{M}$ such that $B_i = \{X| Q(T=1|X) = q_i\}$. Due to separability, we have $\forall X_1, X_2 \in \mathcal{X}, Q(T|X_1) = Q(T|X_2) \implies P(T|X_1) = P(T|X_2)$. Thus, we have $\forall i, \forall X_1, X_2 \in B_i, Q(T=1|X_1)=Q(T=1|X_2),$ and $  P(T=1|X_1)=P(T=1|X_2).$

Let us assume that for each bucket $B_i$, our true propensity $P(T=1|X)$ is $p_i$, i.e, if $X \in B_i$ then $Q(T=1|X)=q_i$ and $P(T=1|X)=p_i$.


% Thus, the model $Q(T=1|X)$ outputs values $\{q_i\}_{i=1}^{M}$ and true propensity $P(T=1|X)$ takes values $\{p_i\}_{i=1}^{M}.$ 

Assuming positivity, $0 < p_i < 1$.

Now, for all $i$, we can write 
\begin{align*}P(T=1|Q(T=1|X)=q_i) &= P(T=1| X \in B_i) \\ &= p_i.
\end{align*}

If $Q$ is calibrated, then by definition $p_i=q_i$. 

\sd{Below is not necessary if we directly claim that P(T|X) and Q(T|X) are identical}

Now, we can write the expression for ATE $\tau$ as  
\begin{align*}
   {\tau} = \hat{\tau}(P) &= \mathbb{E}_{Y, T, X}[\frac{TY}{P(T=1|X)} - \frac{(1-T)Y}{(1-P(T=1|X))}] \\
   &=\sum_{i=1}^{M}
   P(X \in B_i) \mathbb{E}_{Y, T \mid X \in B_i} \left(\frac{TY}{p_i} - \frac{(1-T)Y}{(1-p_i)}\right) \\
   % &=\sum_{i=1}^{N}
   % P(X \in B_i) E_{T, Y | X \in B_i}\left(\frac{TY}{p_i} - \frac{(1-T)Y}{(1-p_i)}\right) \\
\end{align*}
Using our propensity score model $Q(T=1|X)$, we estimate $\hat{\tau}$ as  
\begin{align*}
   \hat{\tau}(Q) &= \mathbb{E}_{Y, T, X}[\frac{TY}{Q(T=1|X)} - \frac{(1-T)Y}{(1-Q(T=1|X))}] \\
   &=\sum_{i=1}^{M}
   P(X \in B_i) \mathbb{E}_{Y, T \mid X \in B_i} \left(\frac{TY}{q_i} - \frac{(1-T)Y}{(1-q_i)}\right) \\
   % &=\sum_{i=1}^{N}
   % P(X \in B_i) E_{X \in B_i}(\frac{TY}{q_i} - \frac{(1-T)Y}{(1-q_i)}) \\
\end{align*}



If our model $Q$ is calibrated, then $p_i = q_i$. Hence, $0 < q_i < 1$ and $\hat{\tau}$ is well-defined. Also, $\tau = \hat{\tau}(P) = \hat{\tau}(Q)$. 

When our observational data contains $n$ units, the IPTW estimator based on model $Q(T=1|X)$ is $\hat{\tau} = \frac{1}{n} \sum_{i=1}^n \left(\frac{T^{(i)} Y^{(i)}}{Q(T=1|X^{(i)})} - \frac{(1-T^{(i)})Y^{(i)}}{1-Q(T=1|X^{(i)})}\right).$
As $n \rightarrow \infty,$ the law of large numbers gives $\hat{\tau} \to \hat{\tau}(Q)$, and we can thus say that

$\hat{\tau} \to \hat{\tau}(Q) = \hat{\tau}(P) = \tau$. 

% Thus, for $\mathcal{X}$ is discrete, we have a 

%if $\forall X_1 \neq X_2, P(T=1|X_1) \neq P(T=1|X_2) \implies Q(T=1|X_1) \neq Q(T=1|X_2)$ and  our model $Q$ is perfectly calibrated, then we can obtain unbiased estimate of treatment effect using the IPTW estimator.
% \newline

% \newline
{\textbf{Continuous Input Space.}}    

\sd{Can retain the proof just for continuous X to make it concise.}

When $X$ is continuous, the number of buckets can be uncountable. The buckets can now be formed as $B_q = \{X | Q(T=1|X)=q\}, \forall q \in [0, 1]$. It is easy to see that $\{B_q\}_{q \in [0, 1]}$ partitions $\mathcal{X}$. Note that $B_q$ can be empty if there exists no $X$ such that $Q(T=1|X)=q$.

Due to separability, $\forall X_1, X_2 \in \mathcal{X}, Q(T|X_1) = Q(T|X_2) \implies P(T|X_1) = P(T|X_2)$. 
Thus, for all $q$, $P(T=1|X)$ takes on a unique value for all $X \in B_q$, i.e., $\forall q \in [0, 1], P(T=1|X \in B_q) = f(q),$ where function $f: [0,1] \rightarrow [0,1]$.

Hence, we can write 
\begin{align*}
\forall q \in [0, 1], P(T=1| Q(T=1|X)=q) &= P(T=1| X \in B_q) \\
&=f(q).
\end{align*} 
When model $Q(T=1|X)$ is calibrated by our definition, then $\forall q \in [0, 1], q = f(q).$

Therefore, $\forall q \in [0,1], Q(T=1|X \in B_q) = q = f(q) = P(T=1| X \in B_q)$.

Since $\{B_q\}_{q \in [0, 1]}$ partitions $\mathcal{X}$, we have $\forall X \in \mathcal{X}, P(T=1|X) = Q(T=1|X)$. Thus, $\hat{\tau} (P) = \hat{\tau} (Q)$.

\end{proof}
 
% $$\hat{\tau} = \frac{1}{M} \sum_{i=0}^M (\frac{T_i Y_i}{Q(T=1|X_i)} - \frac{(1-T_i)Y_i}{1-Q(T=1|X_i)})$$

% Other ideas
% \begin{enumerate}
%     \item Bounds on the ratio $\tau'/\tau$ when a subset of buckets results in the same outcome $Q(T=1|X)$ (calibrated but not sharp for a subset of buckets)
%     \item When X is continuous, we might still derive the above when the number of buckets is countable. When buckets are uncountable, can we redo this proof using integral over buckets? Might need to add some conditions
%     \item Bounding $\tau'/\tau$ using upper bound on calibration error $\sum_{i=1}^N w_i|p_i - q_i|^2 = \sum_{i=1}^N P(X_i \in B_i)|p_i - q_i|^2 $
%     \item Calibration error as function of dataset size and relating that with $\tau'/\tau$
% \end{enumerate}
\subsection{Algorithms for Calibrated Propensity Scoring}
\sd{I am taking the following from ICML2022 paper and have also cited it.}
\label{apdx:algorithms-calibrated}
\subsubsection{Asymptotic Calibration Guarantee}
\label{apdx:asymptotic-calibration}

\begin{theorem}%[Calibration]
The model $R \circ Q$ is asymptotically calibrated and
the calibration error $\mathbb{E}[L_c(R \circ Q,S)] < \delta$ for $\delta = o(m^{-k}), k>0$ w.h.p.
% $\mathcal{L}_C \leq \epsilon$ when $T \to \infty$.
%\vk{todo: copy statement and proof to appendix}
\end{theorem}
\begin{proof}
    Any proper loss can be decomposed as: 
    proper loss = calibration - sharpness + irreducible term~\citep{guo2017calibration}. The calibration term consists of the error $\mathbb{E}[L_c(R \circ Q,S)]$. The sharpness and irreducible term can be represented as the refinement term $\mathbb{E}(L_r(S))$. \citet{kullflach2015novel} show that the refinement term can be further divided as $\mathbb{E}(L_r(S)) = \mathbb{E} (L_g(S, B \circ Q)) + \mathbb{E}(L(B \circ Q, Y)).$ Here, $B$ is the Bayes optimal recalibrator $P(T=1|Q(T=1|X))$ and $S$ is $P(T=1|R \circ Q).$ 

    As described by~\citet{pmlr-v162-kuleshov22a}, if we solve Task~\ref{ass:density}, we have 
\begin{flalign*}
\mathbb{E}(L(B \circ Q, Y)) & \leq \mathbb{E}(L(R \circ Q, Y)) \leq \mathbb{E}(L(B \circ Q, Y)) + \delta(m)&&\\
\text{Using \citet{gneiting2007probabilistic}}, & \text{~\citet{kullflach2015novel} we decompose $\mathbb{E}(L(R \circ Q, Y))$} &&\\
\implies \mathbb{E}(L(B \circ Q, Y)) &\leq \mathbb{E}(L_c(R \circ Q, S)) + \mathbb{E}(L_g(S, B \circ Q)) + \mathbb{E}(L(B \circ Q, Y)) \leq \mathbb{E}(L(B \circ Q, Y)) + \delta(m)&&\\
\implies \mathbb{E}(L_c(R \circ Q, S)) & + \mathbb{E}(L_g(S, B \circ Q)) \leq  \delta(m)&&\\
\implies \mathbb{E}(L_c(R \circ Q, S)) & \leq  \delta(m)&&\\
\end{flalign*}
    Thus, solving Task~\ref{ass:density} allows us to obtain asymptotically calibrated $R\circ Q$ such that the calibration error is bounded as $\mathbb{E}[L_c(R \circ Q,S)] < \delta(m)$.  

\end{proof}
\subsubsection{No-Regret Calibration}
\label{apdx:no-regret}
\begin{theorem}
\label{lem:loss}
The recalibrated model has asymptotically vanishing regret relative to the base model: $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(Q,T)] + \delta,$ where $\delta > 0$ and $\delta = o(1)$ as $m \to \infty$. % is a bound that decreases with $m$. 
% \vk{todo: proof to appendix}
\end{theorem}

\begin{proof}[Proof]
%The claim holds by empirical risk minimization. Since $R \circ H$ minimizes $L$, but is more expressive than $H$ and $R$ can represent the identity map (by Assumption \ref{ass:density}).
Solving Task \ref{ass:density} implies $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(B \circ Q,T)] + \delta \leq \mathbb{E}[L(Q,T)] + \delta$; the second inequality holds because a Bayes-optimal $B$ has lower loss than an identity mapping~\citep{pmlr-v162-kuleshov22a}.
\end{proof}
\sd{Above proof from main paper is already good?}


\sd{Submitting the following section (slightly edited)}


