\section{EMPIRICAL EVALUATION}\label{sec:empirical}

We perform experiments on several observational studies to evaluate calibrated propensity score models. We cover different types of treatment assignment mechanisms, base propensity score models, and varying dimensionality of observed covariates. 
% \begin{wrapfigure}{r}{4 cm}
% \centering
% \vspace{1 cm}
% \includegraphics[scale=0.18]{images/calib_curve_simA.png}
% % \includegraphics[scale=0.18]{images/calib_curve_simB.png}
% % \includegraphics[scale=0.18]{images/calib_curve_simC.png}
% % \includegraphics[scale=0.18]{images/calib_curve_simD.png}

% \caption{Calibration of propensity score model for Drug Effectiveness Study.\sd{Maybe overlap with propensity histograms}} 
% \vspace{-1cm}
% \label{fig:calib_curve_simA}
% \end{wrapfigure}
% \begin{wrapfigure}{r}{6 cm}
\begin{figure}
\centering
% \vspace{-0.3 cm}
\includegraphics[scale=0.25]{images/compare_hist_clipped1.png}
\caption{Recalibrating Propensity Score Model Reduces the Bias in Estimating Treatment Effect from Observational Data.} %The distribution of treatment effect estimates is shown in blue when using RCT and in yellow when using observational data directly. Using recalibrated propensity scores shifts the distribution of treatment effect estimates closest to the blue region.
 
\vspace{-0.5cm}
\label{fig:compare_toy_hist}
\end{figure}


\paragraph{\textbf{Setup.}} 
% We simulate datasets containing covariates $X$, binary treatment $T$ and scalar outcome $Y$. We train a propensity score model using the generated dataset and utilize it to estimate the average treatment effect (ATE). 
We use the Inverse-Propensity Treatment Weight (IPTW) and Augmented Inverse Propensity Weight (AIPW) estimators in our experiments. We compare the estimates obtained through calibrated propensities with several baselines, including estimators based on uncalibrated propensity scores. We use sigmoid or isotonic regression as the recalibrator and utilize cross-validation splits to generate the calibration dataset (Appendix~\ref{apdx:additional-details-on-cal-algorithm}). We measure the performance in terms of the absolute error in estimating the ATE, $\varepsilon_{ATE} = |\hat{\tau} - \tau|$, where $\tau$ is the true treatment effect and $\hat{\tau}$ is our estimated treatment effect. %When $\tau$ is a vector, we use the $l_2$ norm to report $\epsilon_{ATE} = \|\hat{\tau} - \tau\|_2$.

\paragraph{\textbf{Analysis of Calibration}.} We evaluate the calibration of the propensity score model using expected calibration error (ECE), defined as 
${\mathbb{E}_{p \sim Q(T=1|X) }[|P(T=1|Q(T=1|X)=p) - p|]}$, where $Q(T=1|X)$ models the treatment assignment mechanism $P(T=1|X)$. 
To compute ECE, we divide the probabilistic output range $[0, 1]$ into $M$ equal-sized intervals $\{I_1, I_2, \dots, I_M\}$ and form the corresponding buckets ${\{B_i\}_{i=1}^M}$, where ${B_i=\{(X, T, Y) \mid Q(T=1|X) \in I_i\}}$.  The estimated ECE is then computed as \[\text{ECE} = \sum_{i=1}^{M} \frac{|B_i|}{|\bigcup_{j=1}^M B_j|}|\text{avg}_i(B_i) - \text{pred}_i(B_i)|,\] where $\text{avg}_i(B_i)=\sum_{j=1}^{|B_i|} T_j /|B_i|$ and 
${\text{pred}_i(B_i) = \sum_{j=1}^{|B_i|} Q(T=1|X_j) /|B_i|}$. 
%\vk{did we forget to square some of the errors?}


\subsection{Drug Effectiveness Study}
% \begin{table}
% \begin{wraptable}{r}{5.5cm}
% \vspace{-0.8cm}
% \caption{Comparison of different base propensity score models. }
% % \hspace{0.1cm}
% \vspace{0.1cm}
% \centering
% \small
% \begin{tabular}{lccccr}
% \toprule
% Base model & $\varepsilon_{ATE}$(Plain) & $\varepsilon_{ATE}$ (Calib) & \\
% \midrule
%  Log. Reg  & 0.479  {(0.005)} 
%  & 0.091 (0.022)
%  \\
%  MLP & 0.455 (0.042) & 0.027 (0.031)
% \\
% %  Decision Tree & 0.506 (0.003) & 0.000
% % (0.000)
% %  & 0.504
% % (0.003) & 0.000
% % (0.000)\\
% % Adaboost & 0.506 (0.003) 
% %  & 0.504
% % (0.003) \\
%  SVM & 0.485
% (0.004) & 0.454
% (0.013) \\
%  Naive Bayes & 0.471
% (0.003) & 0.021
% (0.018) \\
% \bottomrule

% \end{tabular}
% \vspace{-0.2cm}
% \label{table:comp-basemodels}
% % \end{wraptable}
% \end{table}
% \setlength{\tabcolsep}{2.5pt}
\begin{table*}[ht]
\caption{Recalibrating the Output of Propensity Score Model Results in Lower Error in Estimating Causal Effects. Reduction in ECE ($\Delta (ECE)$) indicates that the calibration of the model improves with our technique. Values of $\varepsilon_{ATE}$ are averaged over 10 experimental repetitions, with the standard error in parentheses.}
%The baselines consist of weighing with plain propensities (Plain), trimmed propensities (Trim), stabilized weights (SW) and covariate balancing (Cov. Bal.).
% \vspace{-0.6cm}
% \caption{Recalibrating Propensities. True ATE in all the simulations below is 0.368.}
%Reduction in ECE ($\Delta (ECE)$) implies that the calibration of the model improves with our technique. Results consisting of  $\varepsilon_{ATE}$ are averaged over 10 experimental repetitions and braces contain the standard error.
\small
\centering


\begin{tabular}{llcccc}
\toprule
% 
%&{Setting} & \multicolumn{4}{c}{$\varepsilon_{ATE}$} \\%& \multicolumn{4}{c}{PEHE} \\
 & Setting & Sim A & Sim B & Sim C & Sim D \\%& Sim A & Sim B & Sim C & Sim D \\
\midrule
 & Naive & 0.498 (0.003) & 0.223 (0.003) & 0.279 (0.004) & 0.280 (0.006) \\%& 0.263 (0.002) & 0.075 (0.002) & 0.105 (0.002) & 0.103 (0.003)\\
% & T learner & 0.098 (0.005) & 0.200 (0.001) & 0.153 (0.002) &  0.058 (0.006) & 0.017 (0.001) & 0.062 (0.001) & 0.047 (0.001) & 0.035 (0.001)\\
\midrule
 & Plain propensities & 0.348  (0.035) & 0.211 (0.002) & 0.164 (0.002) & 0.075 (0.004) \\%& 0.149 (0.024) & 0.068 (0.001) & 0.052 (0.001) & \textbf{0.031 (0.001)}\\
 & Trimmed~\citep{Lee2011-nv} & 0.481 (0.004) &  0.210 (0.002) & 0.153 (0.002) & 0.074 (0.004) \\%& 0.245 (0.004) & 0.067 (0.001) & 0.046 (0.001) & \textbf{0.031 (0.001)}\\
& Stabilized Wt~\citep{Xu2010-jv}  & 0.422 (0.016) &  0.210 (0.002) & 0.158 (0.003) & 0.078 (0.005) \\%& 0.195 (0.013) & 0.076 (0.002) & 0.114 (0.004) & 0.112 (0.005)\\
\midrule
 & Covariate Balancing~\citep{tan2020regularized} & {0.509 (0.003)} & {\textbf{0.190 (0.002)}}& {0.169 (0.007)} & {0.092 (0.013)} \\%&\multirow{1}{*}{0.280 (0.003)} & \multirow{1}{*}{\textbf{0.056 (0.001)}} & \multirow{1}{*}{0.050 (0.003)} & \multirow{1}{*}{0.107 (0.007)}\\
% & \citet{tan2020regularized} & & & & & & & & \\
\midrule
 & Calibrated (Ours) & 0.107 (0.029) & {0.195 (0.002)} & \textbf{0.148 (0.001)} & \textbf{0.048 (0.010)} \\%& 0.047 (0.010) & \textbf{0.057 (0.001)} & \textbf{0.042 (0.001)} & \textbf{0.032 (0.001)}\\
  & Calibrated + Trimmed &  0.115 (0.028) & \textbf{0.193 (0.002)} & \textbf{0.148 (0.001)} & \textbf{0.048 (0.010)} \\%& 0.049 (0.010) & \textbf{0.057 (0.001)} & \textbf{0.042 (0.001)} &  \textbf{0.032 (0.001)} \\
 & Calibrated + Stabilized Wt & \textbf{0.057 (0.026)} & {0.194 (0.002)} & \textbf{0.148 (0.001)} & \textbf{0.045 (0.009)} \\%& \textbf{0.030 (0.007)} & \textbf{0.057 (0.001)} & \textbf{0.042 (0.001)} &  0.033 (0.001)\\
 \midrule
 & $\Delta(ECE)$ & 0.010 (0.001) & 0.014 (0.001) & 0.025 (0.002) & 0.019 (0.001) \\%& 0.010 (0.001) & 0.014 (0.001) & 0.025 (0.002) & 0.019 (0.001) \\
\bottomrule
\end{tabular}
\vspace{-2mm}
\label{table:toy-expr}
\end{table*}

% \begin{table*}[ht]
% % \vspace{-0.8cm}
% \caption{Recalibrating the Output of Propensity Score Model Results in Lower Error in Estimating Causal Effects. Reduction in ECE ($\Delta (ECE)$) implies that the calibration of the model improves with our technique. Results consisting of  $\varepsilon_{ATE}$ are averaged over 10 experimental repetitions and braces contain the standard error.}
% \hspace{0.1cm}
% \small
% \centering
% \begin{tabular}{lccccccr}
% \toprule
% % 
% Setting &  Naive  & \multicolumn{2}{c}{Plain Propensities} & Cov Balance  & \multicolumn{2}{c}{Uncertainty Recalibration} & $\Delta$(ECE) \\

%  & estimation & Untrimmed  & Trimmed  & \citep{tan2020regularized} &  Untrimmed & Trimmed &  \\
% \midrule
% %  Sim A & 0.498 
% % (0.003) &  0.481 (0.004) & 0.348  (0.035) & 0.024 (0.001) & 0.509 (0.003) &  0.115 (0.028) & 0.107 (0.029) & 0.010 (0.001) \\
% %  Sim B & 0.223 (0.003) &  0.211 (0.002) & 0.210 (0.002) & 0.043 (0.001) &  0.188 (0.002) & 0.195 (0.002) & 0.193 (0.002) & 0.014 (0.001)\\
% %  Sim C &  0.279 (0.004) &  0.164 (0.002) & 0.153 (0.002) & 0.053 (0.001)  &  0.169 (0.007) & 0.148 (0.001) &  0.148 (0.001) & 0.025 (0.002) \\
% %  Sim D &  0.280 (0.006)&   0.075 (0.004) & 0.074 (0.004) & 0.119 (0.001) & 0.048 (0.010) & &  0.048 (0.010)  & 0.019 (0.001)\\

% Sim A & 0.498 
% (0.003)  & 0.348  (0.035) &  0.481 (0.004)  & 0.504 (0.003) & 0.107 (0.029)  &  0.115 (0.028) & 0.014 (0.001) \\
%  Sim B & 0.223 (0.003) &  0.211 (0.002) & 0.210 (0.002) & 0.190 (0.002) & 0.195 (0.002) & 0.193 (0.002) & 0.029 (0.001)\\
%  Sim C &  0.279 (0.004) &  0.164 (0.002) & 0.153 (0.002) &  0.169 (0.007) & 0.148 (0.001) &  0.148 (0.001) & 0.028 (0.002) \\
%  Sim D &  0.280 (0.006)&   0.075 (0.004) & 0.074 (0.004) & 0.092 (0.013) &0.048 (0.010) &   0.048 (0.010)  & 0.100 (0.001)\\
% \bottomrule

% \end{tabular}

% \label{table:toy-expr}
% \end{table*}




% \begin{table}
% % \begin{wraptable}{r}{5.5cm}
% % \vspace{-0.8cm}
% \caption{Comparison of Different Propensity Models.}
% % \hspace{0.8cm}
% \vspace{0.3cm}
% \centering
% \small
% \begin{tabular}{lccccr}
% \toprule
% Base model & $\varepsilon_{ATE}$(Plain) &$\varepsilon_{ATE}$ (Calib)\\
% \midrule
% Log. Reg. & 0.031 (0.003) & 0.016 (0.002)  \\

%  MLP & 0.014 (0.005)& 0.008 (0.003)  \\

%  SVM (Linear)  & 0.032  {(0.005)} &  0.015 (0.003) 
%  \\
%  SVM (RBF) & 0.012 (0.003) & 0.009 (0.004) 
% \\
% % Random Forests & 0.048
% % (0.007) & 0.033
% % (0.007) \\
% Adaboost & 0.039 (0.003)  
%  & 0.022 (0.004) \\
% Naive Bayes & 0.022 (0.004) & 0.017 (0.003)  \\
% %  Decision Tree & 0.506 (0.003) & 0.000
% % (0.000)
% %  & 0.504
% % (0.003) & 0.000
% % (0.000)\\

 
% \bottomrule

% \end{tabular}
% \vspace{-0.2cm}
% \label{table:comp-basemodels}
% % \end{wraptable}
% \end{table}

\begin{table*}[ht]

\caption{Reduction in ATE Estimation Error $\varepsilon_{ATE}$ with Structured and Unstructured Covariates.}
\vspace{0.2cm}
% \hspace{0.1cm}
\small
\centering
\begin{tabular}{lccccr}
\toprule
% % 
% Setting &  Naive  & \multicolumn{2}{c}{Plain Propensities} &  & \multicolumn{2}{c}{Uncertainty Recalibration} & $\Delta$(ECE) \\

%  & estimation &  &  & &   &  &  \\
% \midrule
%  Image Covariate & 0.187 (0.010) & & 0.107 (0.029) &  &  & 0.095 (0.005) & 0.137 (0.046) \\
%  Binary Covariate & 0.176 (0.019) & & 0.052 (0.011) &  & & 0.099 (0.008) & 0.112 (0.029)\\

% 
Setting &  Naive Est.  & {Plain Propensities} &  {Uncertainty Recalibration} & $\Delta$(ECE) \\
\midrule
 Image Covariate & 0.187 (0.010) & 0.107 (0.029) & 0.095 (0.005) & 0.137 (0.046) \\
 Binary Covariate & 0.176 (0.019) & 0.091 (0.011)  & 0.085 (0.008) & 0.112 (0.029)\\
\bottomrule
\vspace{-0.2cm}
\end{tabular}

\label{table:mnist-expr}
\end{table*}


We simulate an observational study of recovery time from disease in response to the administration of a drug~\citep{florian}.  The decision to treat an individual with the drug is dependent on the covariates specified as age, gender, and severity of disease. We use logistic regression as the propensity score model.  In Figure~\ref{fig:compare_toy_hist}, we see that weighting using recalibrated propensities allows us to approximate the distribution of individual treatment effect estimates better than uncalibrated propensities. Here, treatment effect estimates $\tau$ are computed as the ratio $\mathbb{E}[Y[x,1]]/\mathbb{E}[Y[x,0]]$. %In Figure~\ref{fig:compare_prop_scores}, we compare the histogram of propensity scores before and after calibration. 
The true average causal effect is 0.368. Please refer to Appendix~\ref{apdx:drug-effectiveness} for details on the simulation, models used, and calibration plots. 
% \begin{wrapfigure}{r}{6 cm}
% \begin{figure}
% \centering
% % \vspace{-0.5 cm}
% \includegraphics[scale=0.25]{images/prop_histogram_simD.png}
% % \vspace{-0.5cm}

% \caption{Histogram of propensities pre- and post-calibration. Calibration reduces the occurrence of numerically small scores.} %The distribution of treatment effect estimates is shown in blue when using RCT and in yellow when using observational data directly. Using recalibrated propensity scores shifts the distribution of treatment effect estimates closest to the blue region.
 
% \vspace{-1cm}
% \label{fig:compare_prop_scores}
% \end{figure}
% \sd{Explain the tables better - variation in results}

In Table~\ref{table:toy-expr}, we employ different treatment assignment mechanisms in each simulated observational study, allowing us to compare mechanisms that may or may not be well-specified by a linear model, e.g., Simulation C uses the logical AND condition while Simulation D uses the logical XOR condition to assign treatment (Appendix~\ref{apdx:drug-effectiveness}). 
We see that calibrated propensities produce lower absolute error in estimating the average treatment effect ($\varepsilon_{ATE}$) under varying mechanisms. Here, the naive estimation computes the outcomes without weighting the samples with propensities. Uncertainty-calibrated propensities reduce bias more consistently than weighting with plain propensity scores, propensity trimming~\citep{Lee2011-nv}, stabilized weights~\citep{Xu2010-jv}, and regularized covariate balancing optimization of propensity weights~\citep{tan2020regularized}. Since the optimal level of trimming is difficult to determine, trimming can sometimes increase bias, as seen in Simulation A. Similarly, the design of balancing equations impacts bias reduction in the covariate balancing approach~\citep{benmichael2021balancing} to calibration. We use the RCAL package~\citep{tan2020RCAL} to implement the covariate balancing baseline. Table~\ref{table:toy-expr-pehe} in Appendix~\ref{apdx:additional-experiments} reports the PEHE (Precision in Estimation of Heterogeneous Effect) metric for all the experimental settings in Table~\ref{table:toy-expr} and demonstrates similar improvement with calibrated propensities. Table~\ref{apdx:table:comp-basemodels} in Appendix~\ref{apdx:additional-experiments} demonstrates the effectiveness of calibration over six different base propensity models (including logistic regression) that approximate a fixed treatment assignment function. 

% In Table~\ref{table:comp-basemodels}, we also compare a range of base propensity score models where the true treatment assignment function is non-linear logical XOR (Appendix ~\ref{apdx:drug-effectiveness}). We see the benefits of calibration across varying degrees of mis-specification in the base model. After calibration, non-linear MLP and SVM (RBF) show the best $\varepsilon_{ATE}$, while mis-specified linear models like logistic regression also show consistent reduction in $\varepsilon_{ATE}$. Table~\ref{apdx:table:comp-basemodels} in Appendix~\ref{apdx:additional-experiments} contains additional details including ECE as calibration metric and we observe greater reduction in bias ($\varepsilon_{ATE}$) with lowering ECE. 



In summary, calibrated propensities approximate the true distribution of individual treatment effects better and reduce the occurrence of numerically low scores. They reduce the error in ATE estimation across different propensity score models and treatment assignment mechanisms. In real-world observational studies, where we do not know the true treatment assignment mechanism, calibration can be useful to improve the treatment effect estimates from a potentially misspecified model. %\sd{TODO: Also plot relationship between ECE and ATE error OR ECE and covariate balance} 
 




\subsection{Unstructured Image Covariates}
We simulate a simple observational study following~\citet{louizos2017causal} and \citet{deshpande2022deep} such that variables $X, T, Y \sim \mathbb{P}$ are binary and the true ATE is zero. Appendix~\ref{apdx:unstructured-covars} contains a detailed description of this simulation. We also introduce an unstructured image covariate $\mathbf{X}$ that represents $X$ as a randomly chosen MNIST image of a zero or one, depending on whether $X=0$ or $X=1$. Specifically, ${\mathbb{P}(\mathbf{X}|X=1)}$ is uniform~over MNIST images of `1' and ${\mathbb{P}(\mathbf{X}|X=0)}$ is uniform~over MNIST images of `0'.

We use a multi-layer perceptron as the propensity score model and recalibrate its output. In Table~\ref{table:mnist-expr}, we compare the IPTW estimates for ATE using binary $X$ and image $\mathbf{X}$ covariates (with $28 \times 28 = 784$ dimensions). The ECE is higher for the plain propensity score model trained on image covariates, indicating higher miscalibration with increasing covariate dimensions. We see that recalibration improves ATE estimates on high-dimensional image covariates. 


% \sd{TODO: Add plots to compare the calibrated propensities in the two cases.Explain/provide more intuition into results/why this experiment is useful}
\begin{table*}[ht]
% \vspace{-0.8cm}
\caption{GWAS with Calibrated Propensities. We compare IPTW and AIPW estimates using calibrated propensity scores against standard baselines and a specialized GWAS analysis system (LMM/LIMIX).}
% \hspace{0.1cm}
\vspace{0.2cm}
\centering
\small
\begin{tabular}{lcccccc}
\toprule
Dataset	& Spatial & 	Spatial & 	Spatial & 	HGDP	& TGP \\
& ($\alpha$=0.1)& 	 ($\alpha$=0.3)&  ($\alpha$=0.5)&  	& \\
\midrule
Naive	& 16.23 (0.91)	& 11.76 (0.84)	& 9.81 (0.69)& 	11.82 (0.11)	&  12.24 (0.71) \\
PCA	& 9.60 (0.37)	& 9.54 (0.41)	& 9.38 (0.38) & 	 	11.69 (0.20) & 	10.73 (0.38) \\
FA	& 9.55 (0.34) & 9.53 (0.44) & 	9.23 (0.30) & 11.65 (0.16)	& 10.59 (0.32) \\
LMM	 & 10.24 (0.41) & 9.58 (0.45) & \textbf{8.15 (0.40)} & \textbf{10.09 (0.35)} & \textbf{9.44 (0.57)} \\
\midrule
IPTW (Calib) 	& \textbf{8.13 (0.35)} & 	\textbf{8.69 (0.56)} & 	\textbf{8.32 (0.34)} & 	10.86 (0.13) & 	\textbf{9.57 (0.58)} \\
IPTW (Plain) & 12.56 (1.25) & 10.22 (0.81) & 9.09 (0.48) & 11.62 (0.12)	& 11.76 (0.86) \\
AIPW (Calib)	& 8.94 (0.29)	& 9.00 (0.58)& 	8.59 (0.39)  & 	11.06 (0.12) & 	10.32 (0.43) \\
AIPW (Plain)	& 13.89 (0.76) & 	10.46 (0.72) & 	8.99 (0.51)	&  11.38 (0.11)	& 11.56 (0.65) \\
$\Delta_{ECE}$ & 0.022 (0.001) & 0.016 (0.007) & 0.015 (0.001) & 0.011 (0.001)& 0.022 (0.001) \\
\bottomrule
\vspace{-0.2cm}
\end{tabular}

\label{table:gwas-basic}
\end{table*}
\subsection{Genome-Wide Association Studies}

Genome-Wide Association Studies (GWASs) attempt to estimate the treatment effect of genetic mutations (called SNPs) on individual traits (called phenotypes) from observational datasets. Each SNP acts as a treatment. Confounding occurs because of hidden ancestry: individuals with shared ancestry have correlated genes and phenotypes.

The key takeaways can be summarized as follows. First, \emph{recalibration enables off-the-shelf IPTW and AIPW estimators to match or outperform a state-of-the-art GWAS analysis system} (LMM/LIMIX; see Tables~\ref{table:gwas-basic} and~\ref{table:gwas-increase-causal}). Second, our method \emph{enables the use of propensity score models that would otherwise be unusable} due to the poor quality of their uncertainty estimates (e.g., Naive Bayes; see Table~\ref{table:gwas-nb-vs-lr}). Third, leveraging new types of propensity score models that are fast to train (such as Naive Bayes) \textbf{improves the speed of GWAS analysis by more than two-fold} (see Table~\ref{table:gwas-nb-vs-lr}).

\paragraph{Setup}
We simulate the genotypes and phenotypes of individuals following a range of standard models as described in Appendix~\ref{apdx:sim-gwas}. The outcome is simulated as 
$Y = \beta^T G + \alpha^T Z + \epsilon,$
where $G$ is the vector of SNPs, $Z$ contains the hidden confounding variables, $\epsilon$ is Gaussian noise, $\beta$ is the vector of treatment effects corresponding to each SNP, and $\alpha$ holds the coefficients for the hidden confounding variables. 
%
 We assume that the aspect of hidden population structure in $Z$ that needs to be controlled for is fully contained in the observed genetic data to ensure ignorability~\citep{Lin2011-tv}.
To estimate the average marginal treatment effect corresponding to each SNP, we iterate successively over the vector of SNPs such that the selected SNP is treatment $T$ and all the remaining SNPs are covariates $X$ for predicting the phenotypic outcome $Y$. The result is a vector of estimated treatment effects $\hat{\beta}$ corresponding to the vector of SNPs. We measure $\varepsilon_{ATE}$ as the $\ell_2$ norm of the difference between the true and estimated marginal treatment effect vectors. 

We use calibrated propensity scores with the IPTW and AIPW estimators to compute these treatment effects. We compare the performance of these estimators with standard methods to perform GWAS, including Principal Components Analysis (PCA)~\citep{price2006pca, price2010new}, Factor Analysis (FA), and Linear Mixed Models (LMMs)~\citep{yu2006unified, lippert2011fast}, implemented in the popular LIMIX library~\citep{lippert2014limix}. Unless mentioned otherwise, 1\% of total SNPs are causal and we have 4000 individuals in the dataset. 
% \begin{table*}[!h]
% % \vspace{-0.8cm}
% \caption{We compare the AIPW estimate using calibrated propensities. Our methods unlock the use of certain propensity score models (e.g., Naive Bayes) which only work after recalibration. }
% \hspace{0.1cm}

% \centering
% \small
% \begin{tabular}{lcccccc}
% \toprule
% Dataset  &  Metrics  & LR  & MLP  &  Random Forest  & Adaboost  &  NB \\
% \midrule
% Spatial  & $\varepsilon_{ATE}$ (plain)
%  & 13.886 (0.755)
%  & 17.403 (1.070)
%  & 12.911 (0.612)
%  & 16.234 (0.916)
%  & 582.731 (64.514) \\
%  ($\alpha$=0.1) &  $\varepsilon_{ATE}$ (calib)
%  & 8.942 (0.287)
%  & 14.661  (0.762)
%  & 8.706 (0.322)
%  & 8.524 (0.297)
%  & 8.526 (0.472) \\
%  & $\Delta_{ECE}$ 
%  & 0.022 (0.001)
%  & 0.072 (0.003)
%  & 0.060 (0.001)
%  & 0.252 (0.006)
%  & 0.281 (0.002) \\
%  \midrule
% HGDP
%  &  $\varepsilon_{ATE}$ (plain)
%  & 11.380 (0.110)
%  & 12.358 (0.197)
%  & 11.529 (0.107)
%  & 11.816 (0.108)
%  & 138.086 (5.086) \\
%  &  $\varepsilon_{ATE}$ (calib)
%  & 11.060 (0.120)
%  & 11.198 (0.106)
%  & 11.299 (0.143)
%  & 11.070 (0.123)
%  & 11.430 (0.133) \\
%  & $\Delta_{ECE}$
%  & 0.011 (0.001)
%  & 0.069 (0.002)
%  & 0.053 (0.001)
%  & 0.275 (0.006)
%  & 0.206 (0.003) \\
%  \midrule
% TGP
%  & $\varepsilon_{ATE}$ (plain)
%  & 11.560 (0.650)
%  & 11.965 (0.754)
%  & 11.677 (0.614)
%  & 12.246 (0.713)
%  & 87.329 (5.716)\\
%  & $\varepsilon_{ATE}$ (calib)
%  & 10.320 (0.430)
%  & 11.530 (0.633)
%  & 10.519 (0.402)
%  & 10.244 (0.398)
%  & 9.070 (0.316) \\
%  & $\Delta_{ECE}$
%  & 0.022 (0.001)
%  & 0.061 (0.002)
%  & 0.070 (0.002)
%  & 0.204 (0.007)
%  & 0.267 (0.004) \\
% \bottomrule
% \end{tabular}

% \label{table:gwas-comp}
% \end{table*}


\begin{table*}[!ht]
% \vspace{-0.8cm}
\caption{Increasing Proportion of Causal SNPs. Calibrated propensities reduce the bias in treatment effect estimation across all setups and compare favorably against standard GWAS methods.}
% \hspace{0.1cm}
\vspace{0.2cm}
\centering
\small
\begin{tabular}{lcccccc}
\toprule
Method &	1\% Causal SNPs & 2\% Causal SNPs & 5\% Causal SNPs & 10\% Causal SNPs \\
\midrule
Naive  &	22.408 (5.752) &	15.150 (2.213) &	23.388 (5.021) &	14.846 (2.272) \\
PCA &18.104 (5.378)& 13.699 (2.413) &15.837 (3.331) &	11.683 (0.983) \\
FA & 18.532 (3.641)& 14.166 (2.259) & 16.855 (2.764) & 11.963 (0.958) \\
LMM	 & 17.575 (3.408) & 13.896 (2.152) &	14.681 (3.366)	& 10.108 (0.827) \\
\midrule
IPTW (Calib) &	\textbf{17.237 (3.054)}	& \textbf{13.113 (1.775)} &	\textbf{14.587 (3.432)} &	\textbf{8.625 (0.838)} \\
IPTW (Plain) &	19.297 (3.425) &	14.372 (1.482) &	18.290 (3.788) &	11.859 (0.952) \\
AIPW (Calib) & 17.647 (3.208) & 13.382 (1.676) &	15.166 (3.597) &	9.078 (0.928) \\
AIPW (Plain)& 20.652 (3.286) &	13.720 (1.798) &	21.321 (4.750)	 & 12.904 (1.990) \\
%$\Delta_{ECE}$ 	& 0.0215 (0.000)	& 0.0215 (0.000)	 & 0.0215 (0.000) & 0.0215(0.000) \\
\bottomrule
\vspace{-0.4cm}
\end{tabular}
\label{table:gwas-increase-causal}
\end{table*}


In Table~\ref{table:gwas-basic}, we demonstrate the effectiveness of estimators using calibrated propensities on five different GWAS datasets (Appendix~\ref{apdx:sim-gwas}). Here, we have a total of 100 SNPs. In Table~\ref{table:gwas-increase-causal}, we increase the proportion of causal SNPs for the Spatial simulation and continue to see improved performance under calibration. In Table~\ref{table:apdx: gwas-comp} (Appendix~\ref{apdx:sim-gwas}), we compare five different base models to learn propensity scores over six standard GWAS simulations and show that calibration improves the performance in each case. The performance of plain Naive Bayes as the base propensity score model is very poor owing to the simplistic conditional independence assumptions, but calibration improves its performance significantly. In Table~\ref{table:gwas-nb-vs-lr}, we compare the computational throughput of calibrated Naive Bayes as the propensity score model with logistic regression. Here, we have a total of 1000 SNPs. We see that using calibrated Naive Bayes obtains performance competitive with logistic regression at a significantly higher throughput. Please refer to Appendix~\ref{apdx:additional-experiments} for results on additional GWAS datasets.  %Thus, we can model propensity scores over high-dimensional covariates with simpler models and obtain improved causal estimates at a significantly smaller computational cost by using calibration. 

\begin{table}[h]
% \begin{wraptable}{r}{7cm}
% \vspace{-1cm}
\caption{Calibrated Naive Bayes Yields Lower $\varepsilon_{ATE}$ (IPTW) and Uses Fewer Computational Resources Than Logistic Regression.}
% \hspace{0.1cm}
\vspace{0.2cm}
\centering
\small
\begin{tabular}{lcccccc}
\toprule
Method & 	$\epsilon_{ATE}$ &	Tput (SNPs/sec)  \\
\midrule
% Naive	&		21.349 (3.640) & - \\
% PCA		&		19.186 (3.758)  & -\\
% FA	&	19.016 (3.852)  & -\\
LMM		&	19.908 (3.592)  & -\\
% \midrule
 Calibrated NB 
 & \textbf{18.210 (1.705)} & 47.6 \\
 % & $\epsilon_{ATE}$ & 21.319 (3.704) \\
		
  % \midrule
  Plain NB 
  &  1455.992 (185.084) & 68.6 \\
  % & $\epsilon_{ATE}$ & 336.671 (63.148) \\
	
 % \midrule
 Calibrated LR 
 & 23.618 (3.832) & 19.5  \\
% & $\epsilon_{ATE}$ & 22.795 (4.249) \\

  % \midrule
  Plain LR 
  %& IPTW & 19.297 (3.425)  & 29.309 (2.773)  & 23.525 (4.530) \\
 &  27.921 (4.713) &	20.1\\
		
\bottomrule
\vspace{-0.4cm}
\end{tabular}

\label{table:gwas-nb-vs-lr}
\end{table}




% \paragraph{\textbf{Plots to add}} 
% \begin{enumerate}
%     \item Calibration curves and propensity score histograms for toy example
%     \item Relate/plot calibration error with `balancing' property and error in treatment effect estimate before/after calibration
%     \item Plots similar to blog post in the drug recovery example - should cite
% \end{enumerate}



