\section{ALGORITHMS FOR CALIBRATED PROPENSITY SCORING}\label{sec:algorithms}

% \vk{need to clean up because material is taken from icml22}

\subsection{A Framework for Calibrated Propensity Scoring}
% \begin{wrapfigure}{R}{0.52\textwidth}
\begin{figure}
% \vspace{-1cm}
\begin{minipage}{0.52\textwidth}
  \begin{algorithm}[H]
    \caption{Calibrated Propensity Scoring}
    \label{alg:cal_prop_scoring}
    % \begin{algorithmic}
      % \STATE 
      1. Split $\mathcal{D}$ into training set $\mathcal{D}'$ and calibration set $\mathcal{C}$ 
      % \sd{note: we use cross-val splits}\vk{no problem}
      \\
      2. Train a propensity score model $Q(T|X)$ on $\mathcal{D}'$ \\
      3. Train recalibrator $R$ over output of $Q$ on $\mathcal{C}$ \\
      4. Apply IPW with $R \circ Q$ as prop.~score model
    % \end{algorithmic}
  \end{algorithm}
\end{minipage}
\vspace{-0.5 cm}
\end{figure}


Next, we propose algorithms that produce calibrated propensity scoring models. Our approach is outlined in Algorithm~\ref{alg:cal_prop_scoring}; it differs from standard propensity scoring methods by the addition of a post-hoc recalibration step (step~3) after training the model $Q$.
% which recalibrates
% augmenting algorithms have the structure outlined in : they follow the structure
% We first introduce a framework for calibrated propensity scoring---a three-step process that relies on a calibration subroutine (Algorithm \ref{alg:cal_prop_scoring}).



The recalibration step 
in Algorithm \ref{alg:cal_prop_scoring} implements a post-hoc recalibration procedure \citep{Platt99probabilisticoutputs,kuleshov2018accurate} and is outlined in Algorithm \ref{alg:recalibrate}.  
The key idea is to learn an auxiliary model $R : [0,1] \to [0,1]$ such that the joint model $R \circ Q$ is calibrated. 
Below, we argue that if $R$ can approximate the density $P(T=1|Q(T|X)=p)$, then $R \circ Q$ will be calibrated \citep{kuleshov2018accurate,pmlr-v162-kuleshov22a}.

Learning $R$ that approximates $P(T=1|Q(T|X)=p)$ requires specifying (1) a model class for $R$ and (2) a learning objective $L$.
One possible model class for $R$ is non-parametric kernel density estimators over $[0,1]$; their main advantage is that they can provably learn the one-dimensional conditional density $P(T=1|Q(T|X)=p)$. Examples of such algorithms are RBF kernel density estimation and isotonic regression.
Alternatively, one may use a family of  parametric models for $R$: e.g., logistic regression, neural networks.
Such parametric recalibrators can be implemented easily within deep learning frameworks and work well in practice, as we later demonstrate empirically.

Our learning objective for $R$ can be any proper scoring rule
% A natural choice of learning objective between $R$ and $P$ is a proper scoring rule 
such as the L2 loss, the log-loss, or the Chi-squared loss.
Optimizing it is a standard supervised learning problem. 
% \begin{wrapfigure}{R}{0.5\textwidth}
\begin{figure}
% \vspace{-0.5cm}
\begin{minipage}{0.5\textwidth}
  \begin{algorithm}[H]
    \caption{Recalibration Step}
    \label{alg:recalibrate}
      \textbf{Input:}
    Pre-trained model $Q : \mathcal{X} \to [0,1]$, recalibrator $R : [0,1] \to [0,1]$, calibration set $\mathcal{C}$\\
  \textbf{Output:}
    Recalibrated model $R \circ Q : \mathcal{X} \to [0,1]$.
   \begin{enumerate}
%    \item Fit the base model on $\mathcal{D}$: $\min_{H} \sum_{(x, y) \in \mathcal{D}} L(H(x), y)$
    \item Create a recalibrator training set: \\$\mathcal{S} = \{ (Q(x), y) \mid (x, y) \in \mathcal{C}\}$
    \item Fit the recalibration model $R$ on $\mathcal{S}$: \\
    $\min_{R} \sum_{(p,y) \in \mathcal{S}} L\left(R(p), y\right)$ 
   \end{enumerate}
  \end{algorithm}
\end{minipage}
\vspace{-0.5cm}
\end{figure}
% \begin{itemize}
%     \item Provide an algorithm for propensity scoring (as a separate float) that contains recalibration as subroutine
%     \item Then provide an algorithm (as a separate float) for recalibration, where the recalibrator is a free parameter.
%     \item Discuss different choices of recalibrator.
%     \begin{enumerate}
%         \item  parameteric: LR, NB, NN. Mention pros and cons.
%         \item non-parametric: KNN, IR
%     \end{enumerate}
% \end{itemize}

\subsection{Ensuring Calibration in Propensity Scoring Models}

Next, we seek to show that Algorithms \ref{alg:cal_prop_scoring} and \ref{alg:recalibrate} provably yield a calibrated model $R \circ Q$. This shows that the desirable property of calibration can be maintained in practice.

% separate this out into a proof section and an interpretation (calibration is a free lunch) section?
% \subsection{Calibration is a Free Lunch}

% Next, we prove that Algorithm \ref{alg:recal} indeed produces models that perform calibrated risk minimization. 

% We start with some notation. 
% Let $L$ be a proper loss and let $L_c$ be its associated calibration loss derived from the calibration-reliability decomposition of $L$. 
% Let $\mathcal{L}^{(H)}, \mathcal{L}^{(H)}_c$ denote the expected values of $L, L_c$ for a classifier $H$:
% \begin{align*}
%     \mathcal{L}^{(H)} = \mathbb{E}_{X,Y}[L(H(X),Y]
%     & & \mathcal{L}_C^{(H)} = \mathbb{E}_{X,Y}[L_C(H(X),Q],
% \end{align*}
% where $Q(y) := \mathbb{P}(Y=y \mid H(X))$ is the conditional distribution of $Y$ given a forecast of $H(X)$. Let $\mathcal{L}^*$ denote the optimal value of $\mathcal{L}^{(H)}$.
\paragraph{Notation}
% We start with some notation. 
We have a calibration dataset $\mathcal{C}$ of size $m$ sampled from $P$ and we train a recalibrator $R : [0,1] \to [0,1]$
over the outputs of a base model $Q$ to minimize a proper loss $L$. We denote the Bayes-optimal recalibrator by $B := P(T=1\mid Q(X))$;
the probability of $T=1$ conditioned on the forecast $(R \circ Q)(X)$ is $S := P(T=1 \mid (R \circ Q)(X))$.
% We are interested in the expectations of 
% various losses over $X,T$; 
To simplify notation, we omit the variable $X$
when taking expectations over $X,T$,
e.g., $\mathbb{E}[L(R \circ Q,T)] = \mathbb{E}[L(R(Q(X)),T)]$.

Our first claim is that if we can perform density estimation, then we can ensure calibration.
We first formally define the task of density estimation.
\begin{task}[Density Estimation]
\label{ass:density}
The model $R$ approximates the density $B := P(T=1\mid Q(X))$. The expected proper loss of $R \circ Q$ tends to that of $B \circ Q$ as $m \to \infty$ such that w.h.p.:
% The recalibrator $R$ learns a Bayes-optimal density by minimizing the proper loss $L$. 
% Given a dataset of size $T$, and a proper loss $L$ the recalibrator $R$ learns to approximate 
% The Bayes-optimal recalibrator $B(Y=y\mid H(X))$ such that w.h.p.~we have
\begin{align*}
\mathbb{E}[L(B\circ Q,T)] 
\leq \mathbb{E}[L(R \circ Q,T)] < \mathbb{E}[L(B \circ Q,T)] + \delta
    % L^* \leq \mathbb{E}[L((R\circ H)(X),Y)] \leq L^* + \delta
\end{align*}
% \mathbb{E}[L(R(H(X)),Y)]
where $\delta> 0$, $\delta = o(m^{-k}), k>0$ is a bound that decreases with $m$.% and $\mathbb{E}[L(B\circ H,Y)]$ is the irreducible loss. 
% \sd{should this be $o(m^{-k})$ where k is positive?}
\end{task}

% Although parametric neural network density estimators are not necessarily guaranteed to solve this task (due to non-convexity), we demonstrate empirically that in practice they do, and also one can evaluate their performance on a held out dataset.
Note that non-parametric kernel density estimation is formally guaranteed to solve one-dimensional density estimation given enough data.
%For some recalibrators, e.g., neural nets, it may not provably hold (e.g., because of non-convexity). However, neural networks are effective density estimators in practice, and we can quantify whether they estimate density well on a hold-out set.
%This assumption provably holds for many non-parametric density estimation methods.

\begin{fact}[\citet{wasserman2004all}]
When $R$ implements kernel density estimation and $L$ is the log-loss, Task~\ref{ass:density} is solved with $\delta=o(1/m^{2/3})$.
\end{fact}
% such as kernel density estimation \citep{wasserman2006all}. It may even hold when $R$ is a sufficiently expressive neural network, although we cannot prove it.
% Note that rate of $\delta$ is good when dimensionality is low.

%We now prove two key lemmas. We show that Algorithm \ref{alg:recal} outputs calibrated forecasts without reducing the performance of the base model, as measured by regret relative to loss $L$.

We now show that when we can solve Task~\ref{ass:density}, our approach yields models that are asymptotically calibrated in the sense that their calibration error tends to zero as $m \to \infty$.

\begin{theorem}%[Calibration]
\label{lem:calibration}
The model $R \circ Q$ is asymptotically calibrated and
the calibration error $\mathbb{E}[L_c(R \circ Q,S)] < \delta$ for $\delta = o(m^{-k}), k>0$ w.h.p.
% $\mathcal{L}_C \leq \epsilon$ when $T \to \infty$.
%\vk{todo: copy statement and proof to appendix}
\end{theorem}
See Appendix~\ref{apdx:asymptotic-calibration} for the full proof. 
% \begin{proof}
% Recall that the loss $\mathbb{E}[L(R \circ H,Y)]$ decomposes into a sum of calibration and refinement terms $\mathbb{E}[L_c(R \circ H,Q)] + \mathbb{E}[L_r(Q)]$ where $Q(y) := \mathbb{P}(Y=y \mid (R \circ H)(X))$.

% As shown by \citet{kull2015novel}, refinement further decomposes into a group loss and an irreducible term:
% $\mathbb{E}[L_r(Q)] = \mathbb{E}[L_g(Q,B\circ H)] + \mathbb{E}[L(B\circ H,Y)],$
% where $B(Y=y\mid H(X))$ is the Bayes-optimal recalibrator. The form of the group loss $L_g$ is the same as that of $L_c$.
% We may then write:
% \begin{align*}
% & \underbracket{\mathbb{E}[L(B\circ H,Y)]}_\text{Bayes-Optimal Loss} \\
% & \leq \underbracket{\mathbb{E}[L_c(R \circ H,Q)]}_\text{Calibration Loss} + \underbracket{\mathbb{E}[L_g(Q,B\circ H)]}_\text{Group Loss} + \underbracket{\mathbb{E}[L(B\circ H,Y)]}_\text{Bayes-Opt Loss} \\
% & = \underbracket{\mathbb{E}[L(R \circ H,Y)]}_\text{Proper Loss} 
% < \underbracket{\mathbb{E}[L(B \circ H,Y)]}_\text{Bayes-Optimal Loss} + \delta
%     % L^* \leq \mathbb{E}[L((R\circ H)(X),Y)] \leq L^* + \delta
% \end{align*}
% where $\delta>0, \delta=o(m)$. 
% In the first equality we used the decomposition of \citet{kull2015novel} and in the last inequality we used Assumption \ref{ass:density}.
% It follows that $\mathbb{E}[L_c(R \circ H,Q)] < \delta$, i.e. the calibration loss is small.
% \end{proof}


% Can have a similar structure to the ICML22 paper. May establish bound that says that calibration will be ensured when the density estimation problem can be solved. Then show that KNN can solve the density estimation problem and thus get calibration at a certain rate.

\subsection{No-regret Calibration}

% Show that calibration does not make the chi-squared loss worse, which doesn't make the bound on the estimator error worse.

Next, we show that Algorithms \ref{alg:cal_prop_scoring} and \ref{alg:recalibrate} produce a model $R \circ Q$ that is asymptotically just as good as the original $Q$
as measured by the proper loss $L$.
% In other words, our calibration procedure can never make things worse.

\begin{theorem}
\label{lem:loss}
The recalibrated model has asymptotically vanishing regret relative to the base model: $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(Q,T)] + \delta,$ where $\delta > 0$, $\delta = o(m^{-k}), k>0$. % is a bound that decreases with $m$. 
% \vk{todo: proof to appendix}
\end{theorem}

\begin{proof}[Proof (Sketch)]
%The claim holds by empirical risk minimization. Since $R \circ H$ minimizes $L$, but is more expressive than $H$ and $R$ can represent the identity map (by Assumption \ref{ass:density}).
Solving Task~\ref{ass:density} implies $\mathbb{E}[L(R \circ Q,T)] \leq \mathbb{E}[L(B \circ Q,T)] + \delta \leq \mathbb{E}[L(Q,T)] + \delta$; the second inequality holds because the Bayes-optimal recalibrator $B$ has expected loss no higher than that of the identity mapping.
\end{proof}

 See Appendix~\ref{apdx:no-regret} for the full proof. Thus, given enough data, we are guaranteed to produce calibrated forecasts and preserve base model performance as measured by $L$ (including $L_\chi$ used in our calibration bound).
% In particular, if $L$ is chosen to be the chi-squared loss, we will not worsen our bound on the calibration error.

% Note that our analysis provides {\em finite-sample} and not only asymptotic bounds on the regret and calibration error---the bounds are stated in terms of $\delta$, which is $o(m)$. The bound $\delta$ on the calibration error directly depends on the finite-sample bound on the generalization error of the algorithm used as the recalibrator.



% \subsection{Doubly Robust Estimation and Calibrated Propensities}
% Due to sensitivity of the IPTW estimator toward mis-specification of propensity score model, ~\citet{Robins2000-kz} propose doubly robust Augmented Inverse Propensity Weighted (AIPW) estimator for ATE. The AIPW estimate is asymptotically unbiased when either the treatment assignment (propensity) model or the outcome model is well-specified, but this assumption is rarely satisfied in real world. Following the ideas 
% % We define the outcome model as $f(X=x, T=t)$ to approximate the outcome $Y[X=x, T=t]$ as defined in Section~\ref{sec:background}.

% % With this, we define the AIPW estimator as $\hat{\tau} = \frac{1}{n}\sum_{i=1}^n \Bigg[f(X_i, T=1) - f(X_i, T=0) + \frac{T_i (Y_i-f(X_i, T=1))}{e(X_i)} - \frac{(1-T_i)(Y_i-f(X_i, T=0))}{1-e(X_i)}\Bigg].$


% \sd{to write}
