\section{Methods}
\label{sec:methods}
% \ar{I feel this section should go before intuition and analysis}
% \ak{Done}

% \begin{algorithm*}[tbp]
% 	\caption{\calens{}}	\label{alg:calens}
% 	\begin{algorithmic}[1]
% 		\small
% 				\Require  in-distribution validation data $\{(\xval_i, \yval_i)\}_{i=1}^{\nval} \sim \Pid$, 
% 		\Statex	\hspace{0.7cm} standard and robust models $\fstd, \frob : \cX \to \R^K$
% \State Calibrate $\fstd$ on ID data: $\Tstd = \argmin_T \frac{1}{\nval} \sum_{i=1}^{\nval} l\Big(\frac{\fstd(\xval_i)}{T}, \yval_i\Big)$ \;
% \State Calibrate $\frob$ on ID data: $\Trob = \argmin_T \frac{1}{\nval} \sum_{i=1}^{\nval} l\Big(\frac{\frob(\xval_i)}{T}, \yval_i\Big)$ \;
% \State Return $\fens$ where $\fens(x) = \fstd(x) / \Tstd + \frob(x) / \Trob$
% 	\end{algorithmic}
% \end{algorithm*}

\begin{algorithm}[tbp]
	\caption{\calens{}}	\label{alg:calens}
	\begin{algorithmic}[1]
		\small
				\Require  in-distribution validation data $\{(\xval_i, \yval_i)\}_{i=1}^{\nval} \sim \Pid$, 
		\Statex	\hspace{0.7cm} standard and robust models $\fstd, \frob : \cX \to \R^K$
\State $\fstdbar = {}$ Calibrate $\fstd$ on in-distribution (ID) data
\State $\frobbar = {}$ Calibrate $\frob$ on in-distribution (ID) data
\State Return $\fens(x) = \frac{1}{2}\big(\softmax(\fstdbar(x)) + \softmax(\frobbar(x))\big)$
	\end{algorithmic}
\end{algorithm}

\paragraph{Proposed method: \calens{}.}
Given a standard model $\fstd$ and a robust model $\frob$, we first calibrate each model on the \emph{in-distribution} validation data, and then average their predicted probabilities (Algorithm~\ref{alg:calens}).
In our experiments, we calibrate using temperature scaling~\citep{guo2017calibration} with the cross-entropy loss $\ell$:
\begin{align}
\Tstd &= \argmin_T \frac{1}{\nval} \sum_{i=1}^{\nval} \ell\Big(\softmax\Big(\frac{\fstd(\xval_i)}{T}\Big), \yval_i\Big), \\
\Trob &= \argmin_T \frac{1}{\nval} \sum_{i=1}^{\nval} \ell\Big(\softmax\Big(\frac{\frob(\xval_i)}{T}\Big), \yval_i\Big).
\end{align}
% \pl{cross entropy loss is not quite right since the first argument needs to be sent through a softmax}
% \ak{good point, changed}
We then ensemble the two models by averaging the probabilities that they predict~\citep{lakshminarayanan2017simple}.
% \pl{/averaging}
\pl{I would have expected geometric average...why does this make sense?}
\begin{equation}
	\fens(x) = \frac{1}{2}\Big( \softmax\Big(\frac{\fstd(x)}{\Tstd}\Big) + \softmax\Big(\frac{\frob(x)}{\Trob}\Big)\Big),
\end{equation}
\ak{Ideally we want softmax on the LHS too, although this still works fine technically}
where the predicted label is $\pred(\fens(x)) = \argmax_y \fens(x)_y$.
% \ar{Should mention this is our proposed method, and perhaps acknowledge in intro that this is very intuitive but we didn't find any reference to a paper that uses calibrated ensembles}
% \ak{added proposed method, will check intro}

% The goal of temperature scaling is to adjust each model's confidence on the in-distribution validation data.

\paragraph{Ablations.}
In Section~\ref{sec:experiments}, we ablate each component of the method, for example the calibration step and the way of combining the models, and we compare to (calibrated) ensembles of two standard models or of two robust models.
% As ablations, we consider a number of additional ways of combining the standard and robust predictions.
% \emph{Vanilla ensembles} skip the calibration step (so use $\Tstd = \Trob = 1$).
% As a potentially stronger baseline, \emph{tuned ensembles} take a weighted sum of the standard and robust models' predictions, where the weight is tuned to maximize accuracy on the in-distribution validation set.
% We also compare adding the model's predictions in logit versus probability space---we found adding the probabilities to work very slightly better, as reported by prior work~\citep{lakshminarayanan2017simple}.
% As a sanity check, we also compare with (calibrated) ensembles of two standard models, or of two robust models.

% \emph{Vanilla Ensembling}: Here we simply return the average of the standard and robust model's probabilities, without temperature scaling.
% \begin{equation}
% \hat{p} = \frac{1}{2} \big( \softmax(\fstd(x)) + \softmax(\frob(x)) \big)
% \end{equation}

% \emph{Tuned Ensembling}: As a potentially stronger baseline, we consider outputting a weighted average of the standard and robust model's probabilities, where the weight $\alpha \in [0, 1]$ is tuned to maximize accuracy on the in-distribution set.
% \begin{equation}
% \hat{p} = \alpha \softmax(\fstd(x)) + (1 - \alpha) \softmax(\frob(x)) \big)
% \end{equation}

% \emph{Alternative combination methods}: We could also combine the logits of the models before taking the $\softmax$, or we could simply output the prediction of the more confident model. We chose to add up the probabilities since that is commonly done by prior work on ensembling~\citep{lakshminarayanan2017simple} and we found it to work slightly better than these alternatives.

% \emph{Other models}: Each of these ensembling methods takes two models and combines them. As such, we can apply each of these ensembling methods to two standard models or two robust models as well.
% \ar{I wonder if we should skip ``standard'' and ``robust'' models and just say two models $f_A$ and $f_B$ or something}
