\section{Results}
\label{sec:experiments}

\begin{table*}[t]
\caption{
\citet{xie2021innout} propose In-N-Out (self-training) to mitigate ID-OOD accuracy tradeoffs---their method requires lots of unlabeled data.
Even without this unlabeled data, \calens{} are competitive with or outperform self-training ID and OOD.
We show results on all datasets used by~\citet{xie2021innout}.
}
\label{tab:self_train_results}
\vskip 0.15in
\begin{center}
\begin{tabular}{ccccccc}
\toprule
                      & \multicolumn{2}{c}{Cropland}            & \multicolumn{2}{c}{Landcover} & \multicolumn{2}{c}{CelebA} \\
                      & ID Acc                  & OOD Acc                 & ID Acc                  & OOD Acc   & ID Acc                  & OOD Acc   \\
\midrule
Standard model        & 95.3 (0.0)          & \textbf{85.6 (5.8)}  & 76.9 (0.3)          & 55.7 (1.1)         &  90.4 (0.5)  &  74.5 (0.6)  \\
Robust model          & 95.1 (0.1)          & 89.8 (0.4)             & 72.7 (0.2)           & \textbf{60.4 (1.1)}      &   \textbf{94.5 (0.2)}  &  76.3 (1.2) \\
Self-training         & 95.3 (0.2)          & \textbf{90.6 (0.6)} & \textbf{77.0 (0.4)} & \textbf{61.0 (0.7)}     & 93.1 (0.2) & \textbf{78.7 (0.7)} \\
Cal ensembling & \textbf{95.6 (0.1)} & \textbf{91.3 (0.8)} & \textbf{77.2 (0.2)} & \textbf{60.8 (0.8)}     &  \textbf{94.5 (0.5)} & \textbf{77.6 (1.2)} \\
\bottomrule
\end{tabular}
\end{center}
\vskip -0.1in
\end{table*}

% In the theoretical section, we showed that ensembling can work very well (gets the best of the standard and robust models) when there are spurious correlations ID that are suppressed or missing OOD
% But if the spurious correlations are more adversarial (anti-correlated) OOD, the ensembling does not do so well.
% We run experiments on a wide array of benchmark datasets, spanning blah
% Connecting geography, style, etc, shifts, to suppressed
% We run experiments on a wide range of naturally occurring shifts (geography shifts, style shifts, subpopulation shifts)
% We also run 
% \ak{Should we recap the goal again: goal is to get strong ID accuracy of standard model, robust accuracy of OOD model}
% \ak{Maybe say self-training is on the 3 datasets used by prior work}
% Our analysis (Section~\ref{}) predicts that calibrated ensembles can get the best of both worlds for \natshifts{}, but not for \advshifts{}.
In Section~\ref{sec:experiments-can-mitigate}, we show that \calens{} get the best of both worlds across the \numnat{} \natshifts{} we consider, but not on the \numadv{} adversarially synthesized \advshifts{}, as predicted by our analysis in Section~\ref{sec:analysis}.
\Calens{} match or outperform a prior state-of-the-art approach based on self-training~\citep{xie2021innout}, which requires additional unlabeled data.
In Section~\ref{sec:experiments-how-ensemble}, we show ablations of our method.
Interestingly, we find that a common approach of tuning the ensemble weights to optimize ID accuracy can lead to poor OOD performance.
% we show that \emph{how} we ensemble the models is key: tuning the ensemble weights to optimize ID accuracy leads to poor OOD performance.
% However, calibrating (also only on \emph{in-distribution} data) leads to improved accuracies, as suggested by our analysis in Section~\ref{sec:analysis_id}.
% We sanity check that ensembling two standard or two robust models does not work as well (even with calibration).
% In Section~\ref{sec:experiments_models_miscalibrated}, we show that even after calibrating on ID data, the standard and robust models are miscalibrated OOD---but ensembles are still able to effectively combine their predictions.


\begin{table*}[t]
\caption{
\emph{In-distribution (ID)} accuracies for the standard model, robust model, and \calens{}, across \numidnat{} \natshift{} datasets (colored blue) and \numadv{} \advshift{} datasets (colored red and starred).
On the \numidnat{} ID \natshift{} datasets, \calens{} match or outperform the best model in 8/9 cases, and on average outperform both the standard and robust models.
For the remaining dataset, CIFAR-10, \calens{} close 97\% of the gap between the standard and robust model.
% As expected from our analysis (Section~\ref{sec:analysis}), \calens{} do not perform as well on \advshift{} datasets.
}
\label{tab:id_results}
\vskip 0.15in
\begin{center}

\begin{tabular}{cccccccc}
\toprule
& \color{blue}{Ent30} & \color{blue}{DomNet} & \color{blue}{CIFAR10} & \color{blue}{Liv17} & \color{blue}{Land} & \color{blue}{Crop} & \color{blue}{CelebA}\\
\midrule
Standard & \textbf{93.6 (0.2)} & 83.9 (1.0) & \textbf{97.4 (0.1)} & 96.9 (0.1) & \textbf{76.9 (0.3)} & 95.3 (0.0) & 90.4 (0.5)\\
Robust & 90.7 (0.2) & 89.2 (0.1) & 92.0 (0.0) & 97.0 (0.0) & 72.7 (0.2) & 95.1 (0.1) & \textbf{94.5 (0.2)}\\
Cal Ensemble & \textbf{93.7 (0.1)} & \textbf{91.2 (0.7)} & 97.2 (0.1) & \textbf{97.2 (0.2)} & \textbf{77.2 (0.2)} & \textbf{95.6 (0.1)} & \textbf{94.5 (0.5)}\\
\bottomrule
\end{tabular}
\vspace{1.2mm}
\newline
\begin{tabular}{ccc|ccc}
\toprule
 & \color{blue}{ImageNet} & \color{blue}{iWildCam} \;\;\; & \;\;\; \color{red}{MNLI*} & \color{red}{Waterbirds*} & \color{red}{CivilComments*}\\
\midrule
Standard & 81.7 (-) & 82.4 (-) \;\;\; & \;\;\; \textbf{82.9 (-)} & 88.3 (-) & \textbf{92.8 (-)}\\
Robust & 68.4 (-) & 81.8 (-) \;\;\; & \;\;\; 81.5 (-) & \textbf{93.2 (-)} & 86.3 (-)\\
Cal Ensemble & \textbf{82.0 (-)} & \textbf{84.0 (-)} \;\;\; & \;\;\; \textbf{82.8 (-)} & 92.9 (-) & 91.4 (-)\\
\bottomrule
\end{tabular}

\end{center}
\vskip -0.1in
\end{table*}


\begin{table*}[t]
\caption{
\emph{Out-of-distribution (OOD)} accuracies for the standard model, robust model, and \calens{}, across \numnat{} \natshift{} datasets (colored blue) and \numadv{} \advshift{} datasets (colored red and starred).
On the \numnat{} OOD \natshift{} datasets, \calens{} match or outperform the best model in 10/11 cases, and on average outperform both the standard and robust models.
For the remaining dataset, DomainNet, \calens{} close 96\% of the gap between the standard and robust model.
As expected from our analysis (Section~\ref{sec:analysis}), on \advshifts{} the accuracy of \calens{} is between the standard and robust models.
}
\label{tab:ood_results}
\vskip 0.15in
\begin{center}

\begin{tabular}{cccccccc}
\toprule
& \color{blue}{Ent30} & \color{blue}{DomNet} & \color{blue}{STL10} & \color{blue}{Liv17} & \color{blue}{Land} & \color{blue}{Crop} & \color{blue}{CelebA}\\
\midrule
Standard & 60.7 (0.1) & 55.3 (0.4) & 82.4 (0.3) & 77.7 (0.6) & 55.7 (1.1) & \textbf{85.6 (5.8)} & 74.5 (0.6)\\
Robust & 63.2 (1.1) & \textbf{87.2 (0.1)} & 85.1 (0.2) & \textbf{82.2 (0.2)} & \textbf{60.4 (1.1)} & 89.8 (0.4) & 76.3 (1.2)\\
Cal Ensemble & \textbf{64.7 (0.5)} & 86.1 (0.2) & \textbf{87.3 (0.2)} & \textbf{82.2 (0.6)} & \textbf{60.8 (0.8)} & \textbf{91.3 (0.8)} & \textbf{77.6 (1.2)}\\
\bottomrule
\end{tabular}
\vspace{1.2mm}
\newline
\begin{tabular}{ccccc|ccc}
\toprule
 & \color{blue}{ImNet-R} & \color{blue}{ImNet-V2} & \color{blue}{ImNet-Sk} & \color{blue}{iWildCam} \;\;\; & \;\;\; \color{red}{MNLI*} & \color{red}{Waterbirds*} & \color{red}{Comments*}\\
\midrule
Standard & 52.4 (-) & 71.5 (-) & 40.5 (-) & 61.1 (-) \;\;\; & \;\;\; 65.5 (-) & 60.4 (-) & 56.8 (-)\\
Robust & 77.5 (-) & 61.9 (-) & 48.2 (-) & 63.0 (-) \;\;\; & \;\;\; \textbf{77.4 (-)} & \textbf{88.1 (-)} & \textbf{84.2 (-)}\\
Cal Ensemble & \textbf{77.9 (-)} & \textbf{73.2 (-)} & \textbf{52.3 (-)} & \textbf{66.3 (-)} \;\;\; & \;\;\; 73.2 (-) & 81.1 (-) & 71.8 (-)\\
\bottomrule
\end{tabular}

\end{center}
\vskip -0.1in
\end{table*}



\subsection{Main Results}
\label{sec:experiments-can-mitigate}

\textbf{Competitive with self-training.}
\citet{xie2021innout} propose self-training on unlabeled data to mitigate ID-OOD accuracy tradeoffs.
We run experiments on all 3 datasets they consider (Landcover, Cropland, CelebA), taking checkpoints from the official CodaLab implementation of~\citet{xie2021innout}.
% \footnote{We compare to the In-N-Out results on their official CodaLab worksheet. Self-training requires additional unlabeled data which is not available in many datasets.}
% \footnote{We compare to the In-N-Out results on their official CodaLab worksheet.}
Table~\ref{tab:self_train_results} shows that \calens{} match or outperform self-training on all 3 of their datasets, both ID and OOD.
We believe this is interesting because our method is simple and does not need additional unlabeled data (which, for example, the other datasets do not have).
\pl{this is surprising; presumably we'd do better if we used both unlabeled data and calibrated ensembles?}

\ak{maybe reference the blue and red/starred styling of the tables in the text}
\textbf{Strong ID and OOD accuracy.}
Across the \numnat{} \natshifts{}, \calens{} get the best of both worlds, typically outperforming the standard and robust model both ID (Table~\ref{tab:id_results}) and OOD (Table~\ref{tab:ood_results}).
Averaged across the \emph{\natshift{}} datasets, \calens{} get \calaccidnatural\% ID (vs. \stdaccidnatural{}\% for the standard model and \robaccidnatural{}\% for the robust model) and \calaccoodnatural{}\% OOD (vs. \robaccoodnatural{}\% for the robust model and \stdaccoodnatural{}\% for the standard model).
The method works across the board---\calens{} achieve the best performance on 8/9 ID \natshifts{}, and on 10/11 OOD \natshifts{}.
For the remaining two cases, DomainNet OOD and CIFAR-10 ID, \calens{} close over 95\% of the gap between the standard and robust model.
% \pl{for ID or OOD?}\ak{mentioned it before the comma}
 % (96\% for DomainNet, 97\% for CIFAR-10).

\textbf{Shift type is important.}
Our analysis in Section~\ref{sec:analysis} predicts that \calens{} \emph{do not} work as well on \advshifts{}, where a spurious feature is correlated with the label ID but anticorrelated with it OOD.
Indeed, in these cases the OOD accuracy of \calens{} is between the standard and robust model (Table~\ref{tab:ood_results}).
Even so, averaged across all \numtotal{} datasets \calens{} do well and get \calaccid{}\% ID (vs. \stdaccid{}\% for the standard model, \robaccid{}\% for the robust model) and \calaccood{}\% OOD (vs. \stdaccood{}\% for the standard model, \robaccood{}\% for the robust model).

% In addition, unlike self-training, \calens{} handle both models symmetrically and does not require prior knowledge of which model is better in what domain.
% \ak{There is a slight gap with the theory---we predicted ensembles }

% \textbf{Strong ID and OOD accuracy}:
% Calibrating and then ensembling a standard and a robust model, gets the best of both worlds, typically outperforming the standard and robust model both ID (Table~\ref{tab:id_results}) and OOD (Table~\ref{tab:ood_results}).
% Averaged across the datasets, \calens{} get 89.3\% ID (vs 87.9\% for the standard model and 84.7\% for the robust model) and 77.9\% OOD (vs 77.2\% for the robust model and 64.9\% for the standard model).
% The method works across the board---\calens{} achieve the best performance on 5/6 ID datasets, and on 5/6 OOD datasets.
% For the remaining two datasets, DomainNet OOD and CIFAR-10 ID, \calens{} close over 95\% of the gap between the standard and robust model (96\% for DomainNet, 97\% for CIFAR-10). 



\subsection{Ablations}
\label{sec:experiments-how-ensemble}

Our proposed method is a simple combination of a calibrated robust and calibrated standard model.
We vary the components of our method and try: (i) tuned ensembles without calibration, (ii) vanilla ensembles without calibration, and (iii) ensembles of two standard or two robust models.

\textbf{Tuned ensembles do not mitigate tradeoffs.} A natural way to ensemble the two models is ``tuned ensembles'': choosing the ensemble weights to optimize accuracy on the ID validation set. This approach is also known as stacking, and has performed well on the Netflix prize and Kaggle competitions~\citep{sill2009feature}.
% ~\citet{miller2021line} report that OOD accuracy can be quite correlated with ID accuracy, which suggests that this method might do well OOD.\ak{this line might be confusing}
Interestingly, we find that tuned ensembles do not do very well OOD, getting an average accuracy of \tunedaccood{}\% across the \numtotal{} datasets (vs. \calaccood{}\% for \calens{}).
The ID accuracies are similar---results for all datasets are in Table~\ref{tab:id_tuned} (ID) and Table~\ref{tab:ood_tuned} (OOD).
% Tuned ensembles do marginally better ID getting an average accuracy of \tunedaccid{}\% (vs. \calaccid{}\% for \calens{}). Naturally, we expect the tuned ensemble to do the best ID since its weights are tailored for ID---what is surprising is that the \calens{} do so much better OOD without using any OOD data either. We show results for all datasets in Table~\ref{tab:id_tuned} (ID) and Table~\ref{tab:ood_tuned} (OOD). 

\textbf{Calibration helps.} \Calens{} (calibration is only done on ID data) outperform vanilla ensembles, getting an average ID accuracy of \calaccid{}\% (vs. \naiveaccid{}\% \pl{is this statistically significant?} for vanilla ensembles) and an average OOD accuracy of \calaccood{}\% (vs. \naiveaccood{}\% for vanilla ensembles). We show results for all datasets in Table~\ref{tab:id_tuned} (ID) and Table~\ref{tab:ood_tuned} (OOD).
% and also compare combining the models' logits vs. their probabilities.

\textbf{Outperforms standard and robust ensembles.}
As a sanity check, Appendix~\ref{sec:per-dataset-ensemble-ablations} shows that our method outperforms (i) ensembling two (calibrated) standard models, and (ii) ensembling two (calibrated) robust models.
% For these ensembles, calibration does not affect the accuracy much.

\textbf{Models are miscalibrated OOD.}
Even after ID calibration, we find that the standard and robust models are not calibrated OOD, which matches prior work~\citep{ovadia2019uncertainty}.
We estimate the expected calibration error (ECE; Equation 2 in~\citet{guo2017calibration}).
Since we calibrated on ID data, the ECE is low ID (\stdeceid\% for the standard model, \robeceid\% for the robust model; Table~\ref{tab:id_ece}).
However, the ECE is high OOD (\stdeceood\% for the standard model, \robeceood\% for the robust model; Table~\ref{tab:ood_ece}).
% ---this agrees with prior work~\citep{ovadia2019uncertainty} shows that models calibrated ID are still not calibrated OOD.
Appendix~\ref{sec:per-dataset-calibration-appendix} shows that even the relative confidence of the models can be wrong: the standard model can be \emph{more confident} but \emph{less accurate} OOD, after ID-calibration.
Nonetheless, \calens{} get the best of both worlds---see Section~\ref{sec:analysis} for some simple intuitions for why this can happen.


% \subsection{Models are miscalibrated OOD}
% \label{sec:experiments_models_miscalibrated}
% \ak{Maybe add something about how prior work says the uncertainties are unreliable OOD (which is a bit more general than just calibration errors)}







% \subsection{Calibration and relative calibration}

% In this section we examine the calibration and relative calibration of our models, which provides a partial intuitive explanation for the success of \calens{}.
% Since we calibrated the standard and robust models on in-distribution (ID) data, we expect them to be calibrated even on held out ID test data---this follows from standard statistical guarantees for calibration.
% Table~\ref{tab:id_ece} confirms this intuition---the ECE of the standard and robust model are very low ID, on average the ID ECE is 1.2\% for the standard model and 2.0\% for the robust model.
% However, prior work shows that the calibration of models degrades substantially with distribution shift---indeed, Table~\ref{tab:ood_ece} shows that the OOD ECE of the standard (average: 11.0\%) and robust (average: 7.3\%) models are much higher.

% \begin{table*}[t]
% \caption{
% \emph{OOD} ECE: The expected calibration error (ECE) of the standard and robust models on OOD test data, after post-calibration in ID validation data.
% The calibration errors here are high, especially compared to the ID calibration errors in Table~\ref{tab:id_ece}.
% }
% \label{tab:ood_ece}
% \vskip 0.15in
% \begin{center}
% \begin{tabular}{ccccccc}
% \toprule
%  & Ent30 & DomNet & STL & Land & Crop & ImNet-R\\
% \midrule
% Calibrated Standard & 15.4 (0.8) & 13.6 (1.5) & 5.6 (1.1) & 16.4 (0.8) & 7.4 (4.8) & 7.8 (-)\\
% Calibrated Robust & 14.3 (1.5) & 5.5 (0.5) & 8.2 (0.0) & 6.5 (1.1) & 5.0 (0.3) & 4.0 (-)\\
% \bottomrule
% \end{tabular}
% \end{center}
% \vskip -0.1in
% \end{table*}

% \begin{table*}[t]
% \caption{
% \emph{ID} ECE: The expected calibration error (ECE) of the standard and robust models on ID test data, after post-calibration in ID validation data.
% The calibration errors are fairly low---note that we only use 500 examples to temperature scale, so for ImageNet we have fewer examples than classes for post-calibration, but the models are still fairly well calibrated.
% }
% \label{tab:id_ece}
% \vskip 0.15in
% \begin{center}
% \begin{tabular}{ccccccc}
% \toprule
%  & Ent30 & DomNet & CIFAR10 & Land & Crop & ImNet\\
% \midrule
% Cal Standard & 0.7 (0.1) & 2.0 (0.3) & 0.8 (0.2) & 1.1 (0.5) & 1.4 (0.3) & 1.0 (-)\\
% Cal Robust & 1.1 (0.4) & 2.2 (0.2) & 1.3 (0.2) & 1.7 (0.3) & 3.5 (0.2) & 2.3 (-)\\
% \bottomrule
% \end{tabular}
% \end{center}
% \vskip -0.1in
% \end{table*}


% Why do \calens{} mitigate the ID-OOD tradeoff even though neural networks are not calibrated OOD?
% We show that \emph{relative calibration} provides a partial answer towards this.
% The final prediction is typically made by the model that is more confident.
% So if the robust model is more confident than the standard model OOD then the ensemble inherits the high OOD accuracy of the robust model---even if the absolute ECE of the standard and robust models are both bad.
% % Since we are combining the predictive probabilities of the standard and robust models, more important than the absolute calibration of the models is \emph{relative calibration}

% To get a handle at relative calibration we measure 1. the \emph{accuracy gap} between the standard and robust model, which is the robust model's accuracy minus the standard model's accuracy, and 2. the \emph{confidence gap} between the two models: the robust model's average confidence minus the standard model's average confidence.
% Ideally, the two should align with each other and at least have the same sign: if the robust model is more accurate then it should also be more confident.
% In Table~\ref{tab:relative_cal} we show that this provides a reasonable initial explanation for the success of calibrated ensembles.
% The confidence gap and accuracy gap tend to have the same sign, and calibration aligns the two better.
% However, calibrated ensembles still work on LandCover---even though the relative calibration average across the entire dataset is not correct (the standard model is more confident but less accurate OOD) at the granularity of individual points ensembling is able to get the benefits of both the standard and robust models.

% \begin{table*}[t]
% \caption{
% We show the accuracy gap (difference between the accuracy of the standard and robust model) and the confidence gap (difference in confidence between the two) before and after calibration.
% The accuracy gap and confidence gap typically have the same sign, and this improves after calibrating ID.
% }
% \label{tab:relative_cal}
% \vskip 0.15in
% \begin{center}
% \begin{tabular}{ccccccc}
% \toprule
%                          & Ent30                        & DomNet                       & STL                         & Land                          & Crop                        & ImNet-R                      \\
% \midrule
% Acc Gap                  & \cellcolor[HTML]{CFE2F3}2.5  & \cellcolor[HTML]{A4C2F4}31.9 & \cellcolor[HTML]{CFE2F3}2.7 & \cellcolor[HTML]{CFE2F3}4.7   & \cellcolor[HTML]{CFE2F3}4.2 & \cellcolor[HTML]{A4C2F4}27.9 \\
% Conf Gap                 & \cellcolor[HTML]{F4CCCC}-2.3 & \cellcolor[HTML]{CFE2F3}3.6  & \cellcolor[HTML]{CFE2F3}1.1 & \cellcolor[HTML]{EA9999}-13.5 & \cellcolor[HTML]{CFE2F3}4.3 & \cellcolor[HTML]{C9DAF8}18.6 \\
% (+ Cal) Conf Gap & \cellcolor[HTML]{CFE2F3}1.4  & \cellcolor[HTML]{A4C2F4}23.7 & \cellcolor[HTML]{CFE2F3}5.5 & \cellcolor[HTML]{F4CCCC}-6    & \cellcolor[HTML]{CFE2F3}1.3 & \cellcolor[HTML]{C9DAF8}16.6 \\
% \bottomrule
% \end{tabular}
% \end{center}
% \vskip -0.1in
% \end{table*}

% Why might calibrating models ID improve the relative calibration OOD?
% Standard deep learning models tend to be overparameterized, and can often get near 0 loss on the training data.
% As such they tend to be highly overconfident on ID and OOD data.
% Robustness interventions typically involve additional constraints (projecting out spurious input features, lightweight fine-tuning, extensive data augmentation) and so robust models generally tend to be less overconfident.
% Calibrating both models, even just ID, can make their confidences more comparable---even though both models degrade OOD, the hope is that the relative calibration continues to track their accuracy.

% We note that all this is intuition (theory about OOD calibration and ensembling is scarce and highly challenging)---we hope that future work formalizes these arguments.
% We think relative calibration might be a promising future direction---even if models are not calibrated OOD, their relative calibration could be much better.

% \begin{table*}[t]
% \caption{
% }
% \label{tab:}
% \vskip 0.15in
% \begin{center}

% \end{center}
% \vskip -0.1in
% \end{table*}
