\section{Proofs for Section~\ref{sec:analysis}}
\label{app:analysis_appendix}

We begin with some standard background on Bayes optimal classifiers.
We then prove the results in Section~\ref{sec:analysis}.
By default, expectations are taken over all random variables.

\subsection{Background on Bayes-optimal classifiers}

These results are all standard, but we include them as background information since different texts use different notations.
Let $Z \in \cZ$ denote some features (which can be complicated functions of the input $x$, for example the output of a neural network), and let $Y \in \cY$ denote the label.
Let $P$ be a distribution over $(Z, Y)$.
The Bayes-optimal classifier predicts the most likely label $y$ given features $z$.

\begin{definition}
\label{dfn:bayes-opt-appendix}
The Bayes-optimal classifier for $P$ given features $z$ is given by:
\begin{equation}
	y_*(z) = \argmax_{y \in \cY} P(y \mid z).
\end{equation}
\end{definition}

The Bayes-optimal classifier has the minimum misclassification error of all possible classifiers that use $z \in \cZ$ to predict $y \in \cY$.
Formally, the error of a classifier $\hat{y}$ is the probability that it gets the label incorrect.

\begin{definition}
\label{dfn:error-appendix}
The error of a predictor $\hat{y} : \cZ \to \cY$ on distribution $P$ is given by:
\begin{equation}
	\Err_P(\hat{y}) = P(Y \neq \hat{y}(Z)).
\end{equation}
\end{definition}

Alternatively, we can look at the error for each $Z$, and then take the average over $Z$, which gives us:

\begin{lemma}
\label{lem:alt_error_appendix}
The error of a predictor $\hat{y} : \cZ \to \cY$ on distribution $P$ can be written as:
\begin{equation}
	\Err_P(\hat{y}) = \E[1 - P(Y=\hat{y}(Z) \mid Z)].
\end{equation}
\end{lemma}

\begin{proof}
	We can write the misclassification probability as an expectation over an indicator and then apply the law of total expectation.
	\begin{align}
		P(Y \neq \hat{y}(Z)) &= \E[\mathbb{I}(Y \neq \hat{y}(Z))] \\
		&= \E[\E[\mathbb{I}(Y \neq \hat{y}(Z)) \mid Z]].
	\end{align}
	And then just write the inner expectation as a probability.
	\begin{align}
		\E[\E[\mathbb{I}(Y \neq \hat{y}(Z)) \mid Z]] &= \E[P(Y \neq \hat{y}(Z) \mid Z)] \\
		&= \E[1 - P(Y = \hat{y}(Z) \mid Z)].
	\end{align}
\end{proof}

The Bayes-optimal classifier selects the $y$ with the highest probability given $z$, so we have:

\begin{lemma}
\label{lem:bayes-opt-err-appendix}
The error of the Bayes-optimal classifier $y_*$ on a distribution $P$ can be written as (where $Z \sim P$):
\begin{equation}
	\Err_P(y_*) = \E[1 - \max_{y \in \cY} P(Y=y \mid Z)].
\end{equation}
\end{lemma}

\begin{proof}
	The proof is immediate by substituting the definition of the Bayes-optimal classifier (Definition~\ref{dfn:bayes-opt-appendix}) into the alternative formula for the error in Lemma~\ref{lem:alt_error_appendix}.
\end{proof}

From the above, it is clear that the Bayes-optimal classifier has lower error than any other classifier that uses only $z$, formalized below.

\begin{lemma}
\label{lem:bayes-opt-is-optimal-appendix}
The Bayes-optimal classifier (for $P$) has error no higher than that of any classifier $\hat{y} : \cZ \to \cY$:
\begin{equation}
	\Err_P(y_*) \leq \Err_P(\hat{y}).
\end{equation}
\end{lemma}

\begin{proof}
Beginning from Lemma~\ref{lem:alt_error_appendix}, we have:
\begin{align}
	\Err_P(\hat{y}) &= \E[1 - P(Y=\hat{y}(Z) \mid Z)] \\
	&\geq \E[1 - \max_{y \in \cY} P(Y=y \mid Z)] \\
	&= \Err_P(y_*).
\end{align}
\end{proof}

As a simple corollary, we note that the accuracy of the Bayes-optimal classifier is at least the frequency of the most common label. 

\begin{corollary}
\label{cor:bayes-opt-better-trivial}
If $y_*$ is Bayes-optimal for $P$, then
	\begin{equation}
		\Err_P(y_*) \leq 1 - \max_{y \in \cY} P(Y=y).
	\end{equation}
\end{corollary}

So for example if $P$ is balanced, then the Bayes-opt classifier will have accuracy at least $1/K$, where $K$ is the number of classes.

Note that calibrated classifiers are Bayes-optimal given their outputs.
Formally, let $P$ be a distribution over $(x, y)$, and suppose $f$ is calibrated with respect to $P$.
Let $z = f(x)$ and let $P'$ be the induced distribution over $(z, y)$.
Then $f$ is Bayes-optimal for $P'$ given features $z$.
The label distributions $P'(y)$ and $P(y)$ are the same, so Lemma~\ref{lem:bayes-opt-is-optimal-appendix} and Corollary~\ref{cor:bayes-opt-better-trivial} apply to any calibrated classifier.



\subsection{Proof of Proposition~\ref{prop:calibration-ensemble-optimal}}

% First we show two basic properties about $\softmax$.

% \begin{lemma}
% 	Let $w \in \R^K$. There exists some constant $c \in \R$ so that,
% 	\begin{equation}
% 		\log{\softmax(w)_y} = w_y + c,\quad \mbox{for all }y \in [K].
% 	\end{equation}
% \end{lemma}

% \begin{lemma}
	
% \end{lemma}

% Recall that $\softmax : \R^K \to \R^K$ is defined by, for $y \in [K]$:
% \begin{equation}
% 	\softmax(w)_y = \frac{\exp{w_y}}{\sum_{i \in [K]} \exp{w_i}}
% \end{equation}
% So $\softmax$ is basically a normalized exponential.
% As such it shares similar properties to exponentials, for example the softmax of a sum is the same as the product of softmax, up to a normalizing constant to ensure the entries sum to $1$.
% \begin{lemma}
% \label{lem:softmax-add}
% Let $u, v \in \R^K$. Then for some constant $c \in \R$,
% 	\begin{equation}
% 		\softmax(u + v) = c \softmax(u)\softmax(v).
% 	\end{equation}
% Here $c$ is a normalizing constant to ensure the entries sum to $1$:
% \begin{equation}
% 	c = \frac{1}{\sum_{y \in [K]} \softmax(u)_y \softmax(v)_y }
% \end{equation}
% \end{lemma}

% Few other properties about softmax:
% Sometimes it's easy to directly work with softmax. Here, be careful about what is fixed (e.g., r, s in our proof). And then take all fixed stuff into a constant.
% Often don't even need to analyze the constant, just use the following property.
% If P(y) = c exp(r), then P(y) = softmax(r)
% Softmax has a nice multiplicative property like an exponential.
% softmax(u + v) = c softmax(u) softmax(v), where c \in \R
% Adding a constant does not change the softmax.
% $\softmax(u+c) = \softmax(u)$ if $c \in \R$, $u \in \R^K$, so entry-wise addition of $c$ and $u$.
% Logarithms and softmax are essentially invereses of each other/
% For example to recover logits from softmax, just need to take log. You'll recover it up to the above symmetry, 
% where if you add the same number to all entries then this does not change the softmax.
% Given $u \in \R^K$, \log(\softmax{u}) = u + c for some $c \in \R$
% \softmax(\log(u)) = u / sum_i u_i, so this is just a normalized u. Note that u needs to be non-negative
% If p is a probability vector, then \softmax(\log(p)) = p
% If \softmax(u) = c\softmax(v) then \softmax(u) = \softmax(v). Follows clearly by just taking log, and the above properties.

\newtheorem*{calibrationEnsembleOptimalProp}{Restatement of Proposition~\ref{prop:calibration-ensemble-optimal}}

\begin{calibrationEnsembleOptimalProp}
\calibrationEnsembleOptimalText
\end{calibrationEnsembleOptimalProp}

We first show that in the setting of the Proposition, we can write $P(y \mid \fstd(x), \frob(x))$ in terms of $\fstd(x)$ and $\frob(x)$.

\begin{lemma}
	\label{lem:bayes_prob_softmax}
	In the setting of Proposition~\ref{prop:calibration-ensemble-optimal}, let $m \in \R^K$ be the log of the marginal probabilities $P(y)$:
	\begin{equation}
		\label{eqn:general_combination_lem_bayes_prob_softmax}
		m_y = \log{P(y)}, \quad \mbox{for all }y \in [K].
	\end{equation}
	Then we have:
	\begin{equation}
		\label{eqn:bayes_prob_softmax_appendix_imbalanced}
		P(y \mid \fstd(x), \frob(x)) = \softmax(\fstd(x) + \frob(x) - m)_y, \quad \mbox{for all }y \in [K].
	\end{equation}
	In the balanced setting, where $P(y) = 1/K$ for all $y$, this simplifies to:
	\begin{equation}
		\label{eqn:bayes_prob_softmax_appendix_balanced}
		P(y \mid \fstd(x), \frob(x)) = \softmax(\fstd(x) + \frob(x))_y, \quad \mbox{for all }y \in [K].
	\end{equation}
\end{lemma}

\begin{proof}
	Fix $r = \frob(x)$ and $s = \fstd(x)$, where $r, s \in \R^K$.
	We first rewrite the probability of $y$ given the robust and standard model outputs $P(y \mid r, s)$ in terms of the probability of $y$ given each of the individual model outputs: $P(y \mid r)$ and $P(y \mid s)$.
	We do this for discrete random variables for simplicity, but the same result follows by using Bayes rule for general random variables.
	\begin{align}
		P(y \mid r, s) &= \frac{P(r, s \mid y) P(y)}{P(r, s)} && \text{[Bayes rule]} \\
		&= \frac{P(r \mid y)P(s \mid y) P(y)}{P(r, s)} && \text{[$r \perp s \mid y$]} \\
		&= \frac{\frac{P(y \mid r) P(r)}{P(y)} \frac{P(y \mid s) P(s)}{P(y)} P(y)}{P(r, s)} && \text{[Bayes rule]} \\
		&= \frac{P(y \mid r) P(y \mid s)}{P(y)} \Big[ \frac{P(r) P(s)}{P(r, s)} \Big] && \text{[Algebra]}
	\end{align}
	Since $r, s$ are fixed, we can denote the terms that do not depend on $y$ by a constant $c_1$,
	\begin{equation}
		c_1 = \frac{P(r) P(s)}{P(r, s)}.
	\end{equation}
	So then we can write:
	\begin{equation}
		\label{eqn:simplified_bayes_prob_0_appendix}
		P(y \mid r, s) = \frac{P(y \mid r) P(y \mid s)}{P(y)} c_1, \quad \mbox{for all $y \in [K]$}.
	\end{equation}
	Now, we assumed $P(Y=y \mid r) = \softmax(r)_y$ and $P(Y=y \mid s) = \softmax(s)_y$ for all $y \in [K]$. 
	For some constants $c_2, c_3 \in \R$, we can write this as: $P(Y=y \mid r) = \exp(r_y) / c_2$ and $P(Y=y \mid s) = \exp(s_y) / c_3$ for all $y \in [K]$.
	Substituting this into Equation~\ref{eqn:simplified_bayes_prob_0_appendix}, we get:
	\begin{equation}
		P(y \mid r, s) = \frac{\exp(r_y + s_y)}{P(y)} \frac{c_1}{c_2 c_3}, \quad \mbox{for all $y \in [K]$}.
	\end{equation}
	Writing $1/P(y)$ as $\exp(-\log{P(y)})$, and setting $c_4 = \frac{c_1}{c_2 c_3}$, this gives us:
	\begin{equation}
		P(y \mid r, s) = c_4 \exp(r_y + s_y - \log{P(y)}), \quad \mbox{for all $y \in [K]$}.
	\end{equation}
	Since the LHS is a probability, these must sum to $1$ and so $c_4$ must be a normalizing constant, that is, $c_4 = 1 / (\sum_{y \in [K]} \exp(r_y + s_y - \log{P(y)}))$.
	This gives us:
	 \begin{equation}
		P(y \mid r, s) = \softmax(r + s - m)_y, \quad \mbox{for all $y \in [K]$},
	\end{equation}
	which is precisely Equation~\ref{eqn:bayes_prob_softmax_appendix_imbalanced}.
	In the balanced setting, we have $P(y) = 1/K$ for all $y \in [K]$, so we simply fold $P(y)$ into the constant $c_4$, and get:
	\begin{equation}
		P(y \mid r, s) = \softmax(r + s)_y, \quad \mbox{for all $y \in [K]$},
	\end{equation}
	which is precisely Equation~\ref{eqn:bayes_prob_softmax_appendix_balanced}.
\end{proof}

Now we are ready to prove Proposition~\ref{prop:calibration-ensemble-optimal}.

\begin{proof}[Proof of Proposition~\ref{prop:calibration-ensemble-optimal}]
	We assumed the ``balanced'' setting where $P(y) = 1/K$ for all $y$.
	From Lemma~\ref{lem:bayes_prob_softmax}, letting $\fens(x) = \fstd(x) + \frob(x)$, we have:
	\begin{equation}
		\label{eqn:first_in_proof_calibration-ensemble-optimal}
		P(y \mid \fstd(x), \frob(x)) = \softmax(\fens(x))_y.
	\end{equation}
	So this means that the ensemble prediction is the Bayes optimal given $(\fstd(x), \frob(x))$:
	\begin{equation}
		\pred(\fens(x)) = \argmax_y \fens(x)_y = \argmax_y \softmax(\fens(x))_y = \argmax_y P(y \mid \fstd(x), \frob(x)).
	\end{equation}
	But then from Lemma~\ref{lem:bayes-opt-is-optimal-appendix}, any other predictor which uses only $(\frob(x), \fstd(x))$ must have error at least as high.
	This completes the proof.

	Note that the inequality in the above proof is a strict inequality except in degenerate cases: as long as $\fstd$ and $\frob$ sometimes disagree in their predictions, and in some of these cases $\fstd$ assigns a higher probability to its predictions, and in some cases $\frob$ assigns a higher probability to its prediction, the inequalities will be strict inequalities.
\end{proof}


\subsection{Proof of Proposition~\ref{prop:supp_missing_ens_works}}

\newtheorem*{suppMissingEnsWorksProp}{Restatement of Proposition~\ref{prop:supp_missing_ens_works}}

\begin{suppMissingEnsWorksProp}
\suppMissingEnsWorksText{}
\end{suppMissingEnsWorksProp}

\begin{proof}
	We first note that errors are additive. That is, letting:
	\begin{equation}
		\Err(P, f) = \E[\mathbb{I}(\pred(f(x)) \neq y)] \mbox{, where }x, y \sim P,
	\end{equation}
	we have:
	\begin{equation}
		\Err(\alpha P_{\tau} + (1 - \alpha) P_0, f) = \alpha \Err(P_{\tau}, f) + (1 - \alpha) \Err(P_0, f)
	\end{equation}
	So it suffices to prove that the ensemble is better than the standard and robust models for $P_{\tau}$ and $P_0$ separately.

	\textbf{Suppressed features.}
	Let $\overline{\frob}(x) = \tau \frob(x)$ and $\overline{\fstd}(x) = \tau \fstd(x)$ be scaled versions of the robust and standard models.
	Definition~\ref{dfn:suppressed_spurious} implies that $\overline{\frob}$ and $\overline{\fstd}$ are calibrated.
	Since we assumed $P_{\tau}$ is balanced, by Proposition~\ref{prop:calibration-ensemble-optimal}, $\overline{\fens}$ given by $\overline{\fens}(x) = \tau \frob(x) + \tau \fstd(x)$ has optimal error on $P_{\tau}$.
	But for all $x$, the predictions of $\fens$ and $\overline{\fens}$ are the same (multiplying the outputs of a model by a constant does not change the predicted output, which is the $\argmax$).
	So $\fens$ also has optimal error on $P_{\tau}$:
	\begin{equation}
		\Err(P_{\tau}, \fens) \leq \Err(P_{\tau}, \fstd) \mbox{, and } \Err(P_{\tau}, \fens) \leq \Err(P_{\tau}, \frob)
	\end{equation}
	Note that these inequalities are strict inequalities except in degenerate cases: as long as $\fstd$ and $\frob$ sometimes disagree in their predictions, and in some of these cases $\fstd$ assigns a higher probability to its predictions, and in some cases $\frob$ assigns a higher probability to its prediction, the inequalities will be strict inequalities.

	\textbf{Missing spurious.}
	If $\fstd(x) = 0$ almost surely, then $\fens(x) = \frob(x) + \fstd(x) = \frob(x)$ almost surely.
	Furthermore, if $\fstd(x) = 0$ then its error is lower bounded by $1 - \max_y P_0(y)$.
	On the other hand, $\frob(x)$ is calibrated and therefore Bayes-optimal given $z = \frob(x)$, so from Corollary~\ref{cor:bayes-opt-better-trivial} (see the discussion below the corollary for more details) it has error at most $1 - \max_y P_0(y)$.
	So we have:
	\begin{equation}
		\Err(P_0, \fens) = \Err(P_0, \frob) \leq \Err(P_0, \fstd)
	\end{equation}
	Note that the inequality is a strict inequality except in a degenerate case (where the probability that $\frob$ predicts for the most common class $\argmax_y P_0(y)$ is the same for all inputs).
\end{proof}


\subsection{Proof of Proposition~\ref{prop:anti_correlated_ens_fails}}

\newtheorem*{antiCorrelatedEnsFailsProp}{Restatement of Proposition~\ref{prop:anti_correlated_ens_fails}}

\begin{antiCorrelatedEnsFailsProp}
\antiCorrelatedEnsFailsText{}
\end{antiCorrelatedEnsFailsProp}

\begin{proof}
	Let $X, Y \sim \Pood$, and let $Z = (\fstd(X), \frob(X))$ be the predictions of the standard and robust models.
	Fix $z = (\fstd(x), \frob(x))$, and let $s = \fstd(x)$ and $r = \frob(x)$.
	We will analyze the errors for fixed $Z = z$ (showing that the robust model is better than the ensemble, which is better than the standard model).
	Since this is true for all $z$, we then use Lemma~\ref{lem:alt_error_appendix} (which is basically the law of total expectation), to get the desired result.

	\textbf{Bayes-opt classifier.}
	Recall that for some $\alpha, \beta > 0$, we have $\Padv(Y = y \mid \fstd(x)) = \softmax(-\beta \fstd(x))_y$ for all $x$ (note the minus sign), while $\Padv(Y = y \mid \frob(x)) = \softmax(\alpha \frob(x))_y$.
	Then, applying Lemma~\ref{lem:bayes_prob_softmax}, we have:
	\begin{equation}
		\Padv(y \mid (\fstd(x), \frob(x))) = \softmax(\alpha \frob(x) - \beta \fstd(x))_y.
	\end{equation}
	Rewriting this in terms of $z, r, s$, we have:
	\begin{equation}
		\label{eqn:bayes-opt-adv-spur}
		\Padv(y \mid z) = \softmax(\alpha r - \beta s)_y.
	\end{equation}

	% Latex macro note:
	% We use \jens, \jstd, \jrob to denote the indices selected by the ensemble, standard, and robust model
	\textbf{Ensemble vs. robust classifier.}
	Let $\jrob = \argmax_y r_y$ be the robust model's prediction, and $\jens = \argmax_y (r+s)_y$ be the ensemble model's prediction.
	Because $\jrob$ is the $\argmax$ of $r$, we have:
	\begin{equation}
		\label{eqn:rjrobvsjens_appendix}
		r_{\jrob} \geq r_{\jens}.
	\end{equation}
	Because $\jens$ is the $\argmax$ of $r+s$, we have:
	\begin{equation}
		\label{eqn:ensjrobvsjens_appendix}
		r_{\jens} + s_{\jens} \geq r_{\jrob} + s_{\jrob}.
	\end{equation}
	Taking the negation of this, we get:
	\begin{equation}
		\label{eqn:ensjrobvsjens_negation_appendix}
		-r_{\jrob} - s_{\jrob} \geq -r_{\jens} - s_{\jens}.
	\end{equation}
	Adding $\beta$ times Inequality~\ref{eqn:ensjrobvsjens_negation_appendix} to $(\alpha + \beta)$ times Inequality~\ref{eqn:rjrobvsjens_appendix}, we get:
	\begin{equation}
		\alpha r_{\jrob} - \beta s_{\jrob} \geq \alpha r_{\jens} - \beta s_{\jens}.
	\end{equation}
	Since $\softmax$ is monotonic, we have:
	\begin{equation}
		\softmax(\alpha r - \beta s)_{\jrob} \geq \softmax(\alpha r - \beta s)_{\jens}.
	\end{equation}
	But from Equation~\ref{eqn:bayes-opt-adv-spur} the LHS is the same as the robust model's probability of getting the label correct, and the RHS is the same as the ensemble's probability of getting the label correct:
	\begin{equation}
		\Padv(Y = \jrob \mid Z=z) \geq \Padv(Y = \jens \mid Z=z).
	\end{equation}
	Taking negations (to get the error), and then the expectation over $Z=z$, we get (note that below we write the error, which is why the sign is now flipped):
	\begin{equation}
		\Errood(\fens) \geq \Errood(\frob).
	\end{equation}
	Which is what we wanted to show.
	
	\textbf{Ensemble vs. standard classifier.}
	The argument is fairly analogous to the previous case, with some minor differences in the algebra in the first part.
	Let $\jstd = \argmax_y s_y$ be the standard model's prediction.
	Because $\jstd$ is the $\argmax$ of $s$, we have:
	\begin{equation}
		\label{eqn:sjstdvsjens_appendix}
		s_{\jstd} \geq s_{\jens}.
	\end{equation}
	Taking the negation of this, we get:
	\begin{equation}
		\label{eqn:sjstdvsjens_negation_appendix}
		-s_{\jens} \geq -s_{\jstd}.
	\end{equation}
	Because $\jens$ is the $\argmax$ of $r+s$, we have:
	\begin{equation}
		\label{eqn:ensjstdvsjens_appendix}
		r_{\jens} + s_{\jens} \geq r_{\jstd} + s_{\jstd}.
	\end{equation}
	Adding $\alpha$ times Inequality~\ref{eqn:ensjstdvsjens_appendix} with $(\alpha + \beta)$ times Inequality~\ref{eqn:sjstdvsjens_negation_appendix}, we get:
	\begin{equation}
		\alpha r_{\jens} - \beta s_{\jens} \geq \alpha r_{\jstd} - \beta s_{\jstd}.
	\end{equation}
	The rest of this step is the same as in the comparison between the ensemble and the robust model.
	Since $\softmax$ is monotonic, we have:
	\begin{equation}
		\softmax(\alpha r - \beta s)_{\jens} \geq \softmax(\alpha r - \beta s)_{\jstd}.
	\end{equation}
	But from Equation~\ref{eqn:bayes-opt-adv-spur} the LHS is the same as the ensemble's probability of getting the label correct, and the RHS is the same as the standard model's probability of getting the label correct:
	\begin{equation}
		\Padv(Y = \jens \mid Z=z) \geq \Padv(Y = \jstd \mid Z=z).
	\end{equation}
	Taking negations (to get the error), and then the expectation over $Z=z$, we get (note that below we write the error, which is why the sign is now flipped):
	\begin{equation}
		\Errood(\fstd) \geq \Errood(\fens).
	\end{equation}
	Which is what we wanted to show.

	% Combining these two inequalities (i.e., take the negative for Inequality~\ref{eqn:rjrobvsjens_appendix} to get $-r_{\jens} \geq -r_{\jrob}$ and then add that to Inequality~\ref{eqn:ensjrobvsjens_appendix}), we get:
	% \begin{equation}
	% 	\label{eqn:sjrobvsjens_appendix}
	% 	s_{\jens} \geq s_{\jrob}.
	% \end{equation}
	% Negating Inequality~\ref{eqn:sjrobvsjens_appendix} (to get $-s_{\jrob} \geq -s_{\jens}$) and adding to Inequality

	% Let's consider a more general case where $\Pood$ is balanced and satisfies the following conditions, for all $y \in [K]$:
	% \begin{align}
	% 	P(y \mid \fstd(x)) &= \softmax(\alpha \fstd(x))_y \\
	% 	P(y \mid \rob(x)) &= \softmax(\beta \rob(x))_y
	% \end{align}
	% Then from Lemma~\ref{lem:bayes_prob_softmax}, we have for all $y \in [K]$:
	% \begin{equation}
	% 	P(y \mid \fstd(x), \frob(x)) = \softmax(\fstd(x) + \frob(x))_y
	% \end{equation}

	% Let $X, Y \sim P$ and let $Z = (\fstd(X), \fstd(Y))$.
	% Fix some $z$. We define the accuracy conditioned on $z$ as:
	% % The predictions of the standard, robust, ensemble, can be written as a function of z
	% % I.e., acc(z, P, yhat) P(Y = \yhat(z) | Z=z)
	% % Can define accuracy of a model
	% % That's just acc(z, P, pred(u)
	% % Suppose standard model predicts a, robust model b, ensemble c
	% % Make it clear this is shorthand
	% % We get some inequalities
	% % We can show acc(z, P, c) >= acc(z, P, a) and
	% % acc(z, P, c) >= acc(z, P, b)
	% % Now need to link this to overall error. The point is that a, b, c are implicitly functions of z. So now can take expectation
\end{proof}

\paragraph{Dealing with class imbalance.}
Lemma~\ref{lem:bayes_prob_softmax}, Equation~\ref{eqn:bayes_prob_softmax_appendix_imbalanced} shows how to combine models in general, if the class-balanced assumption does not hold. Note the additional ``$-m$'' term. Here, $m$ is the log of the (marginal) probability of each class, as defined in Equation~\ref{eqn:general_combination_lem_bayes_prob_softmax}.

(ID Analysis) Then, the ``Proof of Proposition~\ref{prop:calibration-ensemble-optimal}'' is identical for the general case; we just need to set $\fens(x) = \fstd(x) + \frob(x) - m$ on the first line. Equation~\ref{eqn:first_in_proof_calibration-ensemble-optimal} then follows from Lemma~\ref{lem:bayes_prob_softmax}, and the rest of the proof is identical.

(OOD Analysis) The OOD results, Propositions~\ref{prop:supp_missing_ens_works} and~\ref{prop:anti_correlated_ens_fails}, follow if the class marginal distributions match up between ID and OOD, so $\Pid(Y=y) = \Pood(Y=y)$. If the distribution over classes changes substantially, then ensembles can possibly do worse than the robust model.

% \subsection{Proof of Example~\ref{ex:imbalance-ensemble-fails}}

% \newtheorem*{imbalancedEnsembleFailsExample}{Restatement of Example~\ref{ex:imbalance-ensemble-fails}}

% \begin{imbalancedEnsembleFailsExample}
% \imbalancedEnsembleFailsText{}
% \end{imbalancedEnsembleFailsExample}