\documentclass{midl}






% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

% Note for Chris: I've set the jmlrvolume to our submission number - this doesn't seem to be super consistent across submissions (see https://2026.midl.io/camera-ready)
\jmlrvolume{-- 27}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{xspace}         % Terms
\usepackage{amsmath}        % Maths
\usepackage{graphicx}       % graphics
% \usepackage{subcaption}  % For subfigures
\usepackage{multirow}    % Fancy tables
% \usepackage{amsthm}
\usepackage{amssymb}
\usepackage{pifont}
\newcommand{\xmark}{\ding{55}}
\newcommand{\cmark}{\ding{51}}


% Math stuff
\makeatletter
\newtheorem{assumption}{Assumption}
% \newtheorem{theorem}{Theorem}
\@ifundefined{lemma}{\newtheorem{lemma}{Lemma}}{}
% \newtheorem{proposition}{Proposition}
% \newtheorem{corollary}{Corollary}
\newcommand{\FairAUC}{\mathrm{FairAUC}}
\newcommand{\FairEnsemble}{\textsc{OxEnsemble}\xspace}
\newcommand{\qed}{$\hfill\blacksquare$}
\makeatother


\title{OxEnsemble: Fair Ensembles for Low-Data Classification}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.




\midlauthor{
\Name{Jonathan Rystrøm\orcid{0000-0003-1030-5839}\nametag{$^{1}$}} \\
\addr $^{1}$ Oxford Internet Institute, University of Oxford, Oxford, UK \AND
\Name{Zihao Fu\nametag{$^{2}$}} \\
\addr $^{2}$ The Chinese University of Hong Kong, Hong Kong, China \AND
\Name{Chris Russell\orcid{0000-0003-1665-1759}\nametag{$^{1}$}\Email{firstname.lastname@oii.ox.ac.uk}}\\
}




\begin{document}


\maketitle

\begin{abstract}
We address the problem of fair classification in settings where data is scarce and unbalanced across demographic groups. Such low-data regimes are common in domains like medical imaging, where false negatives can have fatal consequences. 

We propose a novel approach \emph{OxEnsemble} for efficiently training ensembles and enforcing fairness in these low-data regimes. Unlike other approaches, we aggregate predictions across ensemble members, each trained to satisfy fairness constraints. By construction, \emph{OxEnsemble} is both data-efficient -- carefully reusing held-out data to enforce fairness reliably -- and compute-efficient, requiring little more compute than used to fine-tune or evaluate an existing model. We validate this approach with new theoretical guarantees.
Experimentally, our approach yields more consistent outcomes and stronger fairness-accuracy trade-offs than existing methods across multiple challenging medical imaging classification datasets.
\end{abstract}


\section{Introduction}



Deep learning performs exceptionally well when trained on large-scale datasets \citep{dengImageNetLargescaleHierarchical2009,gaoPile800gbDataset2020,hendrycksMeasuringMassiveMultitask2020}, but its performance deteriorates in small-data regimes. This is especially problematic for marginalised groups, where labelled examples are both scarce and demographically imbalanced \citep{dignazioDataFeminism2023,larrazabalGenderImbalanceMedical2020}. In medical imaging, underrepresentation of minority groups leads to poor generalisation and higher uncertainty \citep{riccilaraUnravelingCalibrationBiases2023,mehtaEvaluatingFairnessDeep2024,jimenez-sanchezPictureMedicalImaging2025}.
As a result, the very groups most at risk of discrimination are also those for which deep learning methods work least well.  




Existing fairness methods often fail in low-data settings \citep{pifferTacklingSmallData2024}. As data on disadvantaged groups is needed to learn effective representations \emph{and} to estimate group-specific bias, most methods underperform empirical risk minimisation \citep{zongMEDFAIRBenchmarkingFairness2022}.

Ensembles offer a natural way to address these challenges. By aggregating predictions across members, ensembles make more efficient use of scarce examples while leveraging disagreement between members for robustness \citep{theisenWhenAreEnsembles2023}. This makes ensembles particularly attractive for fairness in low-data regimes, but without theoretical foundations, improvements remain inconsistent \citep{koFAIRensembleWhenFairness2023,schweighoferDisparateBenefitsDeep2025}.

We address this by introducing \FairEnsemble: ensembles explicitly designed to enforce fairness constraints at the member level and provably preserve them at the ensemble level. Our theoretical results show when minimum rate and error-parity constraints are guaranteed to hold, and how much validation data is required to observe these guarantees in practice. Empirically, we demonstrate that \FairEnsemble outperforms strong baselines in medical imaging---where fairness is urgently needed but data for disadvantaged groups is limited.




We make three contributions:
\begin{enumerate}
    \item \textbf{Method:} We introduce an efficient ensemble framework of fair classifiers (\FairEnsemble) tailored to fairness in small image datasets.
    \item \textbf{Theory:} We prove that our fair ensembles are guaranteed to preserve fairness under both error-parity and minimum rate constraints, and we derive how much data is required to observe minimum rate guarantees in practice.
    \item \textbf{Results:} Across three medical imaging datasets, our method consistently outperforms existing baselines on fairness–accuracy trade-offs. 
\end{enumerate}

% TODO: Actually make sure this works.
The article is organised as follows: \secref{sec:related} presents related work in low-data fairness and fairness in ensembles. \secref{sec:method} describes both how we construct and train the ensemble (\secref{sec:ensemble}) and the formal guarantees for when it works (\secref{sec:theory}). Finally, \secref{sec:analysis-experiments} and \secref{sec:results} provide empirical support for the benefits of fair ensembles versus strong baselines on challenging datasets.


\begin{figure}[!tbp]
  \centering
  \setlength{\tabcolsep}{3pt} % tighten table cell padding

  % (a) Narrow table
  \subfigure[Comparison with related work.\label{fig:related-table}]{%
    \begin{minipage}{0.47\textwidth}
      \centering
      \scriptsize
      \renewcommand{\arraystretch}{1.02}
      \begin{tabular}{@{}p{1.9cm}ccccc@{}}
        \toprule
        \textbf{Paper} & \textbf{Deep} & \textbf{Img.} & \textbf{Interv.} & \textbf{Min.~Rates} & \textbf{Low-data} \\
        \midrule
        \citet{grgic-hlacaFairnessDiversityRandomness2017}      & \xmark & \xmark & \xmark & \xmark & \xmark \\
        \citet{bhaskaruniImprovingPredictionFairness2019} & \xmark & \xmark & \cmark & \xmark & \xmark \\
        \citet{goharUnderstandingFairnessIts2023}   & \xmark & \xmark & \xmark & \xmark & \xmark \\
        \citet{koFAIRensembleWhenFairness2023}           & \cmark & \cmark & \xmark & \xmark & \xmark \\
        \citet{claucichFairnessDeepEnsembles2025}      & \cmark & \cmark & \cmark & \xmark & \xmark \\
        \citet{schweighoferDisparateBenefitsDeep2025}         & \cmark & \cmark & \cmark & \xmark & \xmark \\
        \midrule
        \textbf{OxEnsemble}           & \cmark & \cmark & \cmark & \cmark & \cmark \\
        \bottomrule
      \end{tabular}
    \end{minipage}
  }\hfill
  % (b) Figure
  \subfigure[\FairEnsemble pipeline\label{fig:ensemble-illustration}]{%
    \begin{minipage}{0.43\textwidth}
      \centering
      \includegraphics[width=\linewidth]{figs/fairensemble-explainer.pdf}
    \end{minipage}
  }

  % Full main caption (preserve original pipeline caption)
  \caption{\textbf{(a) Comparisons.} We compare existing works on whether they study deep ensembles; have been applied to images; propose fairness interventions; enforce minimum rates, and target low-data regimes.
  \textbf{(b) \FairEnsemble pipeline.} \emph{Train (1):} Members share backbone and task + protected attributes. \emph{Validate (2):} Enforce fairness constraint while maximising accuracy. \emph{Predict (3):} Majority vote. Partitioning ensures full coverage; shared backbone improves efficiency, and voting provides guarantees.}
  \label{fig:table_and_figure}
\end{figure}


\section{Related Work} \label{sec:related}


\paragraph{Fairness Challenges in Low-Data Domains:}
Deep learning methods achieve near-human performance on overall metrics \citep{liuDeepLearningSystem2020}, yet consistently underperform for marginalised groups in medical imaging \citep{xuAddressingFairnessIssues2024,daneshjouDisparitiesDermatologyAI2022,seyyed-kalantariUnderdiagnosisBiasArtificial2021}. A central source of bias is unbalanced datasets \citep{larrazabalGenderImbalanceMedical2020}, where disadvantaged groups lack examples to learn reliable representations, leading to poor calibration and uncertain predictions \citep{riccilaraUnravelingCalibrationBiases2023,mehtaEvaluatingFairnessDeep2024,christodoulouConfidenceIntervalsUncovered2024}.  

Defining fairness is equally challenging. Standard parity-based metrics such as equal opportunity \citep{hardtEqualityOpportunitySupervised2016} can be satisfied trivially by constant classifiers in imbalanced datasets and often reduce performance for all groups, a phenomenon of ``levelling down'' with serious real-world consequences \citep{zhangImprovingFairnessChest2022,zietlowLevelingComputerVision2022,mittelstadtUnfairnessFairMachine2024}. In safety-critical domains such as medicine, \emph{minimum rate constraints}—which enforce a performance floor across groups—are often more appropriate to ensure that classifiers serve all subpopulations \citep{mittelstadtUnfairnessFairMachine2024}. For further works, see Appendix \ref{app:related}.
\paragraph{Fairness in Ensembles:}
Prior work has observed that ensembles sometimes improve fairness by boosting performance on disadvantaged groups \citep{koFAIRensembleWhenFairness2023,schweighoferDisparateBenefitsDeep2025,claucichFairnessDeepEnsembles2025,grgic-hlacaFairnessDiversityRandomness2017}. However, these studies are observational: improvements are not guaranteed, and in some cases ensembles can even worsen disparities \citep{schweighoferDisparateBenefitsDeep2025}. Our approach is interventionist. Building on theoretical results for ensemble competence \citep{theisenWhenAreEnsembles2023}, we extend their proofs to fairness settings. This allows us to show formally \emph{why and when} ensembles improve fairness, unlike prior works which only demonstrated that they sometimes do. See Figure~\ref{fig:related-table} for a complete comparison with related works.

\citet{schweighoferDisparateBenefitsDeep2025} proposed per-group thresholding \citep{hardtEqualityOpportunitySupervised2016} to enforce equal opportunity on an ensemble's output. This may not work for imaging tasks as it requires explicit group labels that are not part of images. It is also inappropriate for low-data regimes as it requires a large held-out test set to reliably correct for unfairness.


\section{Methods} \label{sec:method}
%\paragraph{The fundamental trade-off:}
At its heart, this paper looks to circumvent a fundamental trade-off:

\emph{Held-out data must be used to reliably measure and remove bias \citep{zietlowLevelingComputerVision2022}, but holding back data reduces performance of the base classifier -- particularly on data-scarce minority groups.}

 We circumvent this trade-off through ensemble-based data reuse. Each member of the ensemble has its fairness enforced using held-out data. However, as this data changes from ensemble member to member, the ensemble as a whole has better generalisation than a single member. Novel theoretical results show that we can expect fairness at the member level to transfer to behaviour of the ensemble as a whole (see \secref{sec:theory}).
 
\paragraph{Choice of fairness constraints:}
We focus on two fairness constraints: \emph{equal opportunity} \citep[$EO_p$, the maximum difference in recall across groups;][]{hardtEqualityOpportunitySupervised2016} and \emph{minimum recall} \citep[the lowest recall of any group;][]{mittelstadtUnfairnessFairMachine2024}. Both target false negatives, which is appropriate when missing a positive case (e.g., a deadly disease) is far more costly than overdiagnosis---a scenario that is especially relevant in medical imaging \citep{seyyed-kalantariUnderdiagnosisBiasArtificial2021}. Of the two measures, we believe \emph{minimum recall rates} to be more clinically relevant, while \emph{equal opportunity} is more common in the field. While we highlight these two constraints, our approach can be applied to any other fairness metric supported by OxonFair \citep{delaneyOxonFairFlexibleToolkit2024}.


\subsection{Ensemble Construction and Training} \label{sec:ensemble}
We consider an ensemble of deep neural networks (DNNs) sharing a pretrained convolutional backbone (\Figref{fig:ensemble-illustration}). Each ensemble member is trained on a separate fold, stratified by both target label and group membership \citep{trStratifiedKfoldsCrossvalidation2023}. Training each member on different folds allows us to fully utilise the dataset, unlike standard fairness methods that require held-out validation data \citep{hardtEqualityOpportunitySupervised2016,delaneyOxonFairFlexibleToolkit2024}. Predictions are aggregated by majority voting, which enforces the guarantees of \citet{theisenWhenAreEnsembles2023} (see \secref{sec:theory}).


\paragraph{Enforcing the fairness of ensemble members:}
Each ensemble member is trained as a multi-headed classifier following OxonFair \citep{delaneyOxonFairFlexibleToolkit2024}. These heads predict both the task label (e.g., disease vs. no disease) and the protected attribute (i.e., group membership; see \Figref{fig:ensemble-illustration}, left). The task prediction head is trained with standard cross-entropy loss, while the group heads predict a one-hot encoding of the protected attribute using a squared loss. 

The two heads are combined using OxonFair's multi-head surgery. This procedure takes a weighted average of the heads and a constant classifier, with weights selected on a validation set to enforce fairness constraints (e.g., the difference in recall between groups is less than 2\% or the minimum recall over any group is more than 70\%) while maximising accuracy. This averaging process can be performed in place, resulting in a single fair classifier with the same architecture as the single-headed model that predicts the original task label \citep[see][Section~4.2 for details]{delaneyOxonFairFlexibleToolkit2024}.


This formulation allows any group fairness definition that can be expressed as a function of per-group confusion matrices to be optimized. Because weights are selected using held-out data, we can enforce error-based criteria---such as equal opportunity or minimum recall---even when the base model overfits during training. In practice, we enforce fairness per member using the held-out data of their corresponding fold, and we optimize over accuracy together with an experiment-specific fairness constraint: either minimum recall or equal opportunity. 

%For robustness, we use a \emph{multi-split strategy}: all non-test data is divided into a different train/validation partition per member, and fairness constraints are enforced separately on each. In practice, we optimize over accuracy together with an experiment-specific fairness constraint: either minimum recall or equal opportunity.




\paragraph{Efficient ensembling of deep networks:}
The main computational bottleneck in deep CNNs is the backbone. To avoid repeatedly running the same backbone for ensemble members, we concatenate all classifier heads on a shared backbone. During training, the loss is masked so only the relevant head is updated for each data point. When the backbone is pretrained and frozen,\footnote{Freezing the backbone helps prevent overfitting on small datasets.} this is equivalent to training each member independently while requiring only a single backbone pass. A related idea with backbone fine-tuning is described by \citet{chenGroupEnsembleLearning2020}. We use EfficientNetV2 \citep{tanEfficientNetV2SmallerModels2021} pretrained on ImageNet \citep{dengImageNetLargescaleHierarchical2009} as the backbone in all experiments. We show alternative, but qualitatively similar, results with MobileNetV3 \citep{howardSearchingMobileNetV32019} in Appendix~\ref{app:mobilenetv3}.

%The main computational bottleneck in deep CNNs is the backbone. To avoid repeatedly running the same backbone for different ensemble members, we concatenate all classifier heads on top of a shared backbone. During training, the loss is masked so that only the relevant head is updated for each data point. When the backbone is pretrained and frozen,\footnote{Freezing the backbone helps prevent overfitting on small datasets.} this procedure is effectively equivalent to training each ensemble member independently, while requiring only a single backbone pass. A related idea with backbone fine-tuning is described by \citet{chenGroupEnsembleLearning2020}. We use EfficientNetV2 \citep{tanEfficientNetV2SmallerModels2021} pretrained on ImageNet \citep{dengImageNetLargescaleHierarchical2009} as the backbone in all experiments.

This yields substantial efficiency gains. Inference speed is essentially identical to a single empirical risk minimisation (ERM) model, while training is somewhat slower due to multiple heads, but still much faster than training all members separately (which would be about $M\times$ slower for an $M$-member ensemble). Appendix~\ref{app:efficiency} provides empirical comparisons for the efficiency gains (see Tables~\ref{tab:single_image_latency} and~\ref{tab:runtime_comparison}), and Appendix~\ref{app:implementation} gives implementation details. To ensure robustness, each experiment is repeated over three train/test splits.

\subsection{Formal Guarantees for Fairness} \label{sec:theory}
We now ask: under what conditions can ensembles be expected to \emph{guarantee} fairness improvements? As mentioned in \secref{sec:related}, most prior work on fairness in ensembles is observational, showing that ensembles sometimes improve fairness \citep[e.g.,][]{claucichFairnessDeepEnsembles2025,koFAIRensembleWhenFairness2023}, while \citet{schweighoferDisparateBenefitsDeep2025} showed that fairness could be enforced on the output of an ensemble using standard postprocessing. In contrast, we take an interventionist approach and ask, \emph{after enforcing fairness per ensemble member, can we expect it to transfer to the ensemble as a whole?} We provide theoretical conditions under which fairness is improved, and show how they can be used in practice.


%Specifically, we address two questions: 
%\begin{enumerate}
%    \item \textbf{Minimum rate constraints:} How high must minimum rate constraints be set to ensure that ensembles preserve fairness?
%    \item \textbf{Sample sizes:} How large must group sizes in the validation and evaluation sets be to observe these guarantees empirically? 
%\end{enumerate}

The theory is based on \citet{theisenWhenAreEnsembles2023}, who show that \textit{competent} ensembles never hurt accuracy. Informally, an ensemble is competent over a distribution $D$ if it is more likely to be confidently right than confidently wrong. Let the error rate of an ensemble $\rho$ be:\footnote{For definitions of all notation used see Table \ref{tab:notation}.} 
%
\[
W_\rho = W_\rho(X,y) = \E_{h\sim \rho} [1(h(X) \neq y )]
\]
and define
\[
 C_\rho(t) = P_{(X,y)\sim D}(W_\rho \in [t, 1/2)) - P_{(X,y)\sim D}(W_\rho  \in [1/2, 1 - t]) \,\,\,\forall t\in [0,1/2).
\]
%
The ensemble is \emph{competent} if  $C_\rho(t) \geq 0$ for all $t\in[0,1/2)$. This definition makes no distributional assumptions and can be verified on held-out data. 

\citet{theisenWhenAreEnsembles2023} showed that if competence holds on a dataset $(X,y)$, then majority voting improves accuracy relative to a single classifier, with the improvement bounded by the disagreement between members. 


To extend competence to fairness metrics, we evaluate competence on \emph{restricted subsets of the data}. Let $\mathcal{G}$ be the set of protected groups. For any group $g\in\mathcal{G}$, write $g{+}$ for the positives $(y=1, A=g)$ belonging to a group. We similarly write $D^+$ for the set of all positives in the distribution.% and $D-$ for all negatives.
We define
\begin{equation}
\label{restricted_compitence}
C_{\rho}^{g+}(t) = P_{(X,y)\sim g+}(W_\rho \in [t, 1/2)) - P_{(X,y)\sim g+}(W_\rho  \in [1/2, 1 - t]).
\end{equation}
We say an ensemble is \emph{restricted groupwise competent} if $C_{\rho}^{g+}(t)\geq 0$ for all $t\in[0,1/2)$ and all $g\in\mathcal{G}$, and say it is \emph{restricted competent} if $C_{\rho}^{D^+}(t)\geq 0$ for all $t\in[0,1/2)$.


Based on this, we derive three main results:
\begin{enumerate}
    \item \textbf{Minimum rate constraints:} If an ensemble is restricted groupwise competent, and every member of the ensemble satisfies a minimum rate constraint, then the ensemble as a whole also satisfies that minimum rate. 
    \item \textbf{Error parity:} If an ensemble is restricted groupwise competent, and if every member of the ensemble approximately satisfies an error parity measure (e.g., equal opportunity), then the ensemble as a whole also approximately satisfies it. The achievable bounds depend on disagreement- and error rates of the members.
    \item \textbf{Restricted Groupwise Competence} can be enforced by appropriate minimum recall constraints. 
\end{enumerate}

Together these results show how ensemble competence on restricted subsets provides guarantees for both minimum rate constraints and error parity measures, covering a broad range of fairness definitions. Moreover, the third result shows that the conditions required for the first two results to hold are exactly those enforced by setting minimum recall rates.

We begin with a lemma.
\begin{lemma}
\label{lemma}
Restricted competent ensembles do not degrade recall relative to the average recall of a member.
\end{lemma}
\begin{proof}
The result follows immediately by applying the main result of \citet{theisenWhenAreEnsembles2023} to $D^+$ rather than $D$, and observing that accuracy when restricted to the positives is equivalent to recall.\footnote{A similar argument can be made using the negatives and \emph{specificity}.}

This main result bounds the \emph{Error Improvement Rate (EIR)}---the ensemble's relative improvement over a single classifier---by the \textit{Disagreement Error Ratio (DER)}. See Appendix \ref{app:formalism} for formal definitions. For binary classification, the bounds are given by Eq. \ref{eq:original-theorem} for an arbitrary data distribution, $\mathcal{D}$:
%
\begin{equation} \label{eq:original-theorem}
\text{DER}_\mathcal{D} \geq \text{EIR}_\mathcal{D} \geq \max(\text{DER}_\mathcal{D} - 1, 0)
\end{equation}
%
Replacing $D$ with $D^+$ implies the error improvement rate on the positives must be non-negative for a \emph{restricted competent} ensemble as required.
\end{proof}

\subsubsection{Restricted Groupwise Competence Guarantees}

\paragraph{1. Minimum rates for competent ensembles:}
We apply Lemma~\ref{lemma} to each group independently. If the ensemble is \emph{restricted groupwise competent}, the recall rate for each group cannot be degraded by ensembling. Therefore, the minimum recall rate over all groups cannot be degraded either. \qed

\paragraph{2. Error parity from competence:}
Error-parity constraints such as approximate equal opportunity \citep[equality of recall across groups;][]{hardtEqualityOpportunitySupervised2016} or approximate equality of accuracy \citep{zafarFairnessConstraintsFlexible2019} are harder to guarantee. The difficulty is that while ensembles can improve average performance, unequal improvements across groups can increase disparities \citep[see, e.g.,][]{schweighoferDisparateBenefitsDeep2025}. Nonetheless, \emph{restricted groupwise competence} still yields limited but useful bounds.

We consider the $L_\infty$ form of approximate fairness: a classifier has $k$-approximate fairness with respect to groups $\mathcal{G}$ if 
\begin{equation}
    k\geq \max_{g\in\mathcal{G}}L_g(h) - \min_{g\in\mathcal{G}}L_g(h)
\end{equation}
where $L_g$ is the average loss on group $g$, corresponding to $1$ minus one of the measures we are concerned with (typically recall).
%
The question then is, if every member of the ensemble exhibits $k$-approximate fairness, what fairness bounds do we have for the ensemble? 

By applying Eq. \ref{eq:original-theorem} (see Appendix~\ref{app:deo-derivation} for derivation), we obtain the following bound:
%
\begin{align} \label{eq:deo-bounds}
    k^*&\leq k + \max_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho} [L_g(h)]\text{DER}_{g^*}-\max(0,\min_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho} [L_g(h)](\text{DER}_{g^*}-1)) 
\end{align}
%
Both bounds are pessimistic. In practice, our approach works well for enforcing equal opportunity (see \secref{sec:results}). Still, two insights follow:
First, viewed through the governance lens of \emph{levelling down} \citep{mittelstadtUnfairnessFairMachine2024} these fairness violations are less concerning. Fairness was enforced per ensemble member, and presumably performance per group was set at an acceptable level. Any subsequent unfairness comes because groups are doing better than expected, rather than worse.
Second, the bound scales with $L_g$, and therefore the worst-case disparity shrinks as group losses decrease. In practice, this means that enforcing additional minimum rate constraints through our method can tighten the bounds.



\subsubsection{Guarantees for Minimum Recall} \label{sec:theory:min-recall}
The previous section showed that restricted groupwise-competent ensembles can improve minimum rates and fairness. In this section, we show how to ensure restricted groupwise competence by setting minimum recall rates.

Enforcing minimal recall rates for each ensemble member alters the decisions made. Looking at \eqref{restricted_compitence}, we observe that increasing the recall rate for all ensemble members over some group $g$ decreases the probability of error over the positives. As such, enforcing a sufficiently high recall rate can guarantee competence (i.e., perfect recall implies no errors and therefore competence).

In practice, identifying the smallest minimal recall rate that guarantees competence is an empirical question and requires a further held-out set to measure competence as a function of minimal recall.
Given the paucity of data, we are unable to do this. Instead, we prove that, for a minimum recall rate of more than $50\%$, competence is guaranteed for an ensemble where the members make independent errors. See appendix \ref{blah} for details. This result is consistent with \emph{Jury Theorems} \citep{condorcetEssaiLapplicationLanalyse1785,berendWhenCondorcetsJury1998,kanazawaBriefNoteFurther1998,pivatoEpistemicDemocracyCorrelated2017} that show that majority votes from mildly correlated voters with average accuracy $>0.5$ improve over individual voters, converging to perfect accuracy as ensemble size increases \citep{matteiAreEnsemblesGetting2025}. We emphasise that only the specific value of 0.5 depends on independence assumptions. The existence of \emph{some} threshold does not, and neither does the rest of the theory.


Similarly, when the minimum recall for every member falls below $50\%$, independent ensembles are not restricted groupwise competent. We demonstrate that this also holds empirically in Fig.~\ref{fig:min-recall-competence}, where no group achieves competence when the enforced minimum recall is below $0.5$ across two datasets (see \secref{sec:data}).

 



\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figs/recall_competence_simple_plot.pdf}
    \caption{\textbf{Competence Violations vs.\ Recall.} Competence violations ($C_\rho$; $0$ = perfect) are high when recall $<0.5$ and stabilize at recall $>0.5$. \emph{Left}: Test set for fitting and evaluation. \emph{Right}: Validation set for fitting, test set for evaluation.}
    \label{fig:min-recall-competence}
\end{figure}





\subsubsection{Minimum validation and evaluation sizes} \label{sec:theory:sizes}
Under the assumption of independent errors, if every member attains a minimum recall of $k > 0.5$ on the test set, then the ensemble is guaranteed to also have a minimum recall of at least $k$.
The challenge here is that recall constraints are imposed on validation data, and as we are dealing with very low-data groups, sometimes with $<100$ positive cases, the constraints need not generalise to test data. 


To ensure these constraints generalise to test data, we want to determine the minimum recall, $p_{\min}$, required on a validation set with $m$ positives in the minority group such that, with probability at least $1-\alpha$, the recall on an evaluation set with $n$ positives will be at least $50\%$. This guarantees that the minimum recall of the ensemble is no lower than the average recall of its members.

We assume that validation and test sets are of known sizes, $m$ and $n$ respectively, and drawn from the same distribution. By drawing on the literature for one-sided hypothesis tests on Bernoulli distributions, we arrive at Eq. \ref{eq:minimum-observed-recall}. 
%
\begin{equation} \label{eq:minimum-observed-recall}
p_{\min} = 0.5 + \tfrac{1}{2}z_{1-\alpha}\sqrt{\tfrac{1}{m}+\tfrac{1}{n}}.
\end{equation}
%
Here $z_{1-\alpha}$ is the $(1-\alpha)$-quantile of the standard normal distribution, i.e., the one-sided critical value at significance level $\alpha$. The primary implication of Eq.~\ref{eq:minimum-observed-recall} is that a larger $n$ decreases the need for high validation thresholds---especially in small-data settings. For derivations, see Appendix~\ref{app:minimum-sizes}.
%
We find empirical support for our theoretical guarantees of fairness on positive samples in Appendix \ref{app:competence-validation}. Here, we show that as long as the minimum recall is enforced at a sufficiently high threshold, we observe restricted groupwise competence on the test set. 

This result applies more generally outside of fairness: to ensure that a classifier has a recall of more than $k$ on an unseen test set with probability at least $1-\alpha$, the recall on a validation set should be set to more than

\[
p_{\min} = k + z_{1-\alpha}\sqrt{\,k(1-k)\Bigl(\tfrac{1}{m}+\tfrac{1}{n}\Bigr)}.
\]

See Appendix \ref{app:minimum-sizes} for more details. 


%This combines our theoretical guarantees of competence (\secref{sec:theory:min-recall}) and minimum group sizes (\secref{sec:theory:sizes}).




\section{Experimental Setup} \label{sec:analysis-experiments}
\renewcommand{\arraystretch}{0.8}
\begin{table}[ht]
\footnotesize
\centering
\caption{Evaluation datasets. ``Min.\ Positives'' is the number of \emph{positive} examples in the smallest group (bold). These small counts stress-test low-data fairness.}
\label{tab:datasets}
\begin{tabular}{llrl}
\toprule
\textbf{Dataset} & \textbf{Task} & \textbf{\# Min. Positives} & \textbf{Protected Attributes} \\
\midrule
HAM10000 & Skin cancer & 94 & Age (0-40, \textbf{40-60}, 60+) \\
Fitzpatrick17k & Dermatology & 60 & Skin type (I-IV, V, \textbf{VI}) \\
Harvard-FairVLMed & Glaucoma & 399 & Race (Asian, White, \textbf{Black}) \\
% \midrule
% \multicolumn{4}{l}{\textit{Natural Language}} \\
% Multilingual Twitter & Polish hate speech & 60 & Gender (male, \textbf{female}) \\
\bottomrule
\end{tabular}
\end{table}
\renewcommand{\arraystretch}{1.0}

\paragraph{Data and Protected Attributes} \label{sec:data}



We evaluate on three medical imaging datasets from MedFair \citep{zongMEDFAIRBenchmarkingFairness2022} and FairMedFM \citep{jinFairMedFMFairnessBenchmarking2024}---see Table \ref{tab:datasets}. Each task is a binary classification with image-only inputs (discarding all auxiliary features for fair comparison). %We add a multilingual hate speech dataset for cross-modality validation \citep{huangMultilingualTwitterCorpus2020}.
%
For Fitzpatrick17k, the common binary split (I–III vs.\ IV–VI) can mask harms to the darkest tone (VI), which comprises only 0.4\% of positives. We instead separate out V and VI, grouping I–IV to preserve adequate support elsewhere.

\paragraph{Preprocessing and splits:}
Images are centre-cropped and resized to $224\times224$ \citep{dengImageNetLargescaleHierarchical2009} with random augmentations during training. Dataset-specific validation/test sizes follow \secref{sec:theory:sizes} to guarantee 70\% minimum observable recall. See Appendix~\ref{app:implementation} for full details.





\paragraph{Evaluation Metrics:} \label{sec:evaluation}

Medical classification is a non-zero-sum game where ``levelling down''---reducing groups' performance to achieve parity---can have fatal consequences \citep{mittelstadtUnfairnessFairMachine2024}. The predominant harm is failing to diagnose ill people from disadvantaged groups, making \textit{minimum recall} a more appropriate metric than disparity-based measures such as equal opportunity. Moreover, with positive class incidence below 10\% for disadvantaged groups, a trivial all-negative classifier achieves high accuracy, and perfectly satisfies equal opportunity, while misclassifying all sick patients.

%We evaluate models on the Pareto frontier between minimum recall and accuracy \citep{delaneyOxonFairFlexibleToolkit2024}. 
However, a key question when using \emph{minimum recall rates} is ``What should the rate be set to?'' Our position is that this is a deployment decision that must be made on a case-by-case basis. As such,  our primary metric, $\FairAUC$, summarizes the possible choices by averaging the best accuracy $a$ achievable for each minimum recall threshold $t \in T$:
%
\begin{equation} \label{eq:improvement}
\FairAUC = \frac{1}{|T|} \sum_{t \in T} \left( \max\limits_{(a,r) \in M, r \geq t} a \right)
\end{equation}
%
where $M$ is the set of model configurations and $r$ is minimum recall. We evaluate over thresholds $T \subseteq [0.5,1]$---the zone with theoretical guarantees (\secref{sec:theory}). Confidence intervals use 200 bootstrap samples at 95\%. For baselines without explicit thresholding, we generate Pareto frontiers by varying global thresholds on validation data. %$\FairAUC$ is undefined for error-parity measures.

\paragraph{Baselines and Ensemble Settings:} \label{sec:baseline}
\begin{figure}[!htbp]
    \centering
    \subfigure[Fitzpatrick17k\label{fig:fitzpatrick-improvements}]{
        \includegraphics[width=0.48\textwidth]{figs/efficientnet_s-fitzpatrick17k-all_threshold_evaluation_min_recall_improvement.pdf}
    }\hfill
    \subfigure[HAM10000\label{fig:ham10000-improvements}]{
        \includegraphics[width=0.48\textwidth]{figs/efficientnet_s-ham10000-all_threshold_evaluation_min_recall_improvement.pdf}
    }
    \caption{\textbf{Fairness–accuracy AUC (FairAUC) relative to ERM.} \FairEnsemble achieves higher FairAUC than all baselines on Fitzpatrick17k (left) and HAM10000 (right). Error bars show 95\% bootstrap CIs. Evaluation follows \secref{sec:evaluation} over minimum-recall thresholds in $[0.5,1]$.}
    \label{fig:combined-improvements}
\end{figure}
We compare against established fairness methods to ensure a meaningful contribution. As a reference, \textbf{Empirical Risk Minimisation (ERM)} minimises training error without considering fairness \citep{vapnikNatureStatisticalLearning2000}. We include \textbf{Domain-Independent Learning}, which trains a separate classifier for each protected class with a shared backbone, and \textbf{Domain-Discriminative Learning}, which encodes protected attributes during training and removes them at inference \citep{wangFairnessVisualRecognition2020}. \textbf{Fairret} introduces a regularisation term accounting for the protected attribute and fairness criterion \citep{buylFairretFrameworkDifferentiable2023}, while \textbf{OxonFair} tunes decision thresholds on validation data to enforce group-level fairness \citep{delaneyOxonFairFlexibleToolkit2024}. \textbf{Ensemble (HPP)} implements a homogeneous ensemble \citep[similar to][]{koFAIRensembleWhenFairness2023} followed by Hardt Post Processing \citep{hardtEqualityOpportunitySupervised2016} as proposed by \citet{schweighoferDisparateBenefitsDeep2025}. Finally, \textbf{Ensemble (ERM)} is equivalent to our method without enforced constraints, serving as an ablation to assess whether the added fairness interventions of \FairEnsemble increase $\FairAUC$.


All baselines are trained with the same configuration as our ensembles. Minority groups are rebalanced via upsampling as suggested by \citet{claucichFairnessDeepEnsembles2025}, and we reimplement methods following \citet{zongMEDFAIRBenchmarkingFairness2022} and \citet{delaneyOxonFairFlexibleToolkit2024}. Fairret requires a hyperparameter search over regularisation weights. To generate comparable Pareto frontiers, we fit global prediction thresholds so that a minimum recall of $k$ is enforced on a held-out validation set, mirroring deployment where thresholds are tuned on available data but applied to unseen test data \citep{kamiranQuantifyingExplainableDiscrimination2013}. For minimum recall experiments, for all methods that do not naturally support minimum recall rates, we select a global threshold that maximises accuracy while achieving a recall $>k$ on the validation set.


\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{8pt}
\caption{Single-image inference. Details in Appendix \ref{app:efficiency}.}
\begin{tabular}{l rr}
\toprule
& \multicolumn{2}{c}{Latency (ms) $\downarrow$} \\
\cmidrule(lr){2-3}
Method & CPU & CUDA \\
\midrule
ERM & 112.22 $\pm$ 13.58 & 5.42 $\pm$ 0.31 \\
Ensemble & 107.15 $\pm$ 12.41 & 5.83 $\pm$ 0.38 \\
\bottomrule
\end{tabular}

\label{tab:single_image_latency}
\end{table}


\paragraph{Ensemble size:}
We use 21 members for all ensembles. Appendix~\ref{app:ensemble-size} shows that $\FairAUC$ is stable across different sizes from 3 to 21 within confidence intervals. We default to 21: it is consistent with our theory that majority voting benefits from more members, while our shared-backbone design keeps inference time essentially unchanged (see Table~\ref{tab:single_image_latency} for efficiency comparisons).




\section{Results} \label{sec:results}
See Appendix \ref{app:mobilenetv3} for similar results with an alternative backbone.
%\renewcommand{\arraystretch}{0.7}
\begin{table}[h]
\footnotesize
\centering
\caption{Accuracy and fairness violations. Best value in \textbf{bold}.}
\begin{tabular}{c|cc|cc}
\toprule
\multirow{2}{*}{Dataset} & \multicolumn{2}{c|}{Accuracy $\uparrow$} & \multicolumn{2}{c}{Fairness Violations $\downarrow$} \\
& \FairEnsemble & OxonFair & \FairEnsemble & OxonFair \\
\midrule
FairVLMed      & \textbf{0.665} & 0.657 & \textbf{0.009} & 0.011 \\
Fitzpatrick17K & \textbf{0.642} & 0.623 & 0.057 & \textbf{0.048} \\
HAM10000       & \textbf{0.707} & 0.679 & \textbf{0.067} & 0.082 \\
\bottomrule
\end{tabular}
\label{tab:ensemble_comparison}
\end{table}







\paragraph{FairVLMed:}
% \subsection{Medical Imaging}


In \Figref{fig:frontiers} (right), only \FairEnsemble maintains fairness at strict thresholds ($\text{EqualOpportunity}<4\%$). Most methods break down above 6\%. Compared to OxonFair, \FairEnsemble achieves higher accuracy with lower fairness violations (Table~\ref{tab:ensemble_comparison}). While standard ensembles have slightly higher accuracy, \FairEnsemble consistently reduces disparities further (e.g., equal opportunity from 6\% to $<5\%$ with $<1$pp accuracy loss). The HPP-based method from \citet{schweighoferDisparateBenefitsDeep2025} fails to enforce equal opportunity ($\text{EO}_p>11\%$).



\paragraph{Fitzpatrick17k:} \label{sec:fitz-results}
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figs/efficientnet_s-all_datasets-all_iterations-all_threshold_evaluation.pdf}
    \caption{\textbf{Pareto frontiers across datasets.} \FairEnsemble (green) yields better fairness–accuracy trade-offs than baselines (grey). Left/centre: min recall (HAM10000, Fitzpatrick17k). Right: equal opportunity (FairVLMed). See \secref{sec:evaluation} for definitions.}

    \label{fig:frontiers}
\end{figure}
Here, in the most challenging setting (60 positive samples in the smallest group), \FairEnsemble clearly outperforms all baselines. It reaches $\FairAUC=67.7\%$, compared to 57.0\% for standard ensembles (58.9\% with HPP) and 51.3\% for ERM (\Figref{fig:fitzpatrick-improvements}). Across thresholds, \FairEnsemble is Pareto-optimal (\Figref{fig:frontiers}, centre).



\paragraph{HAM10000:} \label{sec:ham-results}
%Table~\ref{tab:ensemble_comparison} shows 
\FairEnsemble achieves the highest accuracy and lowest fairness violations. Its $\FairAUC=71.1\%$ significantly outperforms ERM (65.7\%), baseline ensembles (69.8\% \& 69.2\%), and OxonFair (67.9\%). All other methods perform worse than ERM.



\section{Conclusion} \label{sec:conclusion}
A lack of data for minority groups remains one of the fundamental challenges in ensuring equitable outcomes for disadvantaged groups.
We present a novel framework for constructing efficient ensembles of fair classifiers that address the challenge of enforcing fairness in these low-data settings. Across three medical imaging datasets, our method consistently outperforms existing fairness interventions on fairness-accuracy trade-offs. Unlike prior work on ensembles that observed occasional fairness improvements, our approach guarantees that fairness is not degraded and shows that ensembles are a practical tool for reusing scarce data to produce more reliable fairness estimates.

Our theoretical analysis explains \textit{why} these improvements occur. We prove that enforcing minimum rate constraints above $0.5$ ensures ensemble competence for the worst-performing groups, derive bounds for error-parity measures such as equal opportunity, and provide principled guidance on the validation and test sizes needed for these guarantees to hold in practice. Together, these results expand the understanding of both when and why ensembles improve fairness, offering a principled and empirically validated method for building more equitable classifiers in high-stakes domains. Code can be found on \href{https://github.com/jhrystrom/guaranteed-fair-ensemble}{GitHub}.




\bibliography{midl26_27}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\appendix

\section{Implementation Details} \label{app:implementation}
The code and instructions for reproducing the results can be found in our GitHub repository\footnote{Link: \url{https://github.com/jhrystrom/guaranteed-fair-ensemble}}. Optimisation for all models is done using Adam \citep{kingmaAdamMethodStochastic2015}  with a learning rate of 0.0001. 

The test splits for the baseline methods (see \secref{sec:baseline}) used the same seed as the first ensemble run. All experiments were run with deterministic seeds for reproducibility (see repository). 

To choose the sizes of the validation and test sets, we use the theory described in \secref{sec:theory:sizes}. Applying a minimum observable recall of 70\%, we obtain the following sizes. These were applied consistently across all methods.

\begin{itemize}
    \item \textbf{Fitzpatrick17K}: $|\valid|=33\%,|\test|=25\%$
    \item \textbf{HAM10000}: $|\valid|=20\%,|\test|=20\%$
    \item \textbf{FairVLMed}: $|\valid|=10\%,|\test|=10\%$
\end{itemize}

For fairret, we evaluate a set of regularisation parameters ranging from 0.5 to 1.5, namely [0.5, 0.75, 1.0, 1.25, 1.5]. While the method of \citet{buylFairretFrameworkDifferentiable2023} technically does not require a validation set, it makes use of a hyperparameter to govern the fairness/accuracy trade-off. This hyperparameter cannot be set a priori, and must be tuned for every dataset, requiring the use of validation data. We do not conduct any additional parameter search for Domain Discriminative, ERM, or Domain Independent. 

All training was done on a single H100. For the final results of the paper, we ran analysis on 3 datasets for 3 iterations using Weights \& Biases \citep{wandb}. Each run took ${\sim}11$ minutes. In addition, the baseline experiments add an extra 20 runs. In total, this results in approximately 14.5 hours of compute to reproduce the complete results. Note that the experiments could have been run on cheaper hardware since the EfficientNetV2 models only have 43M parameters. 

While the above details the compute used to produce the results from the paper, we conducted further experiments before this. Particularly, we experimented with a less efficient ensemble structure requiring a separate run for each ensemble member. This required significantly more compute time.

\section{Data Access and Information} \label{app:data-access}
We provide links for accessing the data in Table \ref{tab:data-acces}. While all data is openly available for academic research, some of it requires approval by the providers. 

For detailed summary statistics for HAM10000 and Fitzpatrick17k, see the supplemental material in MedFair \citep{zongMEDFAIRBenchmarkingFairness2022}. For FairVLMed, we refer to the FairCLIP paper \citep{luoFairCLIPHarnessingFairness2024} as well as the GitHub page. For further details, see the original publications.


\begin{table}[ht]
\centering
\caption{Dataset access information}
\begin{tabular}{p{2.5cm}p{5cm}p{2.5cm}}
\toprule
\textbf{Dataset} & \textbf{Access URL} & \textbf{Reference} \\
\midrule
Fitzpatrick17k & \url{https://github.com/mattgroh/fitzpatrick17k} & \citep{grohEvaluatingDeepNeural2021} \\
HAM10000 & \url{https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T} & \citep{tschandlHAM10000DatasetLarge2018} \\
FairVLMed & \url{https://github.com/Harvard-Ophthalmology-AI-Lab/FairCLIP} & \citep{luoFairCLIPHarnessingFairness2024} \\
\bottomrule
\end{tabular}
\label{tab:data-acces}
\end{table}

\section{Theoretical formalisms} \label{app:formalism}
\begin{table}[ht]
\centering
\caption{Summary of notation used in \secref{sec:theory}.}
\label{tab:notation}
\small
\begin{tabular}{ll}
\toprule
\textbf{Symbol} & \textbf{Definition} \\
\midrule
$\mathcal{D}$ & Data distribution over $(X,Y)$ \\
$X$ & Input features \\
$Y \in \{0,1\}$ & Binary label (1 = positive, 0 = negative) \\
$A \in \mathcal{G}$ & Protected attribute; $\mathcal{G}$ is the set of groups \\
$g \in \mathcal{G}$ & A particular protected group \\
$\mathcal{D}_{g,+}$, $\mathcal{D}_{g,-}$ & Conditional distributions $\mathcal{D}\!\mid\!(A=g, Y=1)$ and $\mathcal{D}\!\mid\!(A=g, Y=0)$ \\
$g{+}$, $g{-}$ & Shorthand for positives $(A=g,Y=1)$ and negatives $(A=g,Y=0)$ \\
\midrule
$h$ & Individual classifier (ensemble member) \\
$h'$ & Another (distinct) ensemble member \\
$\rho$ & Distribution over ensemble members (uniform in practice) \\
$h_{\mathrm{MV}}$ & Majority-vote classifier induced by $\rho$ \\
$N$ & Ensemble size (number of members) \\
\midrule
$L_{\mathcal{D}}(h)$ & Error rate (0–1 loss) of $h$ on $\mathcal{D}$ \\
$L_g(h)$ & Groupwise loss on group $g$ (e.g., $1-\text{recall}$ or $1-\text{accuracy}$) \\
$D_{\mathcal{D}}(h,h')$ & Disagreement rate between $h$ and $h'$ on $\mathcal{D}$ \\
$W_\rho(X,Y)$ & Ensemble error rate on $\mathcal{D}$: $\mathbb{E}_{h\sim\rho}[\1\{h(X)\neq Y\}]$ \\
$W^{g+}_\rho$ & Ensemble error rate on positives in group $g$ (i.e., on $\mathcal{D}_{g,+}$) \\
$W^{g-}_\rho$ & Ensemble error rate on negatives in group $g$ (i.e., on $\mathcal{D}_{g,-}$) \\
\midrule
$t \in [0,1/2]$ & Margin parameter in competence definitions \\
$C_\rho$ & Competence on $\mathcal{D}$: $P(W_\rho\!\in\![t,1/2)) - P(W_\rho\!\in\![1/2,1-t])$ \\
$C^{g+}_\rho$ & Restricted groupwise competence on $g{+}$ (analogously $C^{g-}_\rho$ for $g{-}$) \\
\midrule
$\text{EIR}_{\mathcal{D}}$ & Ensemble Improvement Rate: $\frac{\mathbb{E}_{h\sim\rho}[L_{\mathcal{D}}(h)] - L_{\mathcal{D}}(h_{\mathrm{MV}})}{\mathbb{E}_{h\sim\rho}[L_{\mathcal{D}}(h)]}$ \\
$\text{DER}_{\mathcal{D}}$ & Disagreement–Error Ratio: $\frac{\mathbb{E}_{h,h'\sim\rho}[D_{\mathcal{D}}(h,h')]}{\mathbb{E}_{h\sim\rho}[L_{\mathcal{D}}(h)]}$ \\
$g^*$ & Index for the distribution on which DER/EIR are computed (e.g., $g{+}$, $g{-}$, or full) \\
$k$ & Minimum rate constraint (e.g., minimum recall/sensitivity) \\
$k^*$ & Upper bound on ensemble fairness gap under error-parity bounds \\
\midrule
$K$ & Number of positive predictions among $N$ members for a datapoint \\
$K_i$ & Bernoulli indicator of the $i$-th member’s positive prediction \\
$p_i$ & Success prob.\ of $K_i$; $p_i = k + \delta$ under enforced minimum rate \\
$\bar p$ & Mean recall across members: $\bar p = \frac{1}{N}\sum_{i=1}^N p_i$ \\
$\delta \ge 0$ & Margin by which enforced minimum rate exceeds $k$ on validation \\
\midrule
$m,n$ & \# positives in validation/test for the minority group (for power analysis) \\
$\alpha$ & Significance level in the one-sided test \\
$z_{1-\alpha}$ & $(1-\alpha)$-quantile of the standard normal distribution \\
$p_{\min}$ & Minimum observed validation recall to ensure test-time recall $>0.5$: \\
& $p_{\min} = 0.5 + \tfrac{1}{2} z_{1-\alpha}\sqrt{\tfrac{1}{m}+\tfrac{1}{n}}$ \\
\bottomrule
\end{tabular}
\end{table}



Table \ref{tab:notation} defines all notation used in the main paper.


As mentioned in the main paper, \cite{theisenWhenAreEnsembles2023} bound the improvements of an ensemble (i.e., the \textit{Ensemble Improvement Rate (EIR)}) by the \textit{Disagreement-Error Ratio (DER)} of the ensemble, i.e., the ratio of the average pairwise disagreement rate to the average error of ensemble members. 

For completeness, we repeat their major results below. Note that while \cite{theisenWhenAreEnsembles2023} consider a fixed distribution $\mathcal{D}=(X,Y)$, which they frequently drop from their notation, we preserve it as we will want to vary $\mathcal{D}$.

Their results are as follows: 

The ensemble improvement rate is defined as:
\begin{equation} \label{eq:eir-definition}
    \text{EIR}_{\mathcal{D}} = \frac{\mathbb{E}_{h \sim \rho}[L_{\mathcal{D}}(h)] - L_{\mathcal{D}}(h_{\text{MV}})}{\mathbb{E}_{h \sim \rho}[L_{\mathcal{D}}(h)]}.
\end{equation}

and the Disagreement-Error Ratio as:
\begin{equation}
  \text{DER}_{\mathcal{D}} = \frac{\mathbb{E}_{h, h' \sim \rho}[D_{\mathcal{D}}(h, h')]}{\mathbb{E}_{h \sim \rho}[L_{\mathcal{D}}(h)]}.
\end{equation}

Here, $L_\mathcal{D}(h)$ is the error rate of classifier $h$ on data distribution $\mathcal{D}$, $h_\text{MV}$ is the majority vote classifier, $\mathbb{E}_{h \sim \rho}$ denotes the expected value over all ensemble members, and $D_\mathcal{D}(h, h')$ is the disagreement rate between classifiers $h$ and $h'$. 

Specifically, the authors provide upper and lower bounds on the EIR. Crucially, this rests on an assumption of \textit{competence}, which informally states that ensembles should always be at least as good as the average member. More formally, \cite{theisenWhenAreEnsembles2023} state:

\begin{assumption}[Competence]
Let $W_{\rho,\mathcal{D}} \equiv W_{\rho}(X,Y) = \mathbb{E}_{h \sim \rho,\mathcal{D}}[\1(h(X) \neq Y)]$. The ensemble $\rho$ is \emph{competent} if for every $0 \leq t \leq 1/2$,
\begin{equation}
\mathbb{P}(W_{\rho,\mathcal{D}} \in [t, 1/2)) \geq \mathbb{P}(W_{\rho,\mathcal{D}} \in [1/2, 1-t]).
\end{equation}
\end{assumption}

This assumption can be interpreted as formalising the statement that a majority voting ensemble is more likely to be confidently right than confidently wrong. 

Based on this assumption, \cite{theisenWhenAreEnsembles2023} prove the following theorem: 

\begin{theorem}
    Competent ensembles never hurt performance, i.e., $\text{EIR} \geq 0$.
\end{theorem}

This assumption is only required to rule out pathological cases. For most real-world examples, this will be trivially satisfied. In the case of binary classification, the bounds on EIR can be simplified to \eqref{eq:original-theorem} from the main text.

\section{Ablation: Ensemble Sizes} \label{app:ensemble-size}
\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{figs/ensemble_size_vs_fair_auc.pdf}
    \caption{Relationship between \textbf{Ensemble Size} (X-axis) and \textbf{FairAUC} (Y-axis) across two datasets. No significant relationship is observed.}
    \label{fig:ensemble-size-fair-auc}
\end{figure}

In this section, we ask: ``How does ensemble size affect performance?'' We examine how $\FairAUC$ varies with ensemble size on the test set, and whether validation performance predicts test performance.  

Our design makes this straightforward: because ensemble members are trained independently, we can form smaller ensembles by subsampling members. We construct ensembles of size $m \in \{3,5,\dots,M\}$ with $M=21$, and compute $\FairAUC$ on both validation and test sets for HAM10000 \citep{tschandlHAM10000DatasetLarge2018} and Fitzpatrick17k \citep{grohEvaluatingDeepNeural2021} across all train/test partitions.  

\Figref{fig:ensemble-size-fair-auc} shows no consistent trend: confidence intervals are wide, and performance does not vary systematically with ensemble size. An alternative heuristic is to use validation $\FairAUC$ to select ensemble size, but as \Figref{fig:ensemble-size-validation} shows, the relationship between validation and test performance is too noisy to be useful. This is expected, as our method already leverages all non-test data to fit fairness weights.  

Lacking a strong empirical heuristic, we adopt the largest ensemble ($M=21$), which best aligns with our theoretical results: larger ensembles provide stronger guarantees under Jury-theorem arguments (see \secref{sec:theory:min-recall}).  

\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{figs/val_test_ensemble_size.pdf}
    \caption{Relationship between $\FairAUC$ on validation (X-axis) and test set (Y-axis) across ensemble sizes. The relationship is too noisy to guide model selection.}
    \label{fig:ensemble-size-validation}
\end{figure}





\section{Empirical Validation of Competence} \label{app:competence-validation}
\begin{figure}
    \centering
    % First subfigure
    \includegraphics[width=\textwidth]{figs/competence_mass_plot_by_group.png}
    % Main figure caption
\caption{\textbf{Empirical validation of competence proofs}. We show that enforcing minimum recall, $k>0.5+\delta$, leads to \textit{competent} ensembles (see \secref{sec:theory}). $\delta$ depends on the data size (\secref{sec:theory:sizes}) and $0.5$ comes from our proof in \secref{sec:theory:min-recall}. The data points \emph{above} the threshold lie above the X-axis, whereas the points \emph{below} the threshold fall on both sides.}
    \label{fig:comptence-empirical}
\end{figure}

We empirically validate our proofs from \secref{sec:theory:sizes} and \secref{sec:theory:min-recall}. Specifically, we want to show that enforcing recall at $k>0.5+\delta$ leads to competent ensembles if $\delta$ matches the size of the datasets. This would help validate both theoretical extensions of \citet{theisenWhenAreEnsembles2023}. 

To conduct this analysis, we set $\text{threshold}=k+\delta=0.7$ (as described in Appendix \ref{app:implementation}). We then run the competence calculations from \citet{theisenWhenAreEnsembles2023} for different $k$ above and below the threshold. The resulting figure is \Figref{fig:comptence-empirical}. 

\section{Benchmarking Efficiency} \label{app:efficiency}
A big advantage of our \FairEnsemble method is that it is efficient for training and inference because it utilises a shared backbone (see \secref{sec:ensemble}). In this section, we provide evidence for these claims. 

The results for inference can be seen in Table \ref{tab:single_image_latency}. Here, we see comparable inference speeds for ERM and ensemble across both CPU and GPU. The GPU runs are done on an NVIDIA H100 80GB GPU. The runs are with a batch size of 1, averaged over 100 runs, with a warm-up size of 10. There are no significant differences between the methods.

The results for training can be seen in Table \ref{tab:runtime_comparison} based on Weights \& Biases data \citep{wandb}. Here, we see a larger difference; ensembles take approximately 3x longer to train compared to ERM. This may be because we are in essence training 84 times more classifiers (21 members with four heads each). Still, because of the small size of the datasets, the training times are manageable.

It is worth noting that substantial optimisation is available for training. Because the backbone is frozen, the entire evaluation set (validation sets + test set) can be pre-computed. This would drastically speed up the training. However, these optimisations were not done in the interest of time.


\begin{table}[ht]
\centering
\caption{Average training runtime (in minutes)}
\begin{tabular}{lcc}
\toprule
Training Method & Avg. Runtime (min) & Std. Dev. (min) \\
\midrule
Ensemble & 31.79 & 5.13 \\
ERM      & 8.51  & 2.28 \\
\bottomrule
\end{tabular}

\label{tab:runtime_comparison}
\end{table}

\section{Derivations}
\subsection{Restricted Groupwise Competence under minimum recall and Independence Assumptions}
\label{blah}
To prove this, we assume independence of classifier errors and define $I_p$ for any subset of classifiers $p \subseteq \rho$: 
\begin{equation}
    I_p(x) =\prod_{i\in p} P(c_i(x)=1)\prod_{j\in \bar p}P(c_j(x)=0)
\end{equation}
 then we decompose
%
\begin{equation}
    P(W^{g+}_\rho = t) = \sum_{\substack{p \subseteq \rho \\ |p|=s}}I_p
\end{equation}
%
\paragraph{Sketch of the proof:}
The proof requires two observations:
\begin{enumerate}
    \item \emph{Negative flips decrease probabilities} (given by Lemma~\ref{flip}). Given a subset $p$ of ensemble models taking positive labels, with their complement taking negative labels, flipping some of $p$ so they also take negative labels to obtain a new subset $q$ will result in $q$ having a probability of occurring no higher than that of $p$.
    \item \emph{Matching $p$s and $q$s} (given by Lemma~\ref{matchingpq}). It is possible to identify matching pairs of such $p$ of size $s$ and $q$ of size $|\rho|-s$ in equation \eqref{eq:scd-proof}.
\end{enumerate}
\begin{lemma}
\label{flip}
If $p\supseteq q$, the following inequality holds for their associated summands:
\begin{equation}
I_p\geq I_q    
\end{equation}
\end{lemma}
\begin{proof}
To see this, we write $n=\bar p$ for the members of the ensemble that take a negative label in both $p$ and $q$, and $a=p\setminus q$ for the members of the ensemble that switch from a positive label to a negative one as we move from $p$ to $q$.
Then 
\begin{equation}
    \prod_a P(c_a(X)=1)\geq k^{|a|} \geq (1-k)^{|a|} \geq \prod_a P(c_a(X)=0)
\end{equation}
and
\begin{align}
    \prod_a P(c_a(X)=1)\prod_q P(c_q(X)=1)\prod_n P(c_n(X)=0)\geq \nonumber \\
    \prod_a P(c_a(X)=0)\prod_q P(c_q(X)=1)\prod_n P(c_n(X)=0) 
    \end{align}
As required. 
\end{proof}
\begin{lemma}
\label{matchingpq}
There exists a monotonic bijection $m$ that maps from sets of size $s$ to sets of size $|\rho|-s$ such that if $m(p)=q$ then $p\supseteq q$.
\end{lemma}
\begin{proof}
This follows from the existence of symmetric chain decomposition (see \citet{greeneStructureSpernerKfamilies1976} for details).

A Symmetric Chain (SC) is a chain
\[
A_0 \subset A_1 \subset \dots \subset A_t
\]
in the Boolean lattice $\mathcal{B}_n$ whose ranks satisfy
\[
|A_0| + |A_t| = n,
\]
so the chain begins at rank $k$ and ends at rank $n-k$, increasing in size by one at each step.

A Symmetric Chain Decomposition (SCD) is a decomposition of $\mathcal{B}_n$, that is, a partition of the lattice into pairwise disjoint symmetric chains whose union contains every subset of $\{1,\dots,n\}$.

By definition, every SC can only include one point of any size, and any SC that includes a point of size $k$ also includes a point of size $n-k$. As an SCD provides a disjoint cover of the hypercube, every point of size $k$ is part of a single chain. Each chain contains only one point of size $n-k$, and as such any SCD defines a monotonic bijection from points of size $k$ to points of size $n-k$. 
\end{proof}
\subsubsection{Proof}
Let $k\geq 0.5$ be the minimum recall rate. We will prove  a stronger statement that for each $t\in [0,0.5]$:
\begin{equation}
    P(W^{g+}_\rho = t) \geq P(W^{g+}_\rho  = 1-t) \quad \forall g \in \mathcal{G}
\end{equation} 
For individual datapoints, unless $t=\frac s {|\rho|}$ for some integer $s<|\rho|/2$, the equation trivially holds as left and right side of the equation are both $0$.

When $t=\frac s {|\rho|}$, the above statement is equivalent to saying that the probability of exactly $s\leq 0.5 |\rho|$ members of the ensemble voting incorrectly is at least the probability of exactly $s$ members voting correctly.


We will establish a bijective correspondence between each summand to a smaller summand in the expression  
\begin{equation}
    P(W^{g+}_\rho = 1-t) = \sum_{\substack{q \subseteq \rho \\|q|=|\rho|-s}}I_{q}
\end{equation}
%
By application of Lemma~\ref{matchingpq}, followed by Lemma~\ref{flip}, we can rewrite:
%
\begin{equation} \label{eq:scd-proof}
    P(W^{g+}_\rho = 1-t) = \sum_{\substack{|q|=|\rho|-s\\ \forall s\leq |\rho|/2} }I_{q}=\sum_{\substack{q=m(p)\\|p|=s\\ \forall s\leq |\rho|/2}}I_q\leq \sum_{\substack{|p|=s\\ \forall s\leq |\rho|/2}}I_p =P(W^{g+}_\rho = t) \quad \forall g \in \mathcal{G}
\end{equation}

as required. \hfill\qed
\subsection{Minimum validation and evaluation sizes} \label{app:minimum-sizes}
\paragraph{Statistical Framework:}
We can frame the problem of ensuring minimum recall as a one-sided hypothesis test:


\begin{equation}
H_0: p_{\text{val}} = p_{\text{test}} = k
\quad\text{vs.}\quad
H_A: p_{\text{val}} > k.    
\end{equation}

Where $p_{\text{val}}$ is our threshold of interest. Because both the test set and validation sets are small, they both introduce sampling variability. Thus, we will explicitly account for the size of both. 

The hypothesis-testing framework has a few assumptions. First, it assumes that the validation and test sets are \textit{independently} drawn from the same distribution (an assumption we explicitly follow; see \secref{sec:data}). Second, it assumes that each positive instance is an independent \textbf{Bernoulli trial} that is either a true positive or a false negative. Finally, it assumes an approximately normal distribution. The normality assumption is met by the \textit{Large Counts Condition}, which heuristically states that $\min(mk, m(1-k), nk, n(1-k)) \ge 10$, which in our case simplifies to $\min(\frac{m}{2},\frac{n}{2}) \ge 10$. We thus minimally need roughly \textbf{20} positive instances for each group in both test and validation sets. 

\paragraph{Deriving minimums:}
Under $H_0$, the standard error of the difference between the minimum recall proportions in the validation and test set is:
\[
\text{SE}_0 = \sqrt{\,k(1-k)\Bigl(\tfrac{1}{m}+\tfrac{1}{n}\Bigr)}.
\]
The one-sided $z$ statistic fixing $p_{\text{test}}=k$ is
\[
z = \frac{p_{\text{val}} - k}{\text{SE}_0}.
\]
Requiring a significance level of $\alpha$ (i.e., $z \geq z_{1-\alpha}$) yields the minimal observable validation recall:

\[
p_{\min} = k + z_{1-\alpha}\sqrt{\,k(1-k)\Bigl(\tfrac{1}{m}+\tfrac{1}{n}\Bigr)}.
\]

For $k=0.5$, this simplifies to the result in Eq. \ref{eq:minimum-observed-recall}.

\subsection{Derivation of Equal Opportunity Bounds}\label{app:deo-derivation}

We derive the fairness bounds for ensembles under approximate equal opportunity (or accuracy) constraints.

Starting from the definition of $k'$-approximate fairness for the ensemble, we have
\begin{align}
    k'&=\max_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho}[L_g(h)](1-\text{EIR}_{g^*}) - \min_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho}[L_g(h)](1-\text{EIR}_{g^*})\\
    &\leq \max_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho}[L_g(h)] - \min_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho}[L_g(h)](1-\text{EIR}_{g^*})\\
    &\leq k - \min_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho} [L_g(h)]\cdot(-\text{EIR}_{g^*}) \\
    &\leq k + \max_{g\in\mathcal{G}}\mathbb{E}_{h\sim \rho} [L_g(h)]\text{DER}_{g^*}
\end{align}

where $g^*$ is an appropriate distribution (e.g., positives, negatives, or all points) constrained to a particular group $g$. By substituting in the lower bound from Theorem 2 instead of 0, we obtain the slightly tighter bound of Equation~\ref{eq:deo-bounds}.

\section{Detailed Related Work} \label{app:related}

\paragraph{Fairness in Medical Imaging:}
Deep learning-based computer vision methods have become highly popular for medical imaging applications \citep{caiReviewApplicationDeep2020}, yet despite achieving near-human performance on top-level metrics \citep{liuDeepLearningSystem2020}, they consistently underperform for marginalised groups \citep{xuAddressingFairnessIssues2024,kocakBiasArtificialIntelligence2024}. These biases persist across different domains and modalities from dermatology \citep{daneshjouDisparitiesDermatologyAI2022} to chest X-rays \citep{seyyed-kalantariUnderdiagnosisBiasArtificial2021} and retinal imaging \citep{coynerAssociationBiomarkerbasedArtificial2023}. For instance, there is pervasive bias in skin condition classification \citep{oguguoComparativeStudyFairness2023,daneshjouDisparitiesDermatologyAI2022,grohEvaluatingDeepNeural2021}, likely due to both bias in data collection \citep{drukkerFairnessArtificialIntelligence2023} and treatment procedures \citep{obermeyerDissectingRacialBias2019}.

Unfairness can arise at different stages of the development process \citep{drukkerFairnessArtificialIntelligence2023}. One persistent issue is unbalanced datasets \citep{larrazabalGenderImbalanceMedical2020}. Unbalanced datasets can lead to insufficient support for disadvantaged groups, which in turn can result in worse representations and more uncertain results \citep{riccilaraUnravelingCalibrationBiases2023,mehtaEvaluatingFairnessDeep2024}.

A successful approach to mitigating unfairness is to perform extensive hyperparameter and architecture search \citep{duttFairTuneOptimizingParameter2023,dooleyImportanceArchitecturesHyperparameters2022}. By jointly optimising for fairness and performance, these methods can reduce the generalisation gap and outperform other methods. Because of their computational cost, we do not compare against them in this work; however, our method can be built on top of the backbones found by the architecture search.

Defining fairness in the context of medical imaging is another challenge. While traditional fairness metrics, like equal opportunity \citep{hardtEqualityOpportunitySupervised2016}, are concerned with minimising disparities between groups, this might not be appropriate in a medical context. For instance, \citet{zhangImprovingFairnessChest2022} find that methods which optimise this notion of group performance reduce the performance of all groups. This phenomenon of `levelling down' \citep{zietlowLevelingComputerVision2022} can have fatal consequences for patients and may not meet the legal standards of fairness \citep{mittelstadtUnfairnessFairMachine2024}. Instead, researchers should strive to enforce minimum rate constraints, i.e., constraints on the performance of the worst-performing groups, which can help reduce persistent problems of underdiagnosis and undertreatment of disadvantaged groups \citep{seyyed-kalantariUnderdiagnosisBiasArtificial2021}.


\section{Alternative Backbones} \label{app:mobilenetv3}
\begin{figure}[!htbp]
    \centering
    \subfigure[Fitzpatrick17k\label{fig:fitzpatrick-improvements-mobilenetv3}]{
        \includegraphics[width=0.45\textwidth]{figs/mobilenetv3_large-fitzpatrick17k-all_threshold_evaluation_min_recall_improvement.pdf}
    }\hfill
    \subfigure[HAM10000\label{fig:ham10000-improvements-mobilenetv3}]{
        \includegraphics[width=0.45\textwidth]{figs/mobilenetv3_large-ham10000-all_threshold_evaluation_min_recall_improvement.pdf}
    }
    \caption{\textbf{Fairness–accuracy AUC (FairAUC) relative to ERM with a MobileNetV3 backbone}. \FairEnsemble outperforms all baselines on Fitzpatrick17k (left) and HAM10000 (right). Error bars show 95\% bootstrap CIs. Evaluation follows \secref{sec:evaluation} over minimum-recall thresholds in $[0.5,1]$.}
    \label{fig:combined-improvements-mobilenetv3}
\end{figure}
Here, we report the experiments conducted with a different backbone to show the robustness of our method. Specifically, we use the very small MobileNetV3 \citep{howardSearchingMobileNetV32019}, which is popular for edge devices.

\Figref{fig:combined-improvements-mobilenetv3} shows the main results. \FairEnsemble convincingly outperforms all baselines on both HAM10000 and Fitzpatrick17k, consistent with the EfficientNet results in the main text.

\end{document}