\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- Under Review}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Under Review for MIDL 2026}

\title[Fourier Failure Modes]{Fourier Profiles Predict Failure Modes in Chest X-ray Classification Models}
% \title[Short Title]{Spectral Blind Spots: Failure Mode Discovery in X-Ray Models using Fourier Profiles}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}}
% \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} % \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
\midlauthor{\Name{Samuel Halimi}
\orcid{1111-2222-3333-4444} \Email{samuel@azmed.co}\\
\Name{Loïc Themyr} \Email{loic@azmed.co}\\
\Name{Arnaud Abreu} \Email{arnaud@azmed.co}\\
\addr 10 rue d'Uzès 75002 Paris
}

\begin{document}

\maketitle

\begin{abstract}
Deep learning models for chest X-ray anomaly detection remain vulnerable to subtle distributional shifts (e.g., acquisition technique, patient-related factors, and preprocessing).
Traditional error analysis often relies on semantic metadata or model embeddings, which can mask low-level signal variations that degrade performance.
In this work, we propose a data-centric framework for automated failure mode discovery using spectral analysis.
We project images into the frequency domain and extract a compact profile summarizing the distribution of signal energy across frequency bands.
By performing unsupervised clustering on these spectral profiles, we demonstrate that model failures are not randomly distributed, but are strongly concentrated within specific spectral clusters.
This method effectively isolates ``blind spots'', enabling the prediction of model reliability and the discovery of performance-degrading data slices without requiring ground-truth failure annotations.
\end{abstract}

\begin{keywords}
Chest X-ray, Failure Mode Discovery, Spectral Analysis, Model Robustness, Unsupervised Clustering.
\end{keywords}

\section{Introduction}
% Fail to generalize (distribution shifts)
Deep learning has achieved remarkable success in the interpretation of chest X-rays (CXRs), where algorithms now frequently match or exceed radiologist performance in controlled settings~\citep{katzman2024deep, bettinger2024evaluation}.
However, in the context of clinical deployment, models often fail to generalize to data that deviates statistically from their training distribution, resulting in significant and unanticipated performance degradation~\citep{zech2018variable, yu2022external}.
Ensuring the reliability of these systems requires the ability to anticipate and identify failure modes before they impact patient care.

% Existing approaches
Two main approaches have been explored to isolate underperforming samples from unseen data. Metadata stratification uses sensitive categorical attributes to spot biases in the system~\citep{gichoya2023ai}, while model-centric techniques examine the embedding of models, either to spot out-of-distribution samples~\citep{hong2024out} or to cluster coherent underperforming slices in the datasets~\citep{olesen2024slicing}.
However, both metadata stratification and latent space analysis operate on high-level semantic abstractions and fail to capture subtle signal-level shifts and irregularities that drive performance degradation.

% Our method

Therefore, in this work, we propose a \emph{data-centric} framework that prioritizes signal characteristics over semantic features or metadata labels.
Inspired by recent studies in the field of domain adaptation and generalization~\citep{wang2020high, xu2021fourier}, we hypothesize that some systematic model failures correlate with specific profiles in the frequency domain.
From the \emph{Fourier transform} of an image, we compute a compact representation that summarizes the distribution of signal energy across frequency bands.
In that representation space, we apply an unsupervised clustering method to partition validation data into spectrally coherent subgroups.
Our experiments demonstrate that these spectral clusters might act as predictors of model reliability.
We show that model failures are not uniformly distributed but are concentrated within specific spectral clusters, which effectively reveal ``blind spots'' in the model's generalization capability.
This approach allows for the identification of performance-degrading data slices rooted in image physics, enabling predictive reliability estimation without the need for ground-truth failure annotations.

% Related work
\section{Related work}

\subsection{Slice Discovery in medical image datasets}
% \subsection{Metadata-Driven Slice Discovery}
To identify systematic failures, the standard approach for medical applications consists in slicing validation data by categorical attributes~\citep{seyyed2020chexclusion, larrazabal2020gender, ahluwalia2023subgroup}.
Although essential for fairness auditing, this approach is entirely based on the availability of tabular data.
It remains blind to ``hidden stratification''~\citep{oakden2020hidden}, where performance degrades due to signal-level characteristics that are not recorded in clinical logs.

%\subsection{Out-of-Distribution (OOD) detection}
Beyond categorical attributes, Out-of-Distribution (OOD) detection uses feature space distances to spot anomalies~\citep{lee2018simple, liu2020energy} and is also widely explored in medical image applications~\citep{tardy2019uncertainty, berger2021confidence, roy2022does, araujo2023few}.
Although effective in identifying stark outliers, these methods struggle with subtle variations in image acquisition that degrade performance without triggering distance-based alarms~\citep{wiles2021fine}.
Also, OOD detection reflects the uncertainty about individual samples and is not meant to extract interpretable and coherent underperforming slices.

%\subsection{Unsupervised Slice Discovery in latent space}
To uncover coherent high-error subsets of data, recent unsupervised methods apply clustering algorithms in the latent embedding space of models~\citep{eyuboglu2022domino, d2022spotlight, olesen2024slicing}.
However, this approach faces a technical paradox: deep networks are explicitly optimized to be invariant to nuisance variables~\citep{achille2018emergence}.
Consequently, the latent space often suppresses the irregularities that cause model failure, rendering these failure modes invisible to latent space clustering strategies.

\subsection{Frequency domain analysis in deep learning}
In this work, we look for failure modes in the frequency domain of the images.
This removes the need for metadata supervision and the dependence on subjective model embeddings, so that the analysis focuses only on the intrinsic irregularities of the signal.
Important studies on image frequencies have been conducted in domain generalization (DG)~\citep{huang2021fsdr, zhao2022test} and domain adaptation (DA)~\citep{huang2021rda, yang2022source} to increase the robustness of deep learning models with respect to frequency perturbations.
These methods alter either the low frequencies~\citep{guo2018low} or the high frequencies~\citep{wang2020high} of the images to train models on adversarial examples in a data augmentation scheme.
Although not directly related to our approach, the success of these techniques brings evidence that some failure modes of deep learning models are explained by characteristics of the Fourier spectrum of input images.

Unlike DG and DA approaches that manipulate spectra to train robust models, we utilize spectral analysis as a \emph{post-hoc} diagnostic tool.
This allows us to cluster data based on image physics rather than semantics, exposing signal-driven failures that escape standard monitoring.

% Methods
\section{Methods}
\subsection{Radially Averaged Power Spectrum (RAPS)}
We analyze signal-level variations by projecting each image into the frequency domain.
For an image $x$, we compute its 2-D discrete Fourier transform $F = \mathcal{F}(x)$ and magnitude spectrum $S = \lvert F \rvert$.
To obtain a 1-D orientation-invariant descriptor $P$, we compute the Radially Averaged Power Spectrum (RAPS) of the image following the extraction protocol by \citet{torralba2003statistics}.
The computation of the RAPS is illustrated in \figureref{fig:raps}.

\noindent For a given discrete radius $r \in \mathbb{N}$, let $\Omega(r)$ be the set of frequency coordinates such that:

\begin{equation}
    \Omega(r) = \{(u, v) \in \mathbb{Z}^2 \mid \sqrt{u^2 + v^2} = r\}
\end{equation}

\noindent Then, for a given sampling of $N$ discrete radii $[r_1,\ldots,r_N]$, the $n$-th component $P(n)$, of the RAPS $P$, is computed as the average of the spectral magnitude $S$ over $\Omega(r_n)$:

\begin{equation}
    P(n) = \frac{1}{\lvert \Omega(r_n) \rvert} \sum_{(u, v)\in\Omega(r_n)}S(u, v)
\end{equation}

\noindent The RAPS captures the distribution of signal energy across frequency bands under the assumption that acquisition noise depends primarily on frequency magnitude rather than direction, consistent with isotropic noise models in radiographic imaging.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:raps}
  {\caption{Computation of the RAPS of an image. We highlight the contribution of three frequency bands (red, green, blue) in the spectrum to the RAPS representation (dashed lines with corresponding colors).}}
  {\includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/methods/RAPS.png}}
\end{figure}

To ensure comparability across images, all Fourier spectra must lie on the same frequency grid.
We therefore restrict the analysis to images of similar native resolution and enforce identical spatial dimensions using minimal spatial cropping.
This avoids spectral distortions: resizing modifies high-frequency content through interpolation, while padding artificially increases low-frequency energy.
A minimal uniform crop preserves the relevant spectral structure and maintains the physical interpretation of Fourier frequencies.

\subsection{Similarity metric}
For the subsequent clustering phase, we advocate the use of the correlation distance as a similarity metric between RAPSs.
Given the RAPSs of two different unseen images, $P_i$ and $P_j$, their correlation distance is given by:

\begin{equation}
    d_{\text{corr}}(i,j)
    = 1 - \mathrm{corr}(P_i, P_j)
\end{equation}

\noindent where the correlation between $P_i$ and $P_j$ is computed across the $N$ sampled radii as:

\begin{equation}
    \mathrm{corr}(P_i, P_j)
    = \frac{\sum_{n=1}^N \big(P_i(n) - \bar P_i\big)\big(P_j(n) - \bar P_j\big)}
    {\sqrt{\sum_{n=1}^N \big(P_i(n) - \bar P_i\big)^2}
           \sqrt{\sum_{n=1}^N \big(P_j(n) - \bar P_j\big)^2}}
\end{equation}

\noindent Unlike the Euclidean distance, which is sensitive to absolute magnitude, the correlation distance is invariant to additive and multiplicative shifts in signal intensity.
This ensures that vectors are grouped based on their intrinsic spectral morphology rather than physically irrelevant variations in global energy or sensor gain.
Consequently, the clustering process effectively isolates stable structural features, such as relative sharpness and characteristic spectral slopes, while remaining robust to acquisition-level scaling and offsets.

% \subsection{Clustering RAPSs}\label{Clustering}

% \medskip\noindent\textbf{K-means clustering}\quad
\subsection{K-means clustering}\label{K-means}
In a first exploratory phase, we use the standard K-means clustering to validate the relevance of the space of RAPSs for performance analysis. As it is not specifically optimized for slice discovery, it allows unbiased exploration of the relationship between the frequency domain and model failures. The possibility of varying the number of clusters is also convenient to confirm an observed tendency across partitions of different granularities.

% \medskip\noindent\textbf{Hierarchical Agglomerative Clustering (HAC)}\quad
\subsection{Hierarchical Agglomerative Clustering (HAC)}\label{HAC}
To optimize the slice discovery process, we resort to a hierarchical clustering technique.
In a bottom-up fashion, each radial profile starts as an individual cluster and we iteratively merge the closest pairs of clusters according to the average linkage rule.
To partition the resulting dendrogram, we set the $0.5$-quantile of the distribution of pairwise distances as a threshold to stop the merging procedure.
Adopting this strategy yields three key advantages.
First, the partition scale naturally adapts to the inherent variability of the spectral representations.
Second, it circumvents the limitation of pre-specifying an arbitrary cluster count.
Finally, this flexibility effectively optimizes the isolation and discovery of specific underperforming slices.

\section{Experimental setup}
\subsection{Datasets and preprocessing}

We conducted all experiments using four publicly available chest X-ray datasets commonly employed for disease classification.

\noindent\textbf{CheXpert}~\citep{irvin2019chexpert}. A large-scale dataset from Stanford containing frontal and lateral views, labeled for 14 thoracic findings using report-derived NLP.

\noindent\textbf{MIMIC-CXR}~\citep{johnson2019mimic}. A de-identified dataset from Beth Israel Deaconess Medical Center with paired radiology reports and substantial acquisition and population diversity.

\noindent\textbf{PadChest}~\citep{bustos2020padchest}. A Spanish dataset with multi-view radiographs and detailed labels covering findings, diagnoses, and anatomical locations.

\noindent\textbf{NIH ChestX-ray14}~\citep{wang2017chestx}. A frontal-view dataset labeled for 14 conditions via report mining, notable for heterogeneous acquisition and label noise, making it a common robustness benchmark.

To ensure that Fourier transforms and spectral profiles were comparable across datasets, we standardized image resolution with minimal geometric distortion. We selected $264 \times 224$ as the target size, as it was the most common resolution across all sources. Images were first resized while preserving aspect ratio and then center-cropped to the exact target dimensions. Samples with incompatible shapes (e.g., near-square images) were discarded to avoid excessive rescaling.

We focus on three clinically common findings: Atelectasis, Consolidation, and Pleural Effusion. We trained our models in a mono-pathology setting. For each pathology, we constructed separate training and validation splits. The training sets are exclusively based on CheXpert images. The test sets are built from each dataset, so MIMIC-CXR, PadChest, and NIH are unseen-domain in our evaluation. This enables the analysis of cross-dataset generalization under distributional shift.


\subsection{Models, training and performance assessment}
For all experiments, we trained a DenseNet-121 classifier using binary cross-entropy loss and the Adam optimizer (learning rate $1\times 10^{-4}$, weight decay $1\times 10^{-3}$).
A Reduce-on-Plateau scheduler (minimum learning rate $1\times 10^{-5}$) and a batch size of 16 were used.
Models were trained for up to 30 epochs with early stopping based on validation performance.
All training was performed on an NVIDIA GeForce RTX 3080 GPU.

We report the Area Under the Receiver Operating Characteristic Curve (AUROC) to evaluate the discriminative performance of the models.
This metric was chosen for its invariance to decision thresholds and its robustness to the fluctuating prevalence in unsupervised clustering assignments.


\section{Experiments and results}
% Relevance of spectral slices
% \subsection{K-Means}
\subsection{Relevance of spectral slices}

Given a K-means partition (Section~\ref{K-means}), we use the standard deviation of the AUROC across clusters to measure how effectively the partition stratifies performance.
To isolate the effect of spectral coherence from artifacts of cluster size or class distribution, we implemented two stochastic baselines: a \emph{fully random} assignment and a \emph{pseudo-random} control that preserves the specific cluster size and label statistics of the K-means solution.

As shown in \figureref{fig:kmeans2_corr}, the AUROC variation of the correlation-based K-means clustering remains consistently above those of both random baselines.
The \emph{fully random} and \emph{pseudo-random} partitions define the level of variability expected when no meaningful structure is present.
The persistent separation between these curves and the clustering curve therefore indicates that the partitions induced by the correlation distance capture genuine heterogeneity in the model behavior.

Because the correlation distance emphasizes shape differences in RAPSs, this gap suggests that differences in spectral composition, such as relative distribution of low versus high frequencies, decay rates, or local spectral slopes, are systematically associated with differences in model performance.
The fact that this phenomenon appears for all pathologies and all values of $K$ supports the hypothesis that the spectral representation explains at least part of the observed variability in the reliability of the model.


\begin{figure}[htbp!]
\floatconts
  {fig:kmeans2_corr}
  {\caption{AUROC variability across different numbers of clusters ($K$) obtained with K-means clustering using the correlation distance, compared with fully random and pseudo-random partitions. Results are averaged over 50 seeds.}}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_atelectasis_landscape_fromImagenet.png} &
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_consolidation_landscape_fromImagenet.png} &
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_pleuralEffusion_landscape_fromImagenet.png} \\
      (a) Atelectasis & (b) Consolidation & (c) Pleural Effusion
    \end{tabular}
  }
\end{figure}




\begin{figure}[htbp]
\floatconts
  {fig:ablation_atelectasis}
  {\caption{Ablation study evaluating the influence of the distance metric used in k-means clustering on AUROC variation for the Atelectasis pathology. We compare three metrics: Euclidean ($L_2$) distance, Spearman distance, and the correlation distance employed in our proposed method.}}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/l2/kmeans_plot_l2__ONLYCHEXPERT_all_seed_std_stats_n50_filtred_train_atelectasis_landscape_fromImagenet.png} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/spearman/kmeans_plot_spearman__ONLYCHEXPERT_SPEARMAN_all_seed_std_stats_n50_filtred_train_atelectasis_landscape_fromImagenet.png} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_atelectasis_landscape_fromImagenet.png} \\
      
      (a) $L_2$ & (b) Spearman & (c) Correlation
    \end{tabular}
  }
\end{figure}




% Detection of systematic underperforming slices
\subsection{Detection of systematic underperforming slices}

To confirm the robustness of our data-centric framework, we apply HAC in the space of RAPSs to isolate under-performing data slices on unseen domain data.
After clustering, following the method from Section \ref{HAC}, we aggregate clusters with fewer than $10$ samples into a single outlier group; this ensures sufficient sample sizes for the primary partitions while isolating spectrally atypical profiles.
We then assess the AUROC on each cluster and the outlier group independently, allowing us to pinpoint specific spectral regions where model performance deviates from the norm.

As depicted in \figureref{fig:outliers_2}, the resulting AUROC across HAC-derived clusters exhibits distinct and variable performance levels for every pathology and external validation center.
This finding mirrors the performance stratification observed with the K-means clustering approach.
This convergence confirms that the spectral differences encoded through the correlation distance reliably capture meaningful variations in downstream model performance, regardless of the underlying clustering algorithm employed.

A particularly characteristic and instructive pattern emerged from the analysis of the outlier super-group, which was formally defined by merging all HAC clusters containing fewer than ten samples. The AUROC for this super-group consistently exhibited a marked separation from the performance mean of the main clusters. Specifically, the performance was frequently significantly lower (as observed, for example, in Atelectasis and Pleural Effusion on the MIMIC-CXR and NIH datasets, and in Consolidation on PadChest and MIMIC-CXR), indicating substantial failure concentration. Conversely, in select cases, the AUROC was noticeably higher (e.g., Atelectasis and Pleural Effusion on PadChest), but critically, it was never comparable to the average cluster performance.

This consistent performance separation provides strong quantitative evidence: slices possessing atypical spectral signatures reliably translate into performance-level outliers within the model. This experiment thus highlights the methodological utility of applying HAC to the RAPSs as a robust mechanism to automatically isolate under-performing data slices by specifically targeting and analyzing the low-AUROC outlier super-group.



\begin{figure}[htbp]
\floatconts
  {fig:outliers_2}
  {\caption{Comparative analysis of model performance (AUROC) across data clusters (blue) defined by HAC. The figure highlights the distinct model reliability on the identified clusters versus the significantly different performance (red) observed on the HAC-isolated outlier set. Results on unseen domain test datasets.}}
  {%
    \centering
    \begin{tabular}{c}
      \includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/exp2/outliers/sep/hierarchical_plot_outliers__auroc__filtred_train_atelectasis_landscape_fromImagenet_v2.png} \\
      (a) Atelectasis \\
      \includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/exp2/outliers/sep/hierarchical_plot_outliers__auroc__filtred_train_consolidation_landscape_fromImagenet_v2.png} \\
      (b) Consolidation \\
      \includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/exp2/outliers/sep/hierarchical_plot_outliers__auroc__filtred_train_pleuralEffusion_landscape_fromImagenet_v2.png} \\
      (c) Pleural Effusion \\
    \end{tabular}
  }
\end{figure}


\begin{figure}[htbp]
\floatconts
  {fig:outliers_chexpert}
  {\caption{Comparative analysis of model performance (AUROC) across data clusters (blue) defined by HAC. The figure highlights the distinct model reliability on the identified clusters versus the significantly different performance (red) observed on the HAC-isolated outlier set. Results on seen domain test datasets: CheXpert.}}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/outliers/sep/hierarchical_plot_outliers__auroc__filtred_train_atelectasis_landscape_fromImagenet_vchexpert.png} &
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/outliers/sep/hierarchical_plot_outliers__auroc__filtred_train_consolidation_landscape_fromImagenet_vchexpert.png} &
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/outliers/sep/hierarchical_plot_outliers__auroc__filtred_train_pleuralEffusion_landscape_fromImagenet_vchexpert.png} \\
      
      (a) Atelectasis & (b) Consolidation & (c) Pleural Effusion
    \end{tabular}
  }
\end{figure}


% Template

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank a bunch of people.}


\bibliography{midl-samplebibliography}


\appendix

\section{kmeans}
\begin{figure}[htbp]
\floatconts
  {fig:kmeans1_corr}
  {\caption{TO DO: kmeans with correlation distance exp 1}}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp1/kmeans/corr/kmeans_plot_correlation_auroc__filtred_train_atelectasis_002.png} &
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp1/kmeans/corr/kmeans_plot_correlation_auroc__filtred_train_consolidati_002.png} &
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp1/kmeans/corr/kmeans_plot_correlation_auroc__filtred_train_pleuralEffu_002.png} \\
      (a) Atelectasis & (b) Consolidation & (c) Pleural Effusion
    \end{tabular}
  }
\end{figure}


\begin{figure}[htbp]
\floatconts
  {fig:ablation_consolidation}
  {\caption{TO DO: Ablation of kmeans distance on Consolidation }}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/l2/kmeans_plot_l2__ONLYCHEXPERT_all_seed_std_stats_n50_filtred_train_consolidation_landscape_fromImagenet.png} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/spearman/kmeans_plot_spearman__ONLYCHEXPERT_SPEARMAN_all_seed_std_stats_n50_filtred_train_consolidation_landscape_fromImagenet.png} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_consolidation_landscape_fromImagenet.png} \\
      
      (a) L2 & (b) Spearman & (c) Correlation
    \end{tabular}
  }
\end{figure}


\begin{figure}[htbp]
\floatconts
  {fig:ablation_pleural_effusion}
  {\caption{TO DO: Ablation of kmeans distance on pleural effusion }}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/l2/kmeans_plot_l2__ONLYCHEXPERT_all_seed_std_stats_n50_filtred_train_pleuralEffusion_landscape_fromImagenet.png} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/spearman/kmeans_plot_spearman__ONLYCHEXPERT_SPEARMAN_all_seed_std_stats_n50_filtred_train_pleuralEffusion_landscape_fromImagenet.png} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_pleuralEffusion_landscape_fromImagenet.png} \\
      
      (a) L2 & (b) Spearman & (c) Correlation
    \end{tabular}
  }
\end{figure}


\begin{figure}[htbp]
\floatconts
  {fig:ablation_pl}
  {\caption{TO DO: Ablation of kmeans distance on pleural effusion }}
  {%
    \centering
    \begin{tabular}{ccc}
      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/l2/kmeans_plot_l2__ONLYCHEXPERT_all_seed_std_stats_n50_filtred_train_pleuralEffusion_landscape_fromImagenet.png} &

      % TODO: replace this mwe placeholder with the missing panel (b); an empty file name does not compile.
      \includegraphics[width=0.3\linewidth]{example-image} &

      \includegraphics[width=0.3\linewidth]{MIDLLatexTemplate/images/exp2/kmeans/corr/kmeans_plot_correlation__ONLYCHEXPERT_CORR_all_seed_std_stats_n50_filtred_train_pleuralEffusion_landscape_fromImagenet.png} \\
      
      (a) L2 & (b) ??? & (c) Correlation
    \end{tabular}
  }
\end{figure}


\begin{figure}[htbp]
\floatconts
  {fig:outliers_1}
  {\caption{TO DO: Outliers exp1}}
  {%
    \centering
    \begin{tabular}{c}
      \includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/exp1/outliers/hierarchical_plot_outliers__auroc__filtred_train_atelect_002.png} \\
      (a) Atelectasis \\
      \includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/exp1/outliers/hierarchical_plot_outliers__auroc__filtred_train_consoli_002.png} \\
      (b) Consolidation \\
      \includegraphics[width=1.\linewidth]{MIDLLatexTemplate/images/exp1/outliers/hierarchical_plot_outliers__auroc__filtred_train_pleural_002.png} \\
      (c) Pleural Effusion \\
    \end{tabular}
  }
\end{figure}





\end{document}
