\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% Our packages
\usepackage{bm} % to bold symbols
\usepackage{booktabs} % table optimizations
\usepackage{multirow} % table design
\usepackage{adjustbox} % table layout
\usepackage{listings} % pseudocode
\lstdefinestyle{mypython}{
    language=Python,
    basicstyle=\ttfamily\footnotesize,
    keywordstyle=\color{blue},
    commentstyle=\color{gray},
    stringstyle=\color{red},
    numbers=left,
    numberstyle=\tiny,
    stepnumber=1,
    breaklines=true,
    frame=single
}

\usepackage{mwe} % to get dummy images
\usepackage{comment}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2025}

\title[Conditional Diffusion Models as Medical Image Classifiers]{Conditional Diffusion Models are Medical Image Classifiers that Provide Explainability and Uncertainty for Free}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Gian Mario Favero\midljointauthortext{Contributed equally}} \Email{gian.favero@mail.mcgill.ca} \\
\Name{Parham Saremi\midlotherjointauthor} \Email{parham.saremi@mail.mcgill.ca} \\
\Name{Emily Kaczmarek}\Email{emily.kaczmarek@mail.mcgill.ca} \\
\Name{Brennan Nichyporuk} \Email{nichypob@mila.quebec} \\
\Name{Tal Arbel} \Email{tal.arbel@mcgill.ca} \\
\addr Center for Intelligent Machines, McGill University, Montreal, Canada. \\ Mila - Quebec AI Institute, Montreal, Canada.
}

\begin{comment}
\midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
\addr $^{1}$ Address 1 \\
\addr $^{2}$ Address 2 \AND
\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
\addr $^{3}$ Address 3 \AND
\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
\addr $^{4}$ Address 4
}
\end{comment}

\begin{document}

\maketitle

\begin{abstract}
Discriminative classifiers have become a foundational tool in deep learning for medical imaging, excelling at learning separable features of complex data distributions. However, these models often need careful design, augmentation, and training techniques to ensure safe and reliable deployment. Recently, diffusion models have become synonymous with generative modeling in 2D. These models showcase robustness across a range of tasks including natural image classification, where classification is performed by comparing reconstruction errors across images generated for each possible conditioning input. This work presents the first exploration of the potential of class conditional diffusion models for 2D medical image classification. First, we develop a novel majority voting scheme shown to improve the performance of medical diffusion classifiers. Next, extensive experiments on the CheXpert and ISIC Melanoma skin cancer datasets demonstrate that foundation and trained-from-scratch diffusion models achieve competitive performance against SOTA discriminative classifiers without the need for explicit supervision. In addition, we show that diffusion classifiers are intrinsically explainable, and can be used to quantify the uncertainty of their predictions, increasing their trustworthiness and reliability in safety-critical, clinical contexts. Further information is available on our project page: \url{https://faverogian.github.io/med-diffusion-classifier.github.io/}.
\end{abstract}

\begin{keywords}
diffusion, classification, explainability, uncertainty
\end{keywords}

\section{Introduction}

Deep learning applications in medicine have received significant attention in recent years due to their potential to revolutionize healthcare outcomes. For instance, the ability to accurately classify disease pathology from medical images using discriminative classifiers (e.g., ResNet~\cite{he2015deep}, ViT~\cite{dosovitskiy2020image}) is central to advancing early diagnosis, personalized treatment, and overall patient care. 
In the ideal scenario, discriminative classifiers are robust and generalizable; however, state-of-the-art performance often relies heavily on data augmentation and hyperparameter tuning, which can be time- and computation-expensive, and may still be prone to overfitting and/or learning shortcuts~\cite{Geirhos:2020:ShortcutLearning}. Even with strong classification performance, models must be explainable and provide uncertainty estimates to ensure reliable and trustworthy predictions for safe clinical deployment. Current explainability and uncertainty methods depend largely on post-hoc analysis or model modifications. For example, explainability often relies on gradient-based analysis after training~\cite{gradcam} or counterfactual generation with a separate model~\cite{sun2023inherentlyinterpretablemultilabelclassification}, whereas uncertainty methods range from simple model modifications, like Monte Carlo (MC) dropout, to expensive ensembling methods. Thus, there remains limitations to the safe use of discriminative classifiers in medical imaging, particularly due to the lack of built-in explainability and uncertainty analysis.

Diffusion models~\cite{ho2020denoisingdiffusionprobabilisticmodels} make up one class of generative models that has shown remarkable flexibility and robustness across various deep learning tasks, achieving state-of-the-art performance in image~\cite{dhariwal2021diffusionmodelsbeatgans}, video~\cite{ho2022videodiffusionmodels}, and audio~\cite{kong2020diffwave} generation tasks. Recently, generative models have been used directly for image classification~\cite{Li:arXiv:2023:diffusionClassifier, clark2023texttoimagediffusionmodelszeroshot, Krojer:arXiv:2023:DiffusionReasoners, chen2024robustclassificationsinglediffusion} in natural imaging, showing that large pre-trained models like Stable Diffusion~\cite{rombach2022highresolutionimagesynthesislatent} can be used as classifiers that are competitive with state-of-the-art supervised discriminative classifiers~\cite{he2015deep, dosovitskiy2020image}. Diffusion models are increasingly being used in the medical domain for data augmentation~\cite{guo2024maisimedicalaisynthetic}, segmentation~\cite{wu2023medsegdiffmedicalimagesegmentation, wu2023medsegdiffv2diffusionbasedmedical, amit2023annotatorconsensuspredictionmedical}, anomaly detection~\cite{wolleb2022diffusionmodelsmedicalanomaly}, counterfactual explanation~\cite{sanchez2022healthygenerativecounterfactualdiffusion, bedel2023dreamrdiffusiondrivencounterfactualexplanation, weng2024fastdiffusionbasedcounterfactualsshortcut, pegios2024diffusionbasediterativecounterfactualexplanations}, and probabilistic classification~\cite{shen2024improvingrobustnessreliabilitymedical}. However, despite many conditional diffusion models developed for medical image analysis, they have yet to be explored as classifiers that can provide explainability and uncertainty-estimation for free.

In this work, we present a comprehensive evaluation of how conditional diffusion models can be re-purposed and leveraged for image classification, explainability, and uncertainty estimation in the medical domain. First, we propose a novel majority voting-based method that improves the performance of diffusion classifiers in medical imaging. We then demonstrate that classifiers derived from foundation and trained-from-scratch diffusion models perform competitively with state-of-the-art medical image discriminative classifiers through extensive experiments on the publicly available CheXpert~\cite{irvin2019chexpert} and ISIC Melanoma skin cancer~\cite{Rotemberg2020APD} datasets, despite not being trained for classification. Next, we show that diffusion classifiers offer explainability (via counterfactual generation) and uncertainty quantification (via entropy) out-of-the-box. We validate the uncertainty by showing that when the model is confident, it is correct, and vice versa. This is shown as model accuracy drastically improves as its uncertainty threshold increases. For example, Stable Diffusion reaches classification accuracies of 100\% and 95\% on ISIC and CheXpert, respectively, with only 45\% of its most uncertain samples filtered out.

\section{Methodology}

In this work, we present diffusion classifiers for medical imaging classification tasks. We first present an overview of diffusion models in Section \ref{diffusion}. Next, in Section \ref{diffusionclassifiers}, we define conditional diffusion models and demonstrate how they can perform classification. Section \ref{extensions} introduces all extensions to the diffusion classifier, including: our novel algorithm for improving classification performance through majority voting, as well as the ability to perform counterfactual explainability and uncertainty quantification without any modifications.

\subsection{Diffusion Models}\label{diffusion}

Diffusion models (DM) are likelihood-based models that learn to approximate a data distribution through a process of iterative noising and denoising involving two key phases: a fixed forward process and a learned backward process. In the forward process, Gaussian noise $\bm{\epsilon} \sim \mathcal{N}(0, \bm{\text{I}})$ is gradually added to data in a controlled manner, destroying its structure until it is pure Gaussian noise. 

This process, which is done on a sample over time, can be expressed by its marginal for all $t$ on a continuous interval, $[0,1]$:
\begin{equation}
    q(\bm{z}_{t}|\bm{x}) = \alpha_\lambda \bm{x} + \sigma_\lambda \bm{\epsilon} \text{ where } \bm{\epsilon} \sim \mathcal{N}(0, \bm{\text{I}}).\label{reparameterized_marginal}
\end{equation}
Following the variational diffusion model formulation \cite{kingma2023variationaldiffusionmodels}, the forward process is defined to be variance-preserving, imposing the constraint $\alpha_\lambda^2 = \text{sigmoid}(\lambda)$, $\sigma_\lambda^2=\text{sigmoid}(-\lambda)$, where $\lambda$ is the log-SNR given by $\lambda= f_\lambda(t)$ and $f_\lambda(t)$ is the noise schedule (see Appendix~\ref{ref:var-diffusion}). The noise schedule is a monotonically decreasing and invertible function that connects the time variable, $t$, with the log-SNR, $\lambda$. During training, $t$ is sampled from a continuous, uniform distribution, $\mathcal{U}(0,1)$, which is then used to compute $\lambda$. The resulting distribution over noise levels can be defined as $p(\lambda)=-1/f'_\lambda(t)$~\cite{kingma2023understandingdiffusionobjectiveselbo}.

In the backward process, a neural network attempts to learn how to remove the added noise and recover an approximate sample from the original data distribution. Kingma et al. show that the variational lower bound objective (VLB) function for training diffusion models can be derived in continuous time with respect to its log-SNR, $\lambda$, noise sampling distribution, $p(\lambda)$ and weighting function, $w(\lambda)$~\cite{kingma2023variationaldiffusionmodels}. This VLB is:
\begin{equation}
    \log p(x) = \mathcal{L}_x + \mathcal{L}_T - \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\text{I}), \lambda \sim p(\lambda)} \left[ \frac{w(\lambda)}{p(\lambda)} ||\bm{x} - \hat{\bm{x}}_\theta(\bm{z}_\lambda; \lambda)||_2^2 \right]. \label{eq:ELBO}
\end{equation}
Where $\mathcal{L}_x=-\log p(\bm{x}|\bm{z}_0) \approx 0$ for discrete $\bm{x}$ and $\mathcal{L}_T=D_{KL}(q(\bm{z}_T|x)||p(\bm{z}_T)) \approx 0$ for a well-defined forward process. We use a min-SNR weighting function~\cite{hang2024efficientdiffusiontrainingminsnr}, a shifted-cosine noise schedule~\cite{Hoogeboom:arXiv:2023:simpleDiffusion}, and v-prediction parameterization for greater stability during training and sampling~\cite{salimans2022progressivedistillationfastsampling}.

\subsection{Conditional Diffusion Models as Classifiers} \label{diffusionclassifiers}

Conditional diffusion models incorporate text or categorical inputs, such that the prediction becomes $\hat{\bm{x}}_\theta(\bm{z}_\lambda, c)$ where $c$ is a conditioning embedding. In this paper, we implement conditioning through cross-attention in a UNet-based diffusion model ~\cite{rombach2022highresolutionimagesynthesislatent}, and adaptive layer normalization in DiTs~\cite{peebles2023scalablediffusionmodelstransformers}. 

Recent works~\cite{Li:arXiv:2023:diffusionClassifier, clark2023texttoimagediffusionmodelszeroshot, Krojer:arXiv:2023:DiffusionReasoners, chen2024robustclassificationsinglediffusion} have explored using conditional diffusion models as discriminative classifiers. As shown in Figure~\ref{fig:architecture}, classification is performed by comparing reconstruction errors across images generated for each possible conditioning input. Specifically, using the labels, $\bm{C}=\{c_i\}$, and Bayes' theorem on model predictions, $p(\bm{x}|c_i)$, we can derive $p(c_i|\bm{x})$:
\begin{equation}
    p(c_i|\bm{x}) \approx \frac{\text{exp} \{\mathbb{E}_{\epsilon \sim \mathcal{N}(0,\text{I}), \lambda \sim p(\lambda)} \left[ ||\bm{x} - \hat{\bm{x}}_\theta(\bm{z}_\lambda, c_i)||_2^2 \right]\}}{\exp \{\sum_j\mathbb{E}_{\epsilon \sim \mathcal{N}(0,\text{I}), \lambda \sim p(\lambda)} \left[ ||\bm{x} - \hat{\bm{x}}_\theta(\bm{z}_\lambda, c_j)||_2^2 \right]\}}. \label{eq:softmaxDiffClass}
\end{equation}
A more complete derivation is found in Appendix~\ref{ref:bayes-derivation}. A Monte Carlo estimation of the expectation for an arbitrary class, $c_j$, can be computed by sampling $N$ noise level pairs, $(\bm{\epsilon}, \lambda)$ and averaging the reconstruction error:
\begin{equation}
    \frac{1}{N}\sum^N_{k=1} \left[ ||\bm{x} - \hat{\bm{x}}_\theta(\alpha_{\lambda_k} \bm{x} + \sigma_{\lambda_k} \bm{\epsilon}_k, c_j)||_2^2 \right].\label{eq:diffClass}
\end{equation}
For each $(\bm{\epsilon}, \lambda)$ pair, $\bm{\epsilon}$ is sampled from an isotropic Gaussian distribution and $\lambda$ is sampled from $p(\lambda)$ (practically speaking, $t \sim \mathcal{U}(0,1)$, then $\lambda = f_\lambda(t)$). Eq.~\eqref{eq:diffClass} shows that classifying one sample requires $N$ many steps per condition, where the Monte Carlo estimate becomes more accurate as the number of steps increases. To reduce the variance of prediction for a given image, $\bm{x}$, an identical set of $(\bm{\epsilon}_k, \lambda_k) \in S\{(\bm{\epsilon}_k, \lambda_k)\}_{k=1}^N$ is used for every condition, which increases the accuracy of the prediction $p(\bm{C}|\bm{x})$. In practice, Eq. \eqref{eq:softmaxDiffClass} is equivalent to choosing the class with the minimum average reconstruction error.

\begin{figure}[htbp]
    \centering
    \includegraphics[width=\textwidth]{images/architecture.pdf}
    \caption{\textbf{Diffusion classifiers} are conditional diffusion models repurposed as classifiers. First, a sample, $\bm{x}$, is noised at a randomly chosen noise level, $(\bm{\epsilon}_k, \lambda_k)$. The noised sample is then denoised by the diffusion network with each possible conditioning input, $c_j$. The conditioning variable, $c_j$, that results in the denoised output, $\hat{\bm{x}}_\theta (\bm{z}_\lambda, c_j)$, with the smallest reconstruction error over many noise levels is selected as the class. This process is repeated for a set of $N$ noise levels $(\bm{\epsilon}, \lambda)$ with the reconstruction errors aggregated (e.g., average/majority voting) for a more accurate prediction.}
    \label{fig:architecture}
\end{figure}

\subsection{Extensions on Diffusion Classifiers} 
\label{extensions}

\paragraph{Majority Voting:} \label{majority-voting} In this work, we introduce a novel  majority-vote-based algorithm for determining the predicted class. Here, we tally votes across all $(\bm{\epsilon}, \lambda)$ pairs by identifying the class with the lowest error as the prediction for that pairing, and take the final predicted class as the one with the majority of individual votes (see Appendix~\ref{ref: majority-vote} for the algorithm's pseudocode). We posit that averaging reconstruction error over all noise levels inherently weights higher values of $\lambda$ more, which is not always beneficial (i.e., reconstructions at higher values of $\lambda$ are naturally much harder and thus have greater error, introducing more noise into the average reconstruction error).

\paragraph{Intrinsic Explainability:} \label{explainability}
Diffusion classifiers use Classifier-Free Guidance (CFG)~\cite{ho2022classifierfreediffusionguidance} to understand which features are most influential in generating certain classes. CFG is a common approach in which a conditional diffusion model is simultaneously trained for an unconditional task by randomly dropping out $c$ ($\sim$10\% of the time). In doing so, sampling can be guided towards an intended class with a guidance-scale, $w$:
\begin{equation}
    \tilde{\bm{x}}_\theta(\bm{z}_\lambda, c) = (1+w)\hat{\bm{x}}_\theta(\bm{z}_\lambda, c) - w\hat{\bm{x}}_\theta(\bm{z}_\lambda, \varnothing).
\end{equation}
At inference, the model permits explainability for free, through the conditional generation of the factual and counterfactual images of the input image. First, noise is added to obscure the input images, while preserving enough image structure to make reconstruction possible. Then, by varying the condition at inference, the model can shift its generation process to any possible conditional class. These generated images represent the reconstruction of the image provided by the true class, and any counterfactual image(s), where difference maps can be created to highlight class-specific regions modified by the network.

\paragraph{Uncertainty Quantification:} \label{uncertainty}
Diffusion classifiers are also able to produce uncertainty estimates without any additional modifications to the model. The set of $N$ $(\bm{\epsilon}, \lambda)$ pairs required for accurate classification results (Eq.~\eqref{eq:diffClass}) results in numerous predictions generated for each sample and thus inherently resembles the  uncertainty estimation strategy via MC dropout or ensemble methods. As explained in Section \ref{diffusionclassifiers}, to achieve accurate classification, a single sample requires $N$ steps (repeated per condition) where reconstruction errors for that sample are calculated at different $(\bm{\epsilon}, \lambda)$ noise levels  (Eq.~\eqref{eq:diffClass}). To quantify the uncertainty of the overall predicted class, we construct a Bernoulli distribution from each of the $N$ predictions. This creates a probability density from which uncertainty can be computed.

\section{Experiments}


We first evaluate the performance of the average reconstruction error objective (Section \ref{diffusionclassifiers}) against a majority voting alternative, demonstrating that the latter yields superior results in our tasks. We then compare the classification performance of diffusion classifiers with state-of-the-art discriminative baselines, and furthermore, show that conditional diffusion models are interpretable out-of-the-box, and capable of producing uncertainty quantifications.

\subsection{Datasets}

\begin{table}[htbp]
    \centering
    \renewcommand{\arraystretch}{1.26} % Increase row spacing
    \resizebox{0.7\linewidth}{!}{
        \begin{tabular}{lccccc}
            \toprule
            & \textbf{Method} & \multicolumn{2}{c}{\textbf{CheXpert}} & \multicolumn{2}{c}{\textbf{ISIC}} \\
            \cmidrule(lr){3-4} \cmidrule(lr){5-6}
            &                      & \textbf{Majority} & \textbf{Average} & \textbf{Majority} & \textbf{Average} \\
            \midrule 
            & DiT-B/4              & 86.8 $\pm$ 0.38 & 85.3 $\pm$ 0.40 & 90.5 $\pm$ 0.08 & 90.4 $\pm$ 0.29 \\
            & UNet                 & 84.6 $\pm$ 0.21 & 83.4 $\pm$ 0.14 & 91.2 $\pm$ 0.08 & 84.3 $\pm$ 0.75 \\
            & Stable Diffusion     & 84.7 $\pm$ 0.34 & 79.9 $\pm$ 0.29 & 94.7 $\pm$ 0.14 & 94.5 $\pm$ 0.33 \\
            \bottomrule
        \end{tabular}
    }
    \caption{\textbf{Majority voting outperforms averaging} in achieving highest classification accuracy on the CheXpert and ISIC Melanoma test sets (using 501 steps). Model variance is calculated at inference based on five random seeds.}
    \label{fig:majority-vs-average}
\end{table}

\paragraph{ISIC Melanoma:} A publicly available dataset~\cite{Rotemberg2020APD} containing over 35,000 images of skin lesions and corresponding labels for the presence of melanoma. We balance the dataset by class for our experiments, resulting in 10,212 total images. The data are randomly split into an 80/10/10 train/validation/test set.

\paragraph{CheXpert:} A publicly available dataset~\cite{irvin2019chexpert} containing over 200,000 chest X-ray images with binary labels for 14 diseases and the presence of support devices. For our experiments, we use ``Pleural Effusion'' and ``No Findings'' as mutually exclusive labels and filter for frontal views of the chest, resulting in a balanced dataset of 20,404 samples. The data are randomly split into an 80/10/10 train/validation/test set.

\subsection{Model Architectures}

\paragraph{Baselines:} To establish a comparative baseline, we evaluate the performance of both convolutional and transformer-based architectures. We use \verb|torchvision| implementations of ResNet-18 and ResNet-50~\cite{he2015deep}, and \verb|timm| implementations of ViT-S/16 and ViT-B/16~\cite{dosovitskiy2020image}, EfficientNet-B0 and EfficientNet-B4~\cite{tan2019efficientnet}, and Swin-B Transformer~\cite{liu2021swintransformer}.

\paragraph{Conditional Diffusion Models:} We implement a UNet backbone based on the ADM architecture~\cite{dhariwal2021diffusionmodelsbeatgans} at $256^2$ resolution, incorporating improvements from simple diffusion~\cite{Hoogeboom:arXiv:2023:simpleDiffusion}, such as scaling the number of ResBlocks at lower resolutions to save memory at higher resolutions. For transformer-based diffusion models, we include the DiT-B/4 variant from~\cite{peebles2023scalablediffusionmodelstransformers}. Unless otherwise noted, all images are compressed with a single-stage discrete wavelet transform (DWT) using a Haar wavelet to improve computational efficiency.

\paragraph{Foundation Models:} Ideally, foundation models like Stable Diffusion can be repurposed as zero-shot classifiers. However, we find that such models are not trained on enough medical data to perform adequately by default. Thus, to ensure a fair comparison, we fine-tune Stable Diffusion v2-base~\cite{rombach2022highresolutionimagesynthesislatent} on an amalgamation of our CheXpert and ISIC Melanoma training splits. Given that the model is designed for text-to-image generation, we replace labels in the datasets with text prompts, e.g., ``a benign skin lesion'', or, ``a frontal chest xray of a sick patient with pleural effusion''. More details on all architectures can be found in Appendices \ref{ref: experimental-details} and \ref{ref: sd-fine-tuning}.

\section{Results}

\subsection{Ablating on the Classification Algorithm}

We propose a simple but effective majority voting scheme that, instead of accumulating errors at each timestep, tallies the amount of times a reconstruction error was smaller for each test condition and then chooses the class with the most votes. Table~\ref{fig:majority-vs-average} shows that the highest classification accuracy is consistently achieved with majority voting. This result is intuitive: at greater values of $N$ there are more reconstructions attempted from high noise disturbance which can introduce large sources of variance in the average error. Figure~\ref{fig:steps-ablation} shows an ablation study of accuracy against classification steps on the CheXpert and ISIC validation sets when a majority vote algorithm is used. In general, more classification steps lead to better performance, though with diminishing returns. Thus, we use majority voting with 501 steps for all diffusion-based classification results in this paper.

\begin{table}[htbp]
    \centering
    \renewcommand{\arraystretch}{1.26} % Increase row spacing
    \resizebox{0.6\textwidth}{!}{%
    \begin{tabular}{lccccc}
        \toprule
        & \textbf{Method} & \multicolumn{2}{c}{\textbf{CheXpert}} & \multicolumn{2}{c}{\textbf{ISIC}} \\
        \cmidrule(lr){3-4} \cmidrule(lr){5-6}
         & & \textbf{Accuracy} & \textbf{F1} & \textbf{Accuracy} & \textbf{F1} \\
        \midrule \multirow{4}{*}{\rotatebox{90}{\textbf{CNN }}}
        & ResNet-18                    & 90.9 & 0.910 & 94.4 & 0.943 \\ 
        & ResNet-50                    & 91.6 & 0.914 & 93.6 & 0.935 \\
        & EfficientNet-B0              & 90.5 & 0.907 & 93.1 & 0.930 \\
        & EfficientNet-B4              & 90.4 & 0.904 & 93.2 & 0.930 \\ \hline \multirow{3}{*}{\rotatebox{90}{\textbf{TF }}}
        & ViT-S/16                     & 86.9 & 0.869 & 95.0 & 0.949 \\
        & ViT-B/16                     & 85.1 & 0.857 & 94.8 & 0.948 \\
        & Swin-B                       & 86.1 & 0.863 & 95.9 & 0.958 \\ \hline \multirow{4}{*}{\rotatebox{90}{\textbf{DM }}}
        & DiT-B/4                      & 86.1 & 0.860 & 90.4 & 0.901 \\
        & UNet                         & 84.5 & 0.854 & 91.8 & 0.919 \\
        & Stable Diffusion$^*$         & 85.0 & 0.839 & 94.8 & 0.946 \\
        & Stable Diffusion$^{\dagger}$ & 48.8 & 0.656 & 39.7 & 0.521 \\
        \bottomrule
    \end{tabular}
    }
    \caption{\textbf{Diffusion classifiers are competitive with discriminative baselines.} $^*$ and $^\dagger$ denote fine-tuned and zero-shot versions, respectively. Results are reported on the CheXpert and ISIC Melanoma test sets, with 501 classification steps and majority voting being used for the diffusion classifiers (DM).}
    \label{tab:classification_results}
\end{table}

\subsection{Classification Performance on Benchmark Datasets}

Table~\ref{tab:classification_results} shows the classification accuracy and F1-score of each model on the CheXpert and ISIC Melanoma test sets. Note that the models are grouped by architecture: convolution-based (CNN), transformer-based (TF), and diffusion-based (DM). These results demonstrate that the diffusion classifier achieves competitive performance with discriminative baselines. However,  unlike other classifiers, the diffusion classifier requires minimal hyperparameter tuning, no data augmentations, and only a simple and stable MSE loss function during training. A comparison of optimization settings is found in Appendix~\ref{ref: experimental-details}.

\begin{figure}[htbp]
    \floatconts
        {fig:graphs}% label for whole figure
        {\caption{(a) Ablation on corresponding validation sets demonstrates that higher performance comes with more classification steps. (b) Diffusion classifiers inherently produce uncertainty estimates. Filtering uncertain predictions improves performance on the remaining data.}}% caption for whole figure
            {%
            \subfigure{%
                \label{fig:steps-ablation}% label for this sub-figure
                \includegraphics[width=0.35\linewidth]{images/combined-steps.pdf}
            }\qquad % space out the images a bit
            \subfigure{%
                \label{fig:uncertainty}% label for this sub-figure
                \includegraphics[width=0.35\linewidth]{images/uncertainty-all-models-improved-accuracy.pdf}
            }
            }
\end{figure}

\subsection{Intrinsic Explainability}%Explainability Comes for Free}

A key advantage of diffusion classifiers lies in their intrinsic interpretability, which positions diffusion classifiers as not only effective but also transparent. Importantly, diffusion classifiers are able to produce counterfactual explanations, as opposed to other interpretability methods that simply highlight regions of interest. This can be seen in Figure~\ref{fig:interpretability}: On the left example (skin lesion), the counterfactual of a malignant lesion (melanoma) has changed colour and intensity to become healthy. In the right example (chest X-ray), the counterfactual image of a sick patient (pleural effusion) shows decreased disease pathology in the left and right lungs. The natural interpretability of diffusion classifiers provides both transparency on how the model is learning (thus allowing the identification of shortcut learning), and specific class information which improves understanding of the disease. In addition to providing disease explainability, the difference maps also reveal how the model makes its decision: the condition with the least reconstruction error is selected as the predicted class.

\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.90\textwidth]{images/explainablity-figures.pdf}
    \caption{\textbf{Diffusion classifiers are naturally explainable} and highlight why they make classification decisions using classifier-free guided sampling. Difference maps show conditional areas of interest (pathology added/removed) during reconstruction.}
    \label{fig:interpretability}
\end{figure}

\subsection{Uncertainty Quantification}

The uncertainty quantification of diffusion classifiers is demonstrated in Figure \ref{fig:uncertainty}. In addition to competitive classification performance and intrinsic explainability, uncertainty quantifications can be estimated without any model modifications. In medical imaging, uncertainty measures are validated by confirming that when the model is confident, the prediction is correct, and when it is incorrect, it is uncertain~\cite{NAIR2020101557}. In terms of clinical decision making, quantifying this behaviour supports the idea that high-confidence predictions from the model are more trustworthy, while uncertain cases can be flagged for further testing or expert review. We therefore validate the diffusion model's uncertainty quantification by filtering out the most uncertain predictions and examining the change in performance on the remaining samples. The models show that accuracy increases when the most uncertain predictions are filtered out for CheXpert (- -) and ISIC (-). This confirms the effectiveness of their uncertainty measure and potential value across medical applications. We show that this same phenomenon holds with the trained DiT and pre-trained Stable Diffusion classifiers in Appendix~\ref{ref:uncertainty-accuracy}.

\section{Conclusion}

In this paper, we provide a comprehensive examination of the benefits of diffusion classifiers in medical imaging. First, we introduce a novel majority voting method to improve the overall performance of diffusion classifiers. We next demonstrate that diffusion classifiers are able to achieve comparable performance to state-of-the-art discriminative classifiers, in addition to providing intrinsic counterfactual explainability and uncertainty quantification. 

Future work can extend our study to assess the robustness of diffusion classifiers in medical imaging, particularly under domain shifts or variations in image acquisition protocols. In addition, diffusion classifiers should be evaluated on more complex medical image classification scenarios, including 3D image classification and multi-class classification. Further, given that we present a novel uncertainty estimation, an in-depth analysis against other uncertainty methods using common metrics (i.e., reliability plots, failure analysis) should be performed.

Due to the nature of classifying images by accumulating a series of predictions, conditional diffusion models are limited by inference speed and computational requirements. To provide a reference, classifying a single batch of 128 images (256$^2$) takes 3:48 minutes with the UNet diffusion classifier on an A100 GPU. We provide a more thorough breakdown of computational requirements in Appendix~\ref{ref:computational-resources}.

\begin{comment}
This is where the content of your paper goes.  Some random
notes\footnote{Random footnote are discouraged}:
\begin{itemize}
\item You should use \LaTeX \cite{Lamport:Book:1989}.
\item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
  parenthetical citations, i.e. \verb|\citep|. You can of course also
  use \verb|\citet| for textual citations.
\item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
\item You should follow the guidelines provided by the conference.
\item Read through the JMLR template documentation for specific \LaTeX
  usage questions.
\item Note that the JMLR template provides many handy functionalities
such as \verb|\figureref| to refer to a figure,
e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
e.g. \equationref{eq:example}.
\end{itemize}

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:example}%
  {\caption{An Example Table}}%
  {\begin{tabular}{ll}
  \bfseries Dataset & \bfseries Result\\
  Data1 & 0.12345\\
  Data2 & 0.67890\\
  Data3 & 0.54321\\
  Data4 & 0.09876
  \end{tabular}}
\end{table}

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example}
  {\caption{Example Image}}
  {\includegraphics[width=0.5\linewidth]{example-image}}
\end{figure}

\begin{algorithm2e}
\caption{Computing Net Activation}
\label{alg:net}
 % older versions of algorithm2e have \dontprintsemicolon instead
 % of the following:
 %\DontPrintSemicolon
 % older versions of algorithm2e have \linesnumbered instead of the
 % following:
 %\LinesNumbered
\KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
\KwOut{$y$, the net activation}
$y\leftarrow 0$\;
\For{$i\leftarrow 1$ \KwTo $n$}{
  $y \leftarrow y + w_i*x_i$\;
}
\end{algorithm2e}
\end{comment}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank Bruno Travouillon, Olexa Bilaniuk, and the Mila IDT team for their support with Mila's HPC. This work was supported by the Natural Sciences and Engineering Research Council of Canada, Fonds de Recherche du Quebec: Nature et Technologies, the Canadian Institute for Advanced Research (CIFAR) Artificial Intelligence Chairs program, Google Research, Calcul Quebec, the Digital Research Alliance of Canada, the Vadasz Scholar McGill Engineering Doctoral Award, and Mila - Quebec AI Institute.}

\bibliography{midl25_214}

\clearpage
\appendix
\section{Additional Background on Diffusion Classifiers} 

\subsection{Variational Diffusion Model Formulation}\label{ref:var-diffusion}
In the variational diffusion model formulation \cite{kingma2023variationaldiffusionmodels}, a noised image, $z_t$, can be generated from an uncorrupted image, $x$, at any point in a forward process marked by a continuous valued noise level, $t \sim [0,1]$:
\begin{align*}
    z_t = \alpha_\lambda x + \sigma_\lambda \epsilon \text{ where } \epsilon \sim \mathcal{N}(0,1)
\end{align*}
When noise is added to $x$, we are effectively decreasing its signal-to-noise ratio (and log-SNR, $\lambda$). There is a mapping that exists between the noise level, $t$, and the log-SNR, $\lambda$, called the noise schedule, $f_\lambda(t)$. For example, with a shifted-cosine noise schedule, this mapping is $\lambda = f_\lambda(t) = -2 \log \tan (\pi t / 2) + 2 \log(\frac{64}{256})$. Therefore, when diffusing an image, we first sample $t \sim \mathcal{U}(0,1)$ and then use this value and the noise schedule to calculate $\lambda$. There are many different noise schedules that differ in the way they modulate the log-SNR at various values along the range $[0,1]$, though they must be strictly monotonically decreasing. 

To corrupt the image, we ultimately need the values of $\alpha_\lambda$ and $\sigma_\lambda$, which we can derive with two additional pieces of information. First, we set the log-SNR as $\lambda=\log \alpha_\lambda^2/\sigma_\lambda^2$. Additionally, we assume that the forward process itself is \textit{variance preserving}, which implies $\alpha_\lambda^2+\sigma_\lambda^2=1$. Armed with these two relations, we can derive $\alpha_\lambda^2$ as:
\begin{align*}
    \lambda &= \log \alpha_\lambda^2/\sigma_\lambda^2 \\
    e^\lambda &= \alpha_\lambda^2 / (1 - \alpha_\lambda^2) \\
    \alpha_\lambda^2 &= e^\lambda - e^\lambda \alpha_\lambda^2  \\
    \alpha_\lambda^2(1+e^\lambda) &= e^\lambda \\
    \alpha_\lambda^2 &= \text{sigmoid}(\lambda)
\end{align*}
And similarly, we can derive $\sigma_\lambda^2=\text{sigmoid}(-\lambda)$. 

\subsection{Derivation of Diffusion Classifier from Bayes' Rule} \label{ref:bayes-derivation}
We provide a derivation of the diffusion classifier objective. 
\begin{equation*}
    \begin{aligned}
    p_{\theta}(c_i|\bm{x}) &= \frac{p_{\theta}(\bm{x},c_i)}{\sum_{c_j}p_{\theta}(\bm{x},c_j)} \\
    &= \frac{p_{\theta}(\bm{x}|c_i)p_{\theta}(c_i)}{\sum_{c_j}p_{\theta}(\bm{x}|c_j)p_{\theta}(c_j)}.
    \end{aligned}
\end{equation*}

We assume the case in which we have no prior information about the relative frequencies of different classes and make the simplifying assumption that all classes are equally likely. Assuming a uniform prior over the labels, i.e., \( p_{\theta}(c_i) = p_{\theta}(c_j) \) for all \( i, j \), the prior cancels out in the fraction:

\begin{equation*}
    \begin{aligned}
    p_{\theta}(c_i|\bm{x}) &= \frac{p_{\theta}(\bm{x}|c_i)}{\sum_{c_j}p_{\theta}(\bm{x}|c_j)} \\
    &= \frac{e^{\log p_{\theta}(\bm{x}|c_i)}}{\sum_{c_j}e^{\log p_{\theta}(\bm{x}|c_j)}}.
    \end{aligned}
\end{equation*}

Using the variational diffusion model formulation, we approximate the likelihood as:

\begin{equation*}
    \log p_{\theta}(\bm{x}|c_i) \approx \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\text{I}), \lambda \sim p(\lambda)} [\|\bm{x} - \hat{\bm{x}}_\theta(\bm{z}_\lambda, c_i)\|_2^2] \Big. .
\end{equation*}

Thus, the posterior probability simplifies to:

\begin{equation*}
    p_{\theta}(c_i|\bm{x}) = \frac{\exp \{ \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\text{I}), \lambda \sim p(\lambda)} [\|\bm{x} - \hat{\bm{x}}_\theta(\bm{z}_\lambda, c_i)\|_2^2] \}}  
    {\sum_{c_j}\exp \{ \mathbb{E}_{\epsilon \sim \mathcal{N}(0,\text{I}), \lambda \sim p(\lambda)} [\|\bm{x} - \hat{\bm{x}}_\theta(\bm{z}_\lambda, c_j)\|_2^2] \}}.
\end{equation*}

\subsection{Majority Voting Algorithm}\label{ref: majority-vote}
We provide the pseudocode for the diffusion classification algorithm used in the experiments. We opt for a majority voting scheme as opposed to the average reconstruction error approach outlined by~\cite{Li:arXiv:2023:diffusionClassifier}.

\begin{lstlisting}[style=mypython]
def classify(x, num_classes, classification_steps):
    errors = fill((x.shape[0], num_classes, classification_steps), float('inf'))
    for step in classification_steps:
        t = rand(0, 1)
        z_t, eps_t = diffuse(x, t)  # add noise to image at t
        # Get the errors for each class
        for c in range(num_classes):
            pred = model(z_t, t, c)  # get noise prediction for given class
            error = mse(pred, eps_t)
            errors[:, c, step] = error  # store the error

    # Find the class with the lowest error for each step
    end_of_stage_votes = errors[:, :, :classification_steps].argmin(dim=1)

    # Count the votes for each class across all steps
    votes = zeros(x.shape[0], num_classes)
    for b in range(x.shape[0]):
        for step in range(classification_steps):
            class_with_lowest_error = end_of_stage_votes[b, step]
            votes[b, class_with_lowest_error] += 1

    final_classes = votes.argmax(dim=1)

    return final_classes
\end{lstlisting}

\section{Experimental Details} \label{ref: experimental-details}

\subsection{Diffusion Classifier Optimization Settings}

We hold our optimization settings constant across all diffusion models trained for our experiments. A detailed summary is found in Table~\ref{tab:diff-optim-settings}.

\begin{table}[hbtp]
    \centering
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{lc}
        \toprule
        \textbf{Setting} & \textbf{Diffusion Model (3×256×256)} \\
        \midrule
        Batch Size & 128 \\
        Optimizer & Adam \\
        Learning Rate & $1 \times 10^{-4}$ \\
        Learning Rate Warmup Steps & 250 \\
        Gradient Clipping & 1.0 \\
        EMA Beta & 0.999 \\
        EMA Warmup Steps & 50 \\
        EMA Update Frequency & 5 \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \caption{Optimization settings for our conditional diffusion models}
    \label{tab:diff-optim-settings}
\end{table}

\subsection{Discriminative Baseline Optimization Settings}
We use official implementations of ResNet-based (\verb|torchvision|), EfficientNet- and ViT-based (\verb|timm|) classifiers in our experiments. A detailed summary of optimization settings for our discriminative baselines is found in Table~\ref{tab:baseline-optim-settings}.

\begin{table}[hbtp]
    \centering
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{lccc}
        \toprule
        \textbf{Setting} & \textbf{RN/EN (ISIC)} & \textbf{RN/EN (CheXpert)} & \textbf{ViT/Swin} \\
        \midrule
        Batch Size & 64 & 64 & 64 \\
        Optimizer & AdamW & AdamW & Adam \\
        Learning Rate & $1 \times 10^{-4}$ & $1 \times 10^{-4}$ & $1 \times 10^{-5}$ \\
        Weight Decay & $1 \times 10^{-5}$ & $1 \times 10^{-3}$ & --- \\
        \midrule
        \textbf{Data Augmentation} & \multicolumn{3}{c}{\textbf{Notes}} \\
        Random Rotation & \multicolumn{3}{c}{Degree range: (-30, 30)} \\
        Random Horizontal Flip & \multicolumn{3}{c}{Probability: 0.5} \\
        Random Vertical Flip & \multicolumn{3}{c}{Probability: 0.5} \\
        Random Gaussian Blur & \multicolumn{3}{c}{Kernel size: 5, Sigma range: (0.1, 2)} \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \caption{Optimization settings for discriminative baselines.}
    \label{tab:baseline-optim-settings}
\end{table}

\subsection{UNet Settings}
The ADM architecture~\cite{dhariwal2021diffusionmodelsbeatgans} is used as a starting point, with minor alterations based on capacity requirements of each experiment. Class conditions are integrated into the model using cross-attention with a trainable module \verb|nn.encoder|. A detailed summary is found in Table~\ref{tab:unet-settings}.

\begin{table}[hbtp]
    \centering
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{lc}
        \toprule
        \textbf{Setting} & \textbf{UNet Model (3×256×256)} \\
        \midrule
        Prediction Parameterization & velocity \\
        Noise Schedule & Shifted Cosine, Base-64 \\
        Wavelet Transform & Single-stage Haar Wavelet \\
        Sample Size & 128 \\
        Channels & 12 \\
        ResNet Layers per Block & 2 \\
        Base Channels & 128 \\
        Channel Multiplier & (1, 1, 2, 4, 8) \\
        Cross Attention Resolution & 16 \\
        Encoder Type & nn \\
        Cross Attention Dimension & 512 \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \caption{Settings for UNet model.}
    \label{tab:unet-settings}
\end{table}

\subsection{DiT Settings}
The DiT-B/4 architecture is followed as presented in~\cite{peebles2023scalablediffusionmodelstransformers}. A detailed summary is found in Table~\ref{tab:dit-settings}.

\begin{table}[hbtp]
    \centering
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{lc}
        \toprule
        \textbf{Setting} & \textbf{DiT Model (3×256×256)} \\
        \midrule
        Prediction Parameterization & velocity \\
        Noise Schedule & Shifted Cosine, Base-64 \\
        Wavelet Transform & Single-stage Haar Wavelet \\
        Sample Size & 128 \\
        Channels & 12 \\
        Number of Attention Heads & 12 \\
        Attention Head Dimension & 64 \\
        Number of Layers & 12 \\
        Patch Size & 4 \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \caption{Settings for DiT model.}
    \label{tab:dit-settings}
\end{table}

\section{Stable Diffusion v2 Fine-Tuning} \label{ref: sd-fine-tuning}

We fine-tune Stable Diffusion v2-base~\cite{rombach2022highresolutionimagesynthesislatent} using the \verb|Hugging Face| training pipeline for a total of 15k iterations. We construct the fine-tuning dataset by amalgamating our CheXpert and ISIC Melanoma training splits. Given that the model is designed for text-to-image generation, we replace labels in the datasets with text prompts, ie. ``a benign skin lesion'', or, ``a frontal chest xray of a sick patient with pleural effusion''. Fine-tuning dramatically increased Stable Diffusion's domain knowledge and subsequent classification performance on our benchmark datasets. 

\begin{figure}[hbtp]
    \centering
    \subfigure[No fine-tuning]{
        \includegraphics[width=0.45\textwidth]{images/before-finetuning.png}  % Replace with your image file path
        \label{fig:before-fine-tuning}
    }
    \hfill
    \subfigure[After fine-tuning for 15k steps]{
        \includegraphics[width=0.45\textwidth]{images/after-finetuning.png}  % Replace with your image file path
        \label{fig:after-fine-tuning}
    }
    \caption{Task-related generation from the Stable Diffusion v2-base model before (left) and after (right) fine-tuning. Training for only a few thousand iterations dramatically increased in-distribution inference and classification performance.}  % Main caption for the entire figure
    \label{fig:sd-fine-tuning-image}
\end{figure}

\section{Uncertainty Quantification} \label{ref:uncertainty-accuracy}

We validate our model uncertainty through measuring performance as uncertain predictions are filtered out. For both the pre-trained Stable Diffusion classifier and our diffusion classifiers trained from scratch, accuracy increases for both datasets as the most uncertain predictions are filtered out. This indicates that the model is most uncertain about its incorrect predictions, which is highly valuable across medical applications. See Figure \ref{fig:uncertainty-boxplots} for a breakdown of this quantification in boxplot form.

\begin{figure}[hbtp]
    \centering
    \subfigure[Uncertainty estimates, CheXpert]{
        \includegraphics[width=0.9\textwidth]{images/combined_uncertainty_boxplot_chexpert.pdf}  % Replace with your image file path
    }
    \hfill
    \subfigure[Uncertainty estimates, ISIC]{
        \includegraphics[width=0.9\textwidth]{images/combined_uncertainty_boxplot_isic.pdf}  % Replace with your image file path
    }
    \caption{We find that all of our diffusion classifier models are more confident about their correct predictions (TP, TN) than their incorrect predictions (FP, FN).}
    \label{fig:uncertainty-boxplots}
\end{figure}

\section{More Explainability Results}

More explainability results can be found in Figure \ref{fig:more-explain-chexpert}, and Figure \ref{fig:more-explain-isic}. Input sick images have been altered to healthy class by adding noise to the input image and denoising with the healthy class. For CheXpert t=0.5 and for ISIC t=0.3 are used. CFG scale is 7.5.

\begin{figure}[hbtp]
    \centering
    \subfigure{
        \includegraphics[width=0.9\textwidth]{images/explainability-appendix/chexpert-1.pdf}  
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.9\textwidth]{images/explainability-appendix/chexpert-2.pdf}  
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.9\textwidth]{images/explainability-appendix/chexpert-3.pdf}  
    }
    \caption{More explainability results for CheXpert by converting input sick images to healthy images. t=0.5 and CFG=7.5 are used for generating these images.}
    \label{fig:more-explain-chexpert}
\end{figure}

\begin{figure}[hbtp]
    \centering
    \subfigure{
        \includegraphics[width=0.9\textwidth]{images/explainability-appendix/isic-1.pdf}  
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.9\textwidth]{images/explainability-appendix/isic-2.pdf}  
    }
    \hfill
    \subfigure{
        \includegraphics[width=0.9\textwidth]{images/explainability-appendix/isic-3.pdf}  
    }
    \caption{More explainability results for ISIC by converting input sick images to healthy images. t=0.3 and CFG=7.5 are used for generating these images.}
    \label{fig:more-explain-isic}
\end{figure}

\section{Computational Resources} \label{ref:computational-resources}
All models were trained or fine-tuned a compute cluster of 80 GB A100 GPUs for all experiments in this paper. For inference, a single 80 GB A100 GPU is used. We provide a full breakdown of parameter count and time to classify a batch of 128 images in Table~\ref{tab:compute-comparison}.

\begin{table}[hbtp]
    \centering
    \renewcommand{\arraystretch}{1.2} % Adjust row spacing
    \begin{tabular}{lcccc}
        \toprule
        \textbf{Model} & \textbf{Parameters} & \textbf{Time per Batch (s)} \\
        \midrule
        UNet               & 276M & 228   \\
        DiT-B/4            & 148M & 195   \\
        SD v2-base         & 866M & 269   \\ \hline
        ResNet-18          & 12M  & 0.011 \\
        ResNet-50          & 26M  & 0.031 \\
        EfficientNet-B0    & 5M   & 0.020 \\
        EfficientNet-B4    & 19M  & 0.050 \\
        ViT-B/16           & 87M  & 0.036 \\
        ViT-S/16           & 22M  & 0.016 \\
        Swin-B             & 88M  & 0.090 \\
        \bottomrule
    \end{tabular}
    \caption{Comparison of computational cost and inference speed across different models in our experiments. Parameter count is provided in millions (M) and inference time is provided in seconds (s) for a single batch of size 128 images.}
    \label{tab:compute-comparison}
\end{table}

\end{document}