\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{mwe} % to get dummy images
\usepackage{multirow} % for \multirow in tables
\usepackage{color, soul}
\usepackage{booktabs}
\usepackage{etoolbox}
\usepackage{xcolor}
\jmlrvolume{-- 11}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[Simplex Diffusion for Medical Image Classification]{Simplex-Aligned Diffusion with Cross-Granularity Interaction for Robust Medical Image Classification}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
% \midlauthor{\Name{Chao Wu} \Email{cwu64@buffalo.edu}\and
% \Name{Mingchen Gao} \Email{mgao8@buffalo.edu}\\
% \addr Department of Computer Science and Engineering, University at Buffalo (SUNY), Buffalo, NY, United States}

 % Three or more authors with the same address:
\midlauthor{\Name{Chao Wu} \orcid{0009-0003-7508-2012} \Email{cwu64@buffalo.edu}\\
 \Name{Mingchen Gao} \orcid{0000-0002-5488-8514} \Email{mgao8@buffalo.edu}\\
 \addr Department of Computer Science and Engineering, University at Buffalo (SUNY), Buffalo, NY, United States}


%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship


\begin{document}

\maketitle

\begin{abstract}
The clinical deployment of medical image classification systems hinges on their trustworthiness, specifically, the ability to provide calibrated uncertainty estimates and maintain robustness under acquisition shifts. While generative diffusion models offer promising distributional modeling, existing approaches suffer from a fundamental geometric conflict: they apply unbounded Gaussian noise directly to bounded label simplices. We identify that this theoretical mismatch forces predictions into invalid probability spaces, serving as a primary source of model unreliability and overconfidence. To resolve this, we propose Simplex-Aligned Diffusion. Unlike standard methods, we reformulate the label generation process on an unconstrained logit manifold. By mapping the probability simplex to a Euclidean space, we ensure mathematical consistency with Gaussian diffusion, which effectively acts as a geometric regularizer for uncertainty calibration. Furthermore, we introduce a Transformer-based Cross-Granularity Interaction module to stabilize visual guidance by dynamically modeling global-local dependencies. Extensive experiments on the APTOS2019 and HAM10000 benchmarks demonstrate that our framework not only achieves competitive accuracy but significantly outperforms state-of-the-art baselines in calibration error (ECE) and resilience to clinical artifacts (e.g., sensor noise, blur), offering a mathematically rigorous and clinically reliable paradigm.\footnote{Code is available at \url{https://github.com/SamaritanW/simplex-aligned-diffusion}}
\end{abstract}

\begin{keywords}
Robust Medical Image Classification, Simplex-Aligned Diffusion, Uncertainty Calibration 
\end{keywords}
\input{intro_new}
\input{related_works}
\input{method}
\input{experiments}
\input{Conclusion}


% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}


% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The work was supported by the US NSF CAREER award IIS-2239537.}


\bibliography{midl26_11}


\appendix
% \section{Algorithm}
% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

\section{Implementation Details and Reproducibility}
\label{app:implementation}

\subsection{Implementation Details}
The framework is implemented in PyTorch. For the {Transformer-Enhanced Visual Tokenizer}, we utilize ResNet-18/50 as the backbone for both global and local streams, initialized with ImageNet pre-trained weights. The interaction module employs a standard Transformer encoder layer with a hidden dimension of 512.
For the {Simplex-Aligned Diffusion}, we set the total diffusion timesteps $T=1000$ with a linear noise schedule from $\beta_1=10^{-4}$ to $\beta_T=0.02$. The simplex scaling factor $\lambda$ is empirically set to $1.5 \log C$. The model is trained using the AdamW optimizer with an initial learning rate of $5 \times 10^{-4}$, decayed via a cosine annealing schedule. We train for 1,000 epochs with a batch size of 64. All experiments are conducted on 3 NVIDIA H100 GPUs.
\subsection{Dataset Preprocessing and Splitting}
To ensure a fair and rigorous comparison, we align our data partitioning protocols with the DiffMIC-v2 benchmark~\cite{yang2025diffmic}.

\begin{itemize}
    \item \textbf{HAM10000:} We adhere to the standard {7:3} split ratio for training and testing, consistent with the protocol in~\cite{gong2020distractor, yang2025diffmic}. This yields 7,010 images for training/validation and 3,005 images for testing.
    \item \textbf{APTOS2019:} Following the baseline settings~\cite{yang2025diffmic}, we employ a {7:3} split on the official dataset (2,929 for training/validation, 733 for testing).
\end{itemize}

\textbf{Rigorous Model Selection Strategy:} 
It is important to note that the original DiffMIC-v2 implementation performs model selection by monitoring performance directly on the test set (i.e., reporting the best-epoch results on the test split). To avoid such test-set leakage and ensure clinical validity, we implement a stricter evaluation protocol: we randomly hold out $10\%$ of the training set as an independent validation set for hyperparameter tuning and checkpoint selection. All reported metrics in our main paper are derived from the unseen test set using the fixed model selected via the validation set.

All images undergo a standardized pre-processing pipeline:
\begin{itemize}
    \item \textbf{Resize \& Crop:} Images are center-cropped and resized to $224 \times 224$ to maintain consistent input resolution.
    \item \textbf{Normalization:} We apply Z-score normalization using standard ImageNet statistics: mean $\mu=[0.485, 0.456, 0.406]$ and std $\sigma=[0.229, 0.224, 0.225]$. 
    \item \textbf{Augmentation:} During training, we apply random horizontal flips and mild rotations ($\pm 10^\circ$) to mitigate overfitting. No augmentation is applied during inference.
\end{itemize}

\subsection{Architecture and Backbone Selection}
\label{app:backbone_justification}
A critical deviation in our implementation compared to the original DiffMIC-v2~\cite{yang2025diffmic} is the choice of the visual backbone. While the original DiffMIC-v2 employs {EfficientSAM}~\cite{xiong2023efficientsam} as the image encoder, we standardize our backbone to {ResNet-50} for both the baseline and our method. This decision is driven by two key factors:

\begin{enumerate}
    \item \textbf{Resolution Mismatch:} EfficientSAM is natively designed for high-resolution inputs ($1024 \times 1024$). However, standard medical image classification benchmarks (e.g., ISIC, APTOS) are typically evaluated at $224 \times 224$. Forcing a $224 \times 224$ input into a $1024 \times 1024$ model requires aggressive interpolation, which introduces artificial sub-pixel artifacts and does not reflect real-world clinical deployment constraints.
    \item \textbf{Fairness in Comparison:} Our objective is to validate the effectiveness of the {Simplex-Aligned Diffusion} strategy, rather than the power of the feature extractor. By using a standard ResNet-50, we ensure that performance gains are attributable solely to our methodological contributions (Logit-space diffusion and Cross-Granularity Interaction) rather than a larger backbone capacity.
\end{enumerate}

\subsection{Training Configuration}
All models are implemented in PyTorch and trained on NVIDIA H100 GPUs. We use the AdamW optimizer with a cosine annealing learning rate schedule. Detailed hyperparameters are listed in Table~\ref{tab:hyperparams}.

\begin{table}[h]
\centering
\floatconts
  {tab:hyperparams}
  {\caption{Hyperparameter settings for training.}}
  {%
    \begin{tabular}{l|c}
    \hline
    \textbf{Parameter} & \textbf{Value} \\
    \hline
    Image Size & $224 \times 224$ \\
    Batch Size & 64 \\
    Learning Rate & $1 \times 10^{-4}$ \\
    Weight Decay & $1 \times 10^{-4}$ \\
    Diffusion Timesteps ($T$) & 1000 \\
    Noise Schedule & Linear ($\beta_1=10^{-4}, \beta_T=0.02$) \\
    Total Epochs & 1000 \\
    \hline
    \end{tabular}
  }
\end{table}


\subsection{Training and Inference Algorithm}

\label{app:algorithms}
\begin{algorithm2e}[H] 
\caption{Training of Simplex-Aligned Diffusion}
\label{alg:training}
\DontPrintSemicolon 
\KwIn{Training images $\mathcal{D} = \{(\mathbf{x}^{(i)}, \mathbf{y}^{(i)})\}_{i=1}^N$, Total timesteps $T$}
\KwOut{Optimized model parameters $\theta$}
\BlankLine
% 1. Data Mapping
$\mathbf{z}_0 \leftarrow \text{CenterLogRatio}(\mathbf{y})$ \tcp*{Map one-hot label to Logit Space}
\BlankLine
% 2. Training Loop
\For{each iteration}{
    Sample batch $(\mathbf{x}, \mathbf{z}_0)$ from $\mathcal{D}$\;
    Sample timestep $t \sim \text{Uniform}(\{1, \dots, T\})$\;
    Sample noise $\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$\;
    \BlankLine
    % 3. Forward Diffusion
    $\mathbf{z}_t \leftarrow \sqrt{\bar{\alpha}_t} \mathbf{z}_0 + \sqrt{1 - \bar{\alpha}_t} \boldsymbol{\epsilon}$ \tcp*{Add noise}
    \BlankLine
    % 4. Conditional Feature Encoding (core change after decomposition)
    $\mathbf{v}_g, \mathbf{v}_l, \mathbf{Z}_{\text{raw}} \leftarrow \text{DualStreamEncoder}(\mathbf{x})$ \tcp*{Extract Priors \& Tokens}
    $\mathbf{F}_{\text{ref}} \leftarrow \text{TransformerInteraction}(\mathbf{Z}_{\text{raw}})$ \tcp*{Semantic Refinement}
    $\mathcal{M} \leftarrow \text{ConstructMap}(\mathbf{v}_g, \mathbf{v}_l)$ \tcp*{Spatial Guidance Construction}
    \BlankLine
    % 5. Reverse Process Prediction
    $\boldsymbol{\epsilon}_\theta \leftarrow \text{UNet}(\mathbf{z}_t, t, \mathcal{M}, \mathbf{F}_{\text{ref}})$ \tcp*{Predict noise}
    \BlankLine
    % 6. Optimization
    $\mathcal{L}_{\boldsymbol{\epsilon}} \leftarrow \| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_\theta \|^2$\;
    Update $\theta$ using $\nabla_\theta \mathcal{L}_{\boldsymbol{\epsilon}}$\;
}
\end{algorithm2e}
\begin{algorithm2e}[H]
\caption{Inference / Sampling Procedure}
\label{alg:inference}
\DontPrintSemicolon
\KwIn{Test Image $\mathbf{x}$, Sampling timesteps $T$}
\KwOut{Predicted Class Probability $\hat{\mathbf{p}}$}
\BlankLine
% 1. Initialization
$\mathbf{z}_T \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ \tcp*{Initialize from standard Gaussian}
% Core change: decompose VisualTokenizer into three sub-steps
$\mathbf{v}_g, \mathbf{v}_l, \mathbf{Z}_{\text{raw}} \leftarrow \text{DualStreamEncoder}(\mathbf{x})$ \tcp*{Extract Priors \& Raw Tokens}
$\mathbf{F}_{\text{ref}} \leftarrow \text{TransformerInteraction}(\mathbf{Z}_{\text{raw}})$ \tcp*{Semantic Refinement}
$\mathcal{M} \leftarrow \text{ConstructMap}(\mathbf{v}_g, \mathbf{v}_l)$ \tcp*{Spatial Guidance}
\BlankLine
% 2. Reverse Diffusion Loop
\For{$t = T, \dots, 1$}{
    $\boldsymbol{\epsilon}_{\text{pred}} \leftarrow \text{UNet}(\mathbf{z}_t, t, \mathcal{M}, \mathbf{F}_{\text{ref}})$\;
    Sample $\mathbf{z}_{\text{noise}} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ if $t > 1$, else $\mathbf{z}_{\text{noise}} \leftarrow \mathbf{0}$\;
    $\mathbf{z}_{t-1} \leftarrow \frac{1}{\sqrt{\alpha_t}} \left( \mathbf{z}_t - \frac{1-\alpha_t}{\sqrt{1-\bar{\alpha}_t}} \boldsymbol{\epsilon}_{\text{pred}} \right) + \sigma_t \mathbf{z}_{\text{noise}}$\;
}
\BlankLine
% 3. Projection back to Simplex
$\hat{\mathbf{p}} \leftarrow \text{Softmax}(\lambda \cdot \mathbf{z}_0)$ \tcp*{Project Logits to Probability Simplex}
\Return $\hat{\mathbf{p}}$\;
\end{algorithm2e}

\section{Training Dynamics and Convergence Analysis}
\label{app:convergence}

To demonstrate the stability of our proposed framework, we compare the validation metric curves of the Baseline (DiffMIC-v2) and our method over 1000 epochs.

\begin{figure}[htbp]
  \centering
  \begin{minipage}[b]{0.45\linewidth}
    \centering
    \includegraphics[width=\linewidth]{appendix_results/aptos_convergence_alignedacc.pdf}
    \centerline{(a) Validation Accuracy}
  \end{minipage}
  \hfill 
  \begin{minipage}[b]{0.45\linewidth}
    \centering
    \includegraphics[width=\linewidth]{appendix_results/aptos_convergence_alignedf1.pdf}
    \centerline{(b) Validation F1-Score}
  \end{minipage}
  
  \caption{{Comparison of Training Dynamics.} The validation curves for (a) Accuracy and (b) F1-Score. The baseline (Blue) converges rapidly in the early epochs but suffers from saturation. In contrast, our method (Red) demonstrates continuous improvement.}
  \label{fig:training_dynamics}
\end{figure}
  
As illustrated in Figure~\ref{fig:training_dynamics}, the baseline model exhibits a phenomenon of \textbf{Early Saturation}. Since it forces the diffusion model to approximate discrete one-hot vectors directly, the model quickly memorizes easy samples but fails to learn robust features for hard samples, leading to metric fluctuations or a decline in later stages. 
Conversely, our Simplex-Aligned model operates in a continuous logit space, enabling finer-grained optimization. Even in the mid-to-late stages of training, our curve continues to rise, indicating that although optimization in the logit space progresses more slowly than in the one-hot space, the model does not prematurely saturate or fall into ambiguous local optima. Instead, it continues to explore more informative regions of the parameter space. This sustained exploration not only leads to competitive final performance but also enhances robustness to noise. In contrast, the baseline model quickly enters a saturated regime, where later training updates become ineffective.
{These empirical observations align perfectly with our theoretical analysis in Section~\ref{sec:theoretical_analysis}.} Specifically, the baseline's volatility corroborates the existence of the systematic bias $\boldsymbol{\delta}$ (Proposition~\ref{prop:bias}), which causes target jittering during training, whereas our method's training dynamics validate the target consistency of the unconstrained logit diffusion.
\section{Extended Robustness Evaluation}
\label{app:extended_robustness}

\subsection{Visualization of Clinical Corruptions}
\label{app:corruption_viz}

To provide an intuitive context for the domain-specific robustness evaluation, we visualize the simulated artifacts on both datasets. Figures~\ref{fig:aptos_corruptions} and~\ref{fig:isic_corruptions} display the degradation effects across increasing severity levels (Level 1, 3, and 5) for APTOS2019 and HAM10000, respectively.

% ==========================================
% Figure 1: APTOS (Retina)
% ==========================================
\begin{figure}[h]
\centering
\floatconts
  {fig:aptos_corruptions}
  {\caption{{Clinical Corruptions on APTOS2019 (Retina).} 
  \textbf{Shot Noise} simulates photon starvation; \textbf{Defocus Blur} mimics lens errors; \textbf{Brightness/Contrast} represent exposure instabilities.}}
  {%
    \setlength{\tabcolsep}{0.5pt} 
    \renewcommand{\arraystretch}{0.2} 
    \small
    \begin{tabular}{cc}
      % Row 1: Gaussian Noise
      \rotatebox{90}{\scriptsize \textbf{Gaussian}} & 
      \includegraphics[width=0.6\linewidth]{noise/APTOS_Gaussian_Noise_ROW.png} \\
      
      % Row 2: Shot Noise
      \rotatebox{90}{\scriptsize \textbf{Shot Noise}} & 
      \includegraphics[width=0.6\linewidth]{noise/APTOS_Shot_Noise_ROW.png} \\
      
      % Row 3: Defocus Blur
      \rotatebox{90}{\scriptsize \textbf{Defocus}} & 
      \includegraphics[width=0.6\linewidth]{noise/APTOS_Defocus_Blur_ROW.png} \\
      
      % Row 4: Brightness
      \rotatebox{90}{\scriptsize \textbf{Brightness}} & 
      \includegraphics[width=0.6\linewidth]{noise/APTOS_Brightness_ROW.png} \\
      
      % Row 5: Contrast
      \rotatebox{90}{\scriptsize \textbf{Contrast}} & 
      \includegraphics[width=0.6\linewidth]{noise/APTOS_Contrast_ROW.png} \\
    \end{tabular}
  }
\end{figure}

% ==========================================
% Figure 2: HAM10000 (Derma)
% ==========================================
\begin{figure}[h]
\centering
\floatconts
  {fig:isic_corruptions}
  {\caption{{Clinical Corruptions on HAM10000 (Dermatoscopy).} 
  }}
  {%
    \setlength{\tabcolsep}{2pt}
    \renewcommand{\arraystretch}{0.5}
    \small
    \begin{tabular}{cc}
      % Header Row
       & \footnotesize \textbf{Severity 1} \hfill \textbf{Severity 3} \hfill \textbf{Severity 5} \\
      
      % Row 1: Gaussian
      \rotatebox{90}{\scriptsize \textbf{Gaussian}} & 
      \includegraphics[width=0.6\linewidth]{noise/HAM10000_Gaussian_Noise_ROW.png} \\
      
      % Row 2: Shot Noise
      \rotatebox{90}{\scriptsize \textbf{Shot Noise}} & 
      \includegraphics[width=0.6\linewidth]{noise/HAM10000_Shot_Noise_ROW.png} \\
      
      % Row 3: Motion Blur
      \rotatebox{90}{\scriptsize \textbf{Motion}} & 
      \includegraphics[width=0.6\linewidth]{noise/HAM10000_Motion_Blur_ROW.png} \\
      
      % Row 4: Defocus Blur
      \rotatebox{90}{\scriptsize \textbf{Defocus}} & 
      \includegraphics[width=0.6\linewidth]{noise/HAM10000_Defocus_Blur_ROW.png} \\
      
      % Row 5: Brightness
      \rotatebox{90}{\scriptsize \textbf{Brightness}} & 
      \includegraphics[width=0.6\linewidth]{noise/HAM10000_Brightness_ROW.png} \\
      
      % Row 6: Contrast
      \rotatebox{90}{\scriptsize \textbf{Contrast}} & 
      \includegraphics[width=0.6\linewidth]{noise/HAM10000_Contrast_ROW.png} \\
    \end{tabular}
  }
\end{figure}

\subsection{Full Benchmarking Tables}
Due to space constraints in the main text, we present the complete benchmarking results for all evaluated corruption types and severity levels.
\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:aptos_noise_comparison}%
  {\caption{Comparison of other noise types on the APTOS2019 dataset.}}%
  {%
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{lccccccccc}
  \hline
  \multirow{2}{*}{\bfseries Type} & \multirow{2}{*}{\bfseries Sev} & \multicolumn{2}{c}{\bfseries Acc $\uparrow$} & \multicolumn{2}{c}{\bfseries F1 $\uparrow$} & \multicolumn{2}{c}{\bfseries Kappa $\uparrow$} & \multicolumn{2}{c}{\bfseries ECE $\downarrow$} \\
   & & \bfseries Ours & \bfseries Base & \bfseries Ours & \bfseries Base & \bfseries Ours & \bfseries Base & \bfseries Ours & \bfseries Base \\
  \hline
     \multirow{3}{*}{Defocus} 
      & 1 & 0.561 & {0.625} & 0.383 & {0.450} & {0.593} & 0.495 & 0.206 & {0.135} \\
     & 3 & {0.219} & 0.176 & 0.118 & {0.156} & {0.105} & 0.026 & 0.634 & {0.612} \\
     & 5 & {0.197} & 0.138 & 0.106 & {0.119} & {0.060} & 0.002 & 0.661 & {0.651} \\
     \hline
  \multirow{3}{*}{Saturate} 
   & 1 & 0.755 & 0.716 & 0.481 & 0.414 & 0.772 & 0.725 & 0.109 & 0.163 \\
   & 3 & 0.788 & 0.753 & 0.615 & 0.570 & 0.875 & 0.857 & 0.068 & 0.098 \\
   & 5 & 0.771 & 0.731 & 0.604 & 0.556 & 0.802 & 0.805 & 0.074 & 0.113 \\
  \hline
  \multirow{3}{*}{Brightness} 
   & 1 & 0.826 & 0.787 & 0.626 & 0.606 & 0.883 & 0.835 & 0.045 & 0.069 \\
   & 3 & 0.679 & 0.428 & 0.503 & 0.349 & 0.674 & 0.383 & 0.079 & 0.348 \\
   & 5 & 0.543 & 0.294 & 0.375 & 0.215 & 0.516 & 0.166 & 0.188 & 0.489 \\
  \hline
  \multirow{3}{*}{Contrast} 
   & 1 & 0.794 & 0.769 & 0.537 & 0.439 & 0.844 & 0.762 & 0.086 & 0.120 \\
   & 3 & 0.708 & 0.699 & 0.405 & 0.380 & 0.701 & 0.579 & 0.102 & 0.128 \\
   & 5 & 0.491 & 0.461 & 0.266 & 0.280 & 0.341 & 0.286 & 0.147 & 0.243 \\
  \hline
  \end{tabular}}%
  }
\end{table}

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:ham10000_noise_comparison}%
  {\caption{Comparison of other noise types on the HAM10000 dataset.}}%
  {%
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{lccccccccc}
  \hline
  \multirow{2}{*}{\bfseries Type} & \multirow{2}{*}{\bfseries Sev} & \multicolumn{2}{c}{\bfseries Acc $\uparrow$} & \multicolumn{2}{c}{\bfseries F1 $\uparrow$} & \multicolumn{2}{c}{\bfseries Kappa $\uparrow$} & \multicolumn{2}{c}{\bfseries ECE $\downarrow$} \\
   & & \bfseries Ours & \bfseries Base & \bfseries Ours & \bfseries Base & \bfseries Ours & \bfseries Base & \bfseries Ours & \bfseries Base \\
  \hline
  \multirow{3}{*}{Saturate} 
   & 1 & 0.683 & 0.666 & 0.304 & 0.387 & 0.221 & 0.294 & 0.040 & 0.086 \\
   & 3 & 0.692 & 0.668 & 0.427 & 0.333 & 0.284 & 0.209 & 0.051 & 0.067 \\
   & 5 & 0.539 & 0.317 & 0.151 & 0.110 & 0.094 & 0.0172 & 0.117 & 0.242 \\
  \hline
  \multirow{3}{*}{Shot Noise} 
   & 1 & 0.711 & 0.122 & 0.301 & 0.036 & 0.363 & 0.001 & 0.076 & 0.569 \\
   & 3 & 0.103 & 0.111 & 0.044 & 0.029 & 0.033 & 0.001 & 0.604 & 0.454 \\
   & 5 & 0.051 & 0.096 & 0.014 & 0.062 & 0.001 & 0.179 & 0.718 & 0.188 \\
  \hline
  \multirow{3}{*}{Brightness} 
   & 1 & 0.867 & 0.855 & 0.752 & 0.721 & 0.750 & 0.732 & 0.135 & 0.090 \\
   & 3 & 0.754 & 0.737 & 0.493 & 0.423 & 0.516 & 0.437 & 0.062 & 0.065 \\
   & 5 & 0.720 & 0.698 & 0.390 & 0.270 & 0.372 & 0.234 & 0.035 & 0.114 \\
  \hline
  \multirow{3}{*}{Contrast} 
   & 1 & 0.742 & 0.713 & 0.443 & 0.381 & 0.352 & 0.381 & 0.081 & 0.106 \\
   & 3 & 0.681 & 0.663 & 0.143 & 0.163 & 0.047 & 0.130 & 0.089 & 0.120 \\
   & 5 & 0.678 & 0.580 & 0.1156 & 0.114 & 0.002 & 0.000 & 0.175 & 0.177 \\
  \hline
  \end{tabular}}%
  }
\end{table}
\section{Additional Qualitative Results}
\label{app:more_qualitative}

We provide extensive qualitative comparisons to further substantiate the robustness of our Simplex-Aligned Diffusion framework.

% ==========================================
% Figure E.1: APTOS Extended Results (Full Width)
% ==========================================
\begin{figure}[t!]
\centering
\floatconts
  {fig:appendix_aptos_gradcam}
  {\caption{{Extended Qualitative Analysis on APTOS2019 (Retina).}}}
  {%
    \setlength{\tabcolsep}{0pt} 
    \renewcommand{\arraystretch}{0.2}
    \begin{tabular}{c} 
      
      % --- Case 1 ---
      \includegraphics[width=0.8\linewidth]{{appendix_results/APTOS_ID3_GT2_Base1.png}} \\
      \vspace{-3mm} 
      
      % --- Case 2 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/APTOS_ID58_GT2_Base1.png} \\
      \vspace{-3mm}
      
      % --- Case 3 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/APTOS_ID638_GT2_Base1.png} \\
      \vspace{-3mm}
      
      % --- Case 4 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/APTOS_ID918_GT2_Base1.png} \\
      \vspace{-3mm}
      
      % --- Case 5 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/APTOS_ID922_GT2_Base1.png} \\
      
    \end{tabular}
  }
\end{figure}

% ==========================================
% Figure E.2: HAM10000 Extended Results (Full Width)
% ==========================================
\begin{figure}[t!]
\centering
\floatconts
  {fig:appendix_ham_gradcam}
  {\caption{{Extended Qualitative Analysis on HAM10000 (Dermatoscopy).}}}
  {%
    \setlength{\tabcolsep}{0pt}
    %\renewcommand{\arraystretch}{0.5}
    \begin{tabular}{c} 
      % --- Case 1 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/ID1369_GT1_Base0_Simp1.png} \\
      \vspace{-3mm}
      
      % --- Case 2 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/ID1371_GT4_Base0_Simp4.png} \\
      \vspace{-3mm}
      
      % --- Case 3 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/ID1464_GT0_Base1_Simp0.png} \\
      \vspace{-3mm}
      
      % --- Case 4 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/ID2940_GT1_Base0_Simp1.png} \\
      \vspace{-3mm}
      
      % --- Case 5 ---
      \includegraphics[width=0.8\linewidth]{appendix_results/ID2299_GT4_Base1_Simp4.png} \\
    \end{tabular}
  }
\end{figure}
\section{Mathematical Definitions}
\label{app:math_definitions}

\subsection{Cohen's Kappa ($\kappa$)}
Cohen's Kappa measures the agreement between the predicted classification and the ground truth, correcting for chance agreement. It is defined as:
\begin{equation}
    \kappa = \frac{p_o - p_e}{1 - p_e}
\end{equation}
where $p_o$ is the relative observed agreement (Accuracy), and $p_e$ is the hypothetical probability of chance agreement. For ordinal tasks like APTOS, we utilize the Quadratic Weighted Kappa.

\subsection{Expected Calibration Error (ECE)}
ECE measures the expected discrepancy between the model's confidence and its empirical accuracy. We partition the $n$ test samples into $M$ equally spaced bins (e.g., $M=15$) based on their prediction confidence. 
For a sample $i$, let $\hat{y}_i$ be the predicted class and $\hat{p}_i = \max_c P(y=c|x_i)$ be the associated confidence.
Let $B_m$ be the set of samples falling into the $m$-th bin. ECE is calculated as the weighted average of the absolute difference between accuracy and confidence:
\begin{equation}
    \text{ECE} = \sum_{m=1}^M \frac{|B_m|}{n} \left| \text{acc}(B_m) - \text{conf}(B_m) \right|
\end{equation}
where $|B_m|$ is the number of samples in bin $m$, $\text{acc}(B_m) = \frac{1}{|B_m|} \sum_{i \in B_m} \mathbb{1}(\hat{y}_i = y_i)$ is the bin's accuracy ($y_i$ is the ground truth), and $\text{conf}(B_m) = \frac{1}{|B_m|} \sum_{i \in B_m} \hat{p}_i$ is the average confidence. Lower ECE indicates a better-calibrated model, which is vital for reliable clinical decision-making.

\section{Theoretical Analysis}
\input{theoretical}
\section{{Visualization of Failure Modes and Explainability}}

{
\paragraph{Defining ``Meaningful Failure''.} 
We define a {``Meaningful Failure''} as a scenario where the model's categorical prediction is incorrect, yet its internal spatial reasoning, as evidenced by Grad-CAM localization, remains semantically aligned with the actual pathological regions. This distinction is critical for clinical safety: a model that fails while still highlighting the correct lesion is far more trustworthy than one that fails by shifting its focus to background artifacts.

\paragraph{Dataset-Specific Failure Analysis.} 
In this analysis, we specifically focus on \textbf{Motion Blur for HAM10000} and \textbf{Shot Noise for APTOS2019}:
\begin{itemize}
    \item \textbf{HAM10000 (Motion Blur):} As handheld dermatoscopy is uniquely susceptible to operator hand tremors, Motion Blur represents the most pervasive clinical artifact in skin lesion screening.
    \item \textbf{APTOS2019 (Shot Noise):} As noted in our statistical evaluation (Table~\ref{tab:stat_aptos}), Shot Noise Level 5 exhibits the highest performance variance. Rather than overlooking this instability, we utilize visualization to investigate its cause. We observe that extreme Poisson noise creates a high-variance regime where single-seed outliers may occur, yet our model consistently preserves structural discrimination even when the label prediction drifts.
\end{itemize}

\paragraph{Logit Variance as Diagnostic Intent.} 
We interpret the \textbf{Variance of Logits (Var)} as a quantitative measure of the model's \textbf{diagnostic intent}. A high variance indicates that the model is actively discriminating between classes, forcing the diffusion process toward a specific vertex of the logit manifold. 
In contrast, the baseline's low variance (often $< 0.1$) indicates ``logit flattening,'' where the model loses its ability to distinguish pathology from noise. This leads to \textbf{mode collapse}, where the baseline defaults to the majority class regardless of the visual input. By maintaining high discriminative variance, our Simplex-Aligned Diffusion demonstrates that it continues to ``search'' for lesions under noise levels that cause standard models to fail blindly.
Through the Grad-CAM visualizations in Figure~\ref{fig:failure_analysis_isic} and Figure~\ref{fig:failure_analysis_aptos}, we observe that our Simplex-Aligned Diffusion consistently maintains precise {lesion localization}, even when the specific diagnostic label is misidentified. This indicates that our logit-space geometric regularizer effectively preserves the semantic integrity of the latent features. In contrast, the baseline (DiffMIC-v2) typically exhibits \textbf{mode collapse}, defaulting to the majority class with sparse or chaotic attention maps that fail to focus on diagnostic regions. This demonstrates that our method is more clinically reliable: it provides persistent visual guidance to clinicians even when the signal-to-noise ratio is severely degraded, whereas the baseline's failure is semantically uninformative.

\begin{figure}[ht]

\centering
\includegraphics[width=0.95\textwidth]{failure_cases/aptos/ID948_Ratio151.6_GT4.png}
\vspace{8pt} % increase spacing between image groups

\includegraphics[width=0.95\textwidth]{failure_cases/aptos/ID814_Ratio106.3_GT3.png}
\vspace{8pt}

\includegraphics[width=0.95\textwidth]{failure_cases/aptos/ID780_Ratio27.3_GT4.png}

\caption{{Qualitative comparison of failure modes for APTOS2019 under severe clinical artifacts (shot noise). \textbf{Left to Right:} Clean image (GT), Noisy image, Baseline Grad-CAM, and Our Simplex-Aligned Grad-CAM. Even when both models output incorrect labels, our method maintains a high discriminative variance (Var) and consistently localizes the pathological region, whereas the baseline attention becomes chaotic or collapses to the majority class.}}
\label{fig:failure_analysis_aptos}
\end{figure}

\begin{figure}[ht]

\centering
\includegraphics[width=0.95\textwidth]{failure_cases/isic/ID2241_Ratio57.4_GT1.png}
\vspace{8pt} % increase spacing between image groups

\includegraphics[width=0.95\textwidth]{failure_cases/isic/ID999_Ratio66.9_GT0.png}
\vspace{8pt}

\includegraphics[width=0.95\textwidth]{failure_cases/isic/ID2295_Ratio19.2_GT2.png}

\caption{{Qualitative comparison of failure modes for HAM10000 under motion blur. \textbf{Left to Right:} Clean image (GT), Noisy image, Baseline Grad-CAM, and Our Simplex-Aligned Grad-CAM. Even when both models output incorrect labels, our method maintains a high discriminative variance (Var) and consistently localizes the lesion area, whereas the baseline attention becomes chaotic or collapses to the majority class.}}
\label{fig:failure_analysis_isic}
\end{figure}


\section{Sensitivity Analysis}
{To investigate the robustness of our Simplex-Aligned Diffusion framework, we conducted extensive ablation studies on two key hyperparameters: the scaling factor $\lambda$ and the label smoothing constant $\epsilon$. The results are summarized in Table~\ref{tab:sensitivity}.}

\begin{table}[ht]
\centering
\caption{{Sensitivity analysis of hyperparameters $\lambda$ and $\epsilon$ on HAM10000 and APTOS2019 datasets (Accuracy).}}
\vspace{8pt} % relieve crowding between the caption and the table
\label{tab:sensitivity}
\begin{tabular}{llccc}
\toprule
\textbf{Parameter} & \textbf{Dataset} & \textbf{Value 1} & \textbf{Optimal Value} & \textbf{Value 2} \\
\midrule
\multirow{2}{*}{Scaling ($\lambda$)} & HAM10000 & 0.8761 ($\lambda=1.0$) & \textbf{0.8940 ($\lambda=1.5$)} & 0.8864 ($\lambda=2.0$) \\
 & APTOS2019 & 0.8178 ($\lambda=1.0$) & \textbf{0.8480 ($\lambda=1.5$)} & 0.8424 ($\lambda=2.0$) \\
\midrule
\multirow{2}{*}{Smoothing ($\epsilon$)} & HAM10000 & 0.8818 ($10^{-2}$) & \textbf{0.8940 ($10^{-3}$)} & 0.8732 ($10^{-4}$) \\
 & APTOS2019 & 0.8333 ($10^{-2}$) & \textbf{0.8480 ($10^{-3}$)} & 0.8443 ($10^{-4}$) \\
\bottomrule
\end{tabular}
\end{table}

\paragraph{Scaling factor $\lambda$.} 
As shown in Table~\ref{tab:sensitivity}, model performance exhibits a clear single-peak behavior with respect to the scaling factor $\lambda$, achieving the best results at $\lambda = 1.5$ on both benchmarks. When moving away from this value, performance degrades smoothly rather than collapsing abruptly. Smaller $\lambda$ values under-utilize the simplex alignment, while excessively large values lead to more over-confident predictions, which slightly harms robustness on ambiguous samples. Overall, the observed trend indicates that the method is not overly sensitive to $\lambda$, and a fixed $\lambda = 1.5$ provides a stable and reproducible operating point.

\paragraph{Label smoothing $\epsilon$.} 
For label smoothing, $\epsilon = 10^{-3}$ consistently yields the best accuracy across both datasets. Larger $\epsilon$ ($10^{-2}$) overly softens the label distribution and reduces discriminative power, whereas smaller $\epsilon$ ($10^{-4}$) approaches hard one-hot labels and weakens the numerical stability of the CLR transformation. Importantly, performance varies smoothly across the tested range, suggesting that the model remains robust within reasonable smoothing strengths.

\paragraph{Learnable scaling.} 
We considered making $\lambda$ a learnable parameter, but opted for a fixed global scaling in our final design. Since $\lambda$ directly controls the geometry of the simplex-aligned logit transformation, learning it jointly with the model can introduce additional optimization instability and reduce interpretability. Given the smooth sensitivity trends, a fixed $\lambda$ offers a favorable balance between performance, stability, and ease of deployment.

\section{Statistical Stability over Multiple Runs}

To assess the statistical stability of our findings, we conducted three independent training and evaluation runs from scratch using different random seeds (e.g., 42, 123, 999). All results reported in this section represent the \textbf{Mean $\pm$ Standard Deviation} over these runs.

\subsection{Main Performance Stability}
As shown in Table~\ref{tab:stat_main}, our Simplex-Aligned Diffusion achieves higher accuracy than the primary baseline (DiffMIC-v2) on both benchmarks, while the F1-scores of the two methods are comparable (marginally higher for ours on HAM10000 and marginally lower on APTOS2019, with differences within one standard deviation). The small standard deviations across multiple runs indicate that our logit-space geometric regularization provides a stable optimization landscape, avoiding the sensitivity often associated with generative classifiers.

\begin{table}[ht]
\centering
\caption{Main performance comparison averaged over three independent runs (Mean $\pm$ Std).}
\label{tab:stat_main}
\begin{tabular}{llcc}
\toprule
\textbf{Dataset} & \textbf{Metric} & \textbf{DiffMIC-v2 (Baseline)} & \textbf{Ours (Simplex-Aligned)} \\
\midrule
HAM10000 & Accuracy $\uparrow$ & 0.8830 $\pm$ 0.0045 & \textbf{0.8932 $\pm$ 0.0051} \\
& F1-Score $\uparrow$ & 0.8233 $\pm$ 0.0101 & \textbf{0.8256 $\pm$ 0.0124} \\
\midrule
APTOS2019 & Accuracy $\uparrow$ & 0.8385 $\pm$ 0.0032 & \textbf{0.8476 $\pm$ 0.0025} \\
& F1-Score $\uparrow$ & \textbf{0.6687 $\pm$ 0.0028} & 0.6656 $\pm$ 0.0064 \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Robustness Stability and Failure Mode Analysis}
We further evaluate the stability of our model under clinical artifacts. Tables~\ref{tab:stat_ham} and~\ref{tab:stat_aptos} provide a detailed breakdown of performance degradation dynamics.

\textbf{Resistance to Mode Collapse.} 
A critical observation is the baseline's susceptibility to \textbf{mode collapse} under severe noise. For instance, in Table~\ref{tab:stat_aptos} under Shot Noise (Severity 5), the baseline's Cohen's Kappa remains near zero ($0.081 \pm 0.088$), indicating that its accuracy is largely driven by random guessing or majority-class prediction. In contrast, our method attains a higher mean Kappa ($0.194 \pm 0.233$), suggesting that it retains more discriminative power on average, although the large variance across runs in this extreme regime means that this difference should be interpreted with caution.

\textbf{Analysis of Shot Noise Instability.} 
Shot Noise exhibits the highest variance among all tested corruptions, reflecting the stochastic nature of Poisson noise in low-light sensors. Despite this inherent instability, our Simplex-Aligned strategy maintains a strictly superior mean accuracy across all severities (e.g., $0.527 > 0.375$ at S1; $0.447 > 0.390$ at S3; $0.433 > 0.243$ at S5). Our method exhibits graceful, monotonic degradation in expected performance, unlike the volatile fluctuations observed in the baseline.

\begin{table}[ht]
\centering
\caption{HAM10000 robustness evaluation (Mean $\pm$ Std over 3 Runs). {Our method exhibits generally superior metric stability and lower calibration error in all but one setting (Motion, severity 1).}}
\label{tab:stat_ham}
\vspace{5pt}
\footnotesize % use a smaller font size to keep the table within the page margins
\setlength{\tabcolsep}{5pt}
\begin{tabular}{lccccc}
\toprule
\textbf{Noise Type} & \textbf{Sev.} & \textbf{Method} & \textbf{Accuracy $\uparrow$} & \textbf{Kappa $\uparrow$} & \textbf{ECE $\downarrow$} \\
\midrule
\multirow{6}{*}{Defocus} & \multirow{2}{*}{1} & Base & 0.748 $\pm$ 0.009 & 0.369 $\pm$ 0.041 & 0.105 $\pm$ 0.025 \\
 & & Ours & \textbf{0.788 $\pm$ 0.008} & \textbf{0.518 $\pm$ 0.024} & \textbf{0.086 $\pm$ 0.023} \\
\cmidrule{2-6}
 & \multirow{2}{*}{3} & Base & 0.674 $\pm$ 0.018 & 0.083 $\pm$ 0.041 & 0.186 $\pm$ 0.047 \\
 & & Ours & \textbf{0.713 $\pm$ 0.009} & \textbf{0.279 $\pm$ 0.011} & \textbf{0.069 $\pm$ 0.028} \\
\cmidrule{2-6}
 & \multirow{2}{*}{5} & Base & 0.655 $\pm$ 0.040 & 0.049 $\pm$ 0.134 & 0.218 $\pm$ 0.051 \\
 & & Ours & \textbf{0.685 $\pm$ 0.003} & \textbf{0.238 $\pm$ 0.058} & \textbf{0.109 $\pm$ 0.019} \\
\midrule
\multirow{6}{*}{Motion} & \multirow{2}{*}{1} & Base & 0.837 $\pm$ 0.005 & 0.632 $\pm$ 0.023 & \textbf{0.067 $\pm$ 0.029} \\
 & & Ours & 0.837 $\pm$ 0.010 & \textbf{0.659 $\pm$ 0.021} & 0.098 $\pm$ 0.022 \\
\cmidrule{2-6}
 & \multirow{2}{*}{3} & Base & 0.718 $\pm$ 0.012 & 0.280 $\pm$ 0.037 & 0.122 $\pm$ 0.031 \\
 & & Ours & \textbf{0.742 $\pm$ 0.007} & \textbf{0.389 $\pm$ 0.023} & \textbf{0.062 $\pm$ 0.017} \\
\cmidrule{2-6}
 & \multirow{2}{*}{5} & Base & 0.681 $\pm$ 0.018 & 0.165 $\pm$ 0.041 & 0.161 $\pm$ 0.049 \\
 & & Ours & \textbf{0.699 $\pm$ 0.004} & \textbf{0.284 $\pm$ 0.045} & \textbf{0.090 $\pm$ 0.023} \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[ht]
\centering
\caption{APTOS2019 robustness evaluation (Mean $\pm$ Std over 3 Runs). {Note the robust discriminative power (Kappa) of our method even in extreme noise regimes.}}
\label{tab:stat_aptos}
\vspace{5pt}
\footnotesize
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lccccc}
\toprule
\textbf{Noise Type} & \textbf{Sev.} & \textbf{Method} & \textbf{Accuracy $\uparrow$} & \textbf{Kappa $\uparrow$} & \textbf{ECE $\downarrow$} \\
\midrule
\multirow{6}{*}{Shot} & \multirow{2}{*}{1} & Base & 0.375 $\pm$ 0.204 & \textbf{0.295 $\pm$ 0.179} & 0.287 $\pm$ 0.105 \\
 & & Ours & \textbf{0.527 $\pm$ 0.018} & 0.229 $\pm$ 0.150 & \textbf{0.220 $\pm$ 0.033} \\
\cmidrule{2-6}
 & \multirow{2}{*}{3} & Base & 0.390 $\pm$ 0.187 & 0.104 $\pm$ 0.195 & 0.225 $\pm$ 0.143 \\
 & & Ours & \textbf{0.447 $\pm$ 0.129} & \textbf{0.142 $\pm$ 0.133} & \textbf{0.203 $\pm$ 0.149} \\
\cmidrule{2-6}
 & \multirow{2}{*}{5} & Base & 0.243 $\pm$ 0.167 & 0.081 $\pm$ 0.088 & \textbf{0.352 $\pm$ 0.247} \\
 & & Ours & \textbf{0.433 $\pm$ 0.220} & \textbf{0.194 $\pm$ 0.233} & 0.410 $\pm$ 0.273 \\
\midrule
\multirow{6}{*}{Motion} & \multirow{2}{*}{1} & Base & 0.731 $\pm$ 0.083 & 0.774 $\pm$ 0.126 & 0.108 $\pm$ 0.035 \\
 & & Ours & \textbf{0.807 $\pm$ 0.007} & \textbf{0.855 $\pm$ 0.007} & \textbf{0.067 $\pm$ 0.008} \\
\cmidrule{2-6}
 & \multirow{2}{*}{3} & Base & 0.589 $\pm$ 0.035 & \textbf{0.643 $\pm$ 0.068} & 0.177 $\pm$ 0.043 \\
 & & Ours & \textbf{0.665 $\pm$ 0.031} & 0.638 $\pm$ 0.035 & \textbf{0.118 $\pm$ 0.009} \\
\cmidrule{2-6}
 & \multirow{2}{*}{5} & Base & 0.512 $\pm$ 0.040 & 0.201 $\pm$ 0.038 & 0.463 $\pm$ 0.082 \\
 & & Ours & \textbf{0.555 $\pm$ 0.060} & \textbf{0.424 $\pm$ 0.064} & \textbf{0.186 $\pm$ 0.053} \\
\bottomrule
\end{tabular}
\end{table}
\end{document}
