\documentclass{midl} 

\usepackage{amsmath}
\usepackage{amssymb} 
\usepackage{parskip} 
\usepackage{graphicx}
\usepackage{algorithm}
\usepackage{colortbl}
\usepackage{algpseudocode} 
% \usepackage[ruled,vlined]{algorithm2e}
\usepackage{threeparttable}
\usepackage{booktabs} 
\usepackage{multirow}
\usepackage{float}
\usepackage{placeins}
% \usepackage{hyperref} 
% \usepackage{mwe} 
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[Adaptive Inference for Medical Vision Transformers]{Adaptive Inference for Medical Vision Transformers: \\ Token Reduction or Early Exit?}

\midlauthor{
\Name{Ji Young Byun \midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{jbyun13@jhu.edu} \\
\Name{HyunSeo Lee \midlotherjointauthor \nametag{$^{1}$}} \Email{hlee267@jhu.edu} \\
\Name{Jordan Shuff \nametag{$^{1, 3, 4, 5}$}} \Email{jshuff1@jhu.edu} \\
\Name{Rengaraj Venkatesh \nametag{$^{6}$}} \Email{venkatesh@aravind.org} \\
\Name{Nakul S. Shekhawat\thanks{Corresponding authors} \nametag{$^{4}$}} \Email{nshekha1@jhmi.edu} \\
\Name{Kunal S. Parikh$^{\dagger}$ \nametag{$^{1, 3, 4, 5}$}} \Email{ksp@jhu.edu} \\
\Name{Rama Chellappa$^{\dagger}$ \nametag{$^{1, 2}$}} \Email{rchella4@jhu.edu}\\
\addr $^{1}$ Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA \\
\addr $^{2}$ Department of Electrical and Computer Engineering, Johns Hopkins University, Baltimore, MD, USA \\
\addr $^{3}$ Glaucoma Center of Excellence and Center for Nanomedicine, Wilmer Eye Institute, Johns Hopkins University School of Medicine, Baltimore, MD, USA \\
\addr $^{4}$ Wilmer Eye Institute, Johns Hopkins University School of Medicine, Baltimore, MD, USA \\
\addr $^{5}$ Center for Bioengineering Innovation \& Design, Johns Hopkins University, Baltimore, MD, USA \\
\addr $^{6}$ Aravind Eye Hospital, Pondicherry, India \\
}


\begin{document}

\maketitle

\begin{abstract}
Vision Transformers (ViTs) have demonstrated exceptional performance in medical image analysis, yet their computational demands hinder clinical deployment, particularly in time-sensitive applications. Medical imaging requires sample-adaptive optimization due to dataset heterogeneity across modalities and sample complexity; uniform strategies fail to balance efficiency and accuracy well. We propose a unified adaptive inference framework that combines Token Reduction (TR) and Early Exiting (EE) through dataset-specific profiling. Our approach quantifies spatial redundancy via Jensen-Shannon Divergence (JSD) and prediction confidence at intermediate layers to train a lightweight predictor that dynamically selects inference strategies at test time. Across five medical datasets, including a real-world cataract dataset (INSIGHT), our framework achieves 71.4\% average floating-point operations (FLOPs) reduction with only 0.1pp accuracy loss, substantially outperforming individual strategies (EE-only: 55.9\%, TR-only: 57.7\%). On PathMNIST, our adaptive inference framework simultaneously improves accuracy by 1.3pp while reducing computation by 77.2\%. On INSIGHT, we maintain baseline accuracy with 69.8\% FLOPs reduction, demonstrating robust real-world clinical applicability.
\end{abstract}

\begin{keywords}
Vision Transformers, Efficient Inference, Token Reduction, Early Exiting.
\end{keywords}


\section{Introduction}
\label{sec:intro}

Vision Transformers (ViTs) have achieved state-of-the-art performance across medical imaging tasks, including dermatological lesion classification~\citep{himel2024skin,al2025deep}, chest X-ray diagnosis~\citep{singh2024efficient}, histopathological tissue analysis~\citep{xu2023vision}, and ophthalmic image analysis~\citep{wu2023vision}, leveraging self-attention to capture long-range dependencies crucial for complex diagnostic tasks~\citep{dosovitskiy2021imageworth16x16words}. However, clinical deployment faces critical computational barriers in time-sensitive and resource-constrained settings. High-volume screening programs for diabetic retinopathy or cataract screening must process thousands of images daily~\citep{ruamviboonsuk2022real,tham2022detecting}, where even modest per-image latency accumulates into substantial computational burden. Point-of-care imaging on mobile devices~\citep{xu2025edge} operates under severe hardware constraints, making standard ViT models impractical for these scenarios.

Approaches to address computational inefficiency in deep learning divide into model-centric optimizations (quantization~\citep{li2023vit,du2024model}, compression~\citep{wang2022vtc,zhang2022minivit}, efficient attention~\citep{han2023flatten}) that apply static architectural changes, and data-centric methods that dynamically adapt to input characteristics. Two prominent data-centric strategies are TR~\citep{rao2021dynamicvit,liang2022not,bolya2022token}, which eliminates uninformative tokens, and EE~\citep{bakhtiarnia2022single,xu2023lgvit}, which terminates inference when samples achieve sufficient confidence. These data-centric approaches are particularly well-suited for medical imaging, where substantial variability exists both across and within modalities~\citep{kline2022multimodal}. In safety-critical medical applications, matching the appropriate strategy to dataset-specific characteristics is crucial, as suboptimal choices risk compromised diagnostic accuracy. 

To address this gap, we introduce a unified framework for adaptive strategy selection across diverse medical imaging datasets (ISIC2019, PathMNIST, PneumoniaMNIST, RetinaMNIST, INSIGHT). We calibrate dataset-specific thresholds: TR threshold via Jensen-Shannon Divergence between attention distributions, and EE confidence thresholds at checkpoints. At inference, a lightweight CNN predictor estimates redundancy from input images to activate TR while intermediate heads enable early termination based on confidence. This ensures redundant samples undergo TR, high-confidence cases EE, and complex cases receive full processing, maximizing efficiency without compromising diagnostic accuracy.

Our main contributions are:
\begin{itemize}
    \item \textbf{Unified Adaptive Framework:} We propose a unified framework that integrates TR and EE for ViTs, utilizing a lightweight predictor to adaptively activate TR based on input-specific spatial redundancy while leveraging confidence-based EE at intermediate layers, enabling instance-level optimization of both spatial and temporal redundancy.
    
    \item \textbf{Dataset-Specific Profiling Methodology:} We conduct comprehensive profiling analysis to characterize redundancy-complexity profiles through token-level similarity and sample-wise confidence distribution, revealing that optimal strategies vary across datasets.
    
    \item \textbf{Superior Efficiency-Accuracy Trade-offs:} Through comprehensive evaluation across five medical imaging datasets, our unified framework achieves 71.4\% average FLOPs reduction with 0.1pp average accuracy degradation, outperforming individual strategies (EE-only: 55.9\%, TR-only: 57.7\%).
\end{itemize}

\section{Methodology}
\label{sec:method}
This study proposes an integrated framework combining TR and EE to reduce computational cost in ViTs by adapting TR globally based on input image characteristics while using local prediction confidence for EE decisions. Our approach consists of three stages: (1) fine-tuning Data-efficient Image Transformer-Small (DeiT-S)~\citep{touvron2021trainingdataefficientimagetransformers} with EE heads on the training set, (2) profiling the validation set to calibrate TR and EE thresholds, and (3) deploying the unified strategy at test-time inference. Detailed dataset statistics are provided in Table~\ref{tab:dataset_splits}.


\subsection{Stage 1: Model Training with Early Exit Heads}
We fine-tune DeiT-S with 12 transformer blocks, attaching lightweight classifier heads at layers 4, 7, and 10. Each EE head operates on the CLS token using a two-layer MLP with layer normalization. We train all heads simultaneously using a weighted multi-exit loss:
\begin{equation}
L_{\text{total}} = w_{\text{final}} \cdot L_{\text{final}} + \sum_{k \in \{4, 7, 10\}} w_k \cdot L_k
\end{equation}
where $L_k$ is the cross-entropy loss at layer $k$. We set $w_4 = w_7 = w_{10} = 0.3 $ and $w_{\text{final}} = 1.0$. 

\subsection{Stage 2: Dataset-Specific Profiling for Strategy Selection}
We use the validation set to profile dataset-specific characteristics and calibrate thresholds for adaptive inference: (1) EE confidence thresholds $\theta_{\text{EE}}$ for each checkpoint, (2) ground truth redundancy scores for training the lightweight predictor, and (3) the redundancy threshold $\theta_{\text{R}}$ for TR activation.

\subsubsection{Early Exit Threshold Calibration}
We calibrate dataset-specific thresholds $\theta_{\text{EE}}$ by sweeping confidence values on the validation set, selecting thresholds that maximize FLOPs reduction while maintaining accuracy degradation $<$1\%. At inference, when confidence $c_k = \max(\text{Softmax}(\mathbf{z}_k))$ at layer $k$ exceeds $\theta_{\text{EE}}$, the sample is classified and inference terminates. Otherwise, computation continues to the next block.


\subsubsection{Spatial Redundancy Profiling}
To determine which samples benefit from TR, we quantify spatial redundancy using attention similarity. For each validation sample, we compute the ground truth redundancy score $y_{\text{red}}$ based on JSD between attention distributions:
\begin{equation}
y_{\text{red}} = 1 - \frac{1}{3}\left(\text{JSD}(\mathbf{a}_1, \mathbf{a}_4) + \text{JSD}(\mathbf{a}_4, \mathbf{a}_7) + \text{JSD}(\mathbf{a}_7, \mathbf{a}_{10})\right)
\end{equation}
where $\mathbf{a}_i$ represents the attention distribution at layer $i$. We select layers 1, 4, 7, and 10 to balance coverage and efficiency: early layers alone miss semantic patterns, late layers overlook initial redundancy, while sparser/denser sampling either misses dynamics or adds unnecessary overhead. High $y_{\text{red}}$ values (close to 1) indicate low divergence between attention patterns across layers, suggesting high spatial redundancy where tokens can be safely reduced. 


\subsubsection{Lightweight Redundancy Predictor Training}
To avoid the computational overhead of full forward passes at test time, we train a lightweight custom CNN predictor to estimate redundancy from input images. The \texttt{Score\_Predictor} consists of a feature extractor with three convolutional blocks. The blocks progressively increase channel depth ($32 \rightarrow 64 \rightarrow 128$) using $3 \times 3$ convolutions (stride 2), followed by Batch Normalization and ReLU activation. It predicts $\hat{y}_{\text{red}} \in [0, 1]$ from input image $\mathbf{x} \in \mathbb{R}^{H \times W \times C}$ using mean squared error loss:
\begin{equation}
\mathcal{L}_{\text{pred}} = \frac{1}{N}\sum_{i=1}^{N}(\hat{y}_{\text{red}}^{(i)} - y_{\text{red}}^{(i)})^2
\end{equation}

\subsubsection{Token Reduction Threshold Calibration}
We establish the TR activation threshold $\theta_{\text{R}}$ by analyzing the distribution of predicted redundancy scores on the separate validation set. For each validation sample, we first calculate accuracy–FLOPs pairs under identical EE settings for two scenarios: one without TR and one with TR applied at every checkpoint. We then sweep candidate thresholds, using the predicted redundancy score $\hat{y}_{\mathrm{red}}$ to assign each sample to one of the two paths, and sum the previously calculated values to estimate overall accuracy and cost. We select the smallest threshold that maintains accuracy within 1\% of the all-token baseline while maximizing FLOPs reduction. At test time, if $\hat{y}_{\text{red}} > \theta_{\text{R}}$, TR is activated after each EE checkpoint (layers 4, 7, and 10).

\subsection{Stage 3: Unified Inference at Test Time}
The complete inference pipeline for each test sample:
\begin{enumerate}
    \item The \texttt{Score\_Predictor} estimates $\hat{y}_{\text{red}}$ and sets TR activation flag: \texttt{use\_tr} = $\hat{y}_{\text{red}} > \theta_{\text{R}}$
    \item The ViT processes the image layer-by-layer through the 12 transformer blocks
    \item At each checkpoint (layers 4, 7, 10):
    \begin{itemize}
        \item The EE head computes confidence $c_k = \max(\text{Softmax}(\mathbf{z}_k))$
        \item If $c_k > \theta_{\text{EE}, k}$, inference terminates and returns $\arg\max(\mathbf{z}_k)$
        \item If TR is activated (\texttt{use\_tr = True}) and $c_k \leq \theta_{\text{EE}, k}$, apply TR before continuing
    \end{itemize}
    \item If no EE occurs, the final head at layer 12 produces the classification
\end{enumerate}
This unified pipeline enables dataset-adaptive optimization: spatially redundant samples undergo TR, high-confidence samples EE, and challenging samples process through all layers. Algorithm~\ref{alg:unified_inference_visual} formalizes this complete inference procedure.

\section{Results and Discussion}
\subsection{Experimental Setup}
We evaluate our framework across five datasets, four public and one private: ISIC2019~\citep{tschandl2018ham10000,codella2018skin,hernandez2024bcn20000} (9-class skin lesion classification), PathMNIST~\citep{yang2023medmnist} (9-class colon tissue classification), PneumoniaMNIST (2-class pneumonia detection), RetinaMNIST (5-class diabetic retinopathy grading), and INSIGHT, a private dataset of anterior segment eye images (4-class cataract classification). All images were resized to $224 \times 224$ pixels using bicubic interpolation. For training, we applied data augmentation: random affine transformations (rotation range $\pm {10}^{\circ}$, translation up to $10\%$), autocontrast ($p=0.5$), and horizontal flipping. For testing and validation, images were resized to $224 \times 224$. DeiT-S~\citep{touvron2021trainingdataefficientimagetransformers} was trained with the AdamW optimizer~\citep{loshchilov2017decoupled} (learning rate: $5 \times 10^{-4}$, batch size: 64, epochs: 50). 


\subsection{Dataset Redundancy and Complexity Analysis}
We profile dataset-specific redundancy using token-level cosine similarity across DeiT-S layers (Figure~\ref{fig:profiling_a}). RetinaMNIST and ISIC2019 exhibit high initial similarity ($\sim$0.8), while INSIGHT, PathMNIST, and PneumoniaMNIST start at moderate similarity ($\sim$0.6). All datasets show monotonically increasing similarity with depth, confirming progressive feature homogenization~\citep{zhou2021deepvitdeepervisiontransformer,wang2022anti}.

\begin{figure}[h]
    \centering
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_1/cosine_similarity_1.png}
        \label{fig:profiling_a}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_1/confidence_evolution_1.png}
        \label{fig:profiling_b}
    }
    \caption{\textbf{Dataset redundancy and complexity analysis across DeiT-S layers.} (a) Token-level redundancy across layer transitions. RetinaMNIST and ISIC2019 show high initial similarity ($\sim$0.8); others start lower ($\sim$0.6). All exhibit monotonically increasing similarity. (b) Sample-wise complexity at decision layers (4, 7, 10, 12) showing 90th percentile (easy, lighter) and 10th percentile (hard, darker) confidence. PathMNIST and PneumoniaMNIST achieve high early confidence with minimal easy-hard gaps. RetinaMNIST and INSIGHT show persistent gaps.}
    \label{fig:profiling}
\end{figure}

Figure~\ref{fig:profiling_b} shows layer-wise confidence evolution at decision layers (4, 7, 10, 12). PathMNIST and PneumoniaMNIST achieve high early confidence ($>$0.8 by layer 4) with minimal easy-hard gaps at layer 12, indicating uniform sample complexity. Conversely, RetinaMNIST and INSIGHT start with low confidence and maintain substantial easy-hard gaps through layer 12, reflecting diverse sample complexity. These profiles necessitate adaptive strategy selection: datasets with high early confidence and small gaps (PathMNIST, PneumoniaMNIST) suit aggressive EE, while those with persistent gaps (RetinaMNIST, INSIGHT) benefit more from TR or conservative thresholds.


\subsection{Dataset-Specific Profiling for Strategy Selection}

\subsubsection{Early Exit Threshold Calibration}
To determine dataset-specific optimal EE thresholds for subsequent experiments, we perform validation set profiling by sweeping $\theta_{\text{EE}} \in [0.5, 0.95]$ to maximize FLOPs reduction while constraining accuracy loss to $<1\%$. Figure~\ref{fig:ee} illustrates the performance-efficiency trade-offs for PneumoniaMNIST and INSIGHT (remaining datasets in Figure~\ref{fig:ee_append}).

\begin{figure}[h]
    \centering
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_2/pneumoniamnist_EE_tradeoff_1.png}
        \label{fig:ee_a}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_2/insight_EE_tradeoffpng_1.png}
        \label{fig:ee_b}
    }
    \caption{\textbf{Performance-efficiency trade-offs of early exiting.} Accuracy (green), FLOPs reduction (blue), and exit layer (pink) vs. confidence threshold. (a) PneumoniaMNIST achieves 87\% FLOPs reduction at $\theta_{\text{EE}}=0.80$ (average exit layer 1.58, $-0.57$pp from baseline). (b) INSIGHT requires $\theta_{\text{EE}}=0.75$ (average exit layer 3.24, $-0.61$pp from baseline) for 73\% FLOPs reduction.}    
    \label{fig:ee}
\end{figure}

PneumoniaMNIST (Figure~\ref{fig:ee_a}) achieves 87\% FLOPs reduction at $\theta_{\text{EE}}=0.80$ (average exit layer 1.58, $-0.57$pp from baseline). INSIGHT (Figure~\ref{fig:ee_b}) requires $\theta_{\text{EE}}=0.75$, with an average exit layer of 3.24, for 73\% reduction ($-0.61$pp from baseline). This disparity confirms that datasets with high early-layer confidence enable aggressive EE, while those with low early confidence and diverse sample complexity require deeper inference. Based on this profiling, we select: $\theta_{\text{EE}}^{\text{Retina}}=0.78$, $\theta_{\text{EE}}^{\text{Pneumonia}}=0.80$, $\theta_{\text{EE}}^{\text{INSIGHT}}=0.75$, $\theta_{\text{EE}}^{\text{ISIC}}=0.65$, $\theta_{\text{EE}}^{\text{Pathology}}=0.60$.


\subsubsection{Token Reduction Keep Rate Selection}

Figure~\ref{fig:tr} presents the accuracy-efficiency trade-offs of various TR strategies---random pruning, Top-K pruning, EViT~\citep{liang2022not}, A-ViT~\citep{yin2022vit}, and Token Merging (ToMe)~\citep{bolya2022token}---as a function of average token count across all DeiT-S layers, revealing substantial differences in robustness across datasets.

PathMNIST demonstrates resilience (Figure~\ref{fig:tr_a}): all strategies maintain performance above 99.6\%, even with aggressive reduction to 16 tokens (ToMe) versus the 99.91\% baseline. Random, Top-K, and EViT maintain $\sim$99.9\% accuracy across all token budgets. INSIGHT shows distinct sensitivity (Figure~\ref{fig:tr_b}): while Random pruning and EViT preserve $\sim$87\% accuracy down to 76 tokens, Top-K drops sharply to 84.0\% at 100 tokens (3.2pp loss from the 87.2\% baseline). This validates Figure~\ref{fig:profiling_a}: PathMNIST's higher token similarity indicates more redundant spatial information, which is precisely what TR exploits: when tokens are similar, fewer are needed to preserve discriminative features.

Validation set profiling identifies EViT as the most stable strategy with minimal sensitivity to token budgets (Figure~\ref{fig:tr_append}). The optimal keep rates (the proportion of tokens preserved at each layer) vary: ISIC2019/RetinaMNIST: 0.3, PneumoniaMNIST: 0.4, INSIGHT: 0.5, PathMNIST: 0.7. For fair comparison across TR-only and TR+EE configurations, we standardize at keep rate 0.4 for all experiments. At 40 retained tokens, EViT delivers a consistent $\sim$56\% FLOPs reduction (2.027 vs. 4.608 GFLOPs baseline) with competitive accuracy: PathMNIST 99.89\% ($-0.02$pp from the 99.91\% baseline), PneumoniaMNIST 97.9\% ($+2.0$pp), INSIGHT 86.3\% ($-0.9$pp), RetinaMNIST 60.0\% ($+1.0$pp), ISIC2019 56.78\% ($+2.6$pp).

\begin{figure}[h!]
    \centering
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_3/PathMNIST_1.png}
        \label{fig:tr_a}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_3/INSIGHT_1.png}
        \label{fig:tr_b}
    }
    \caption{\textbf{Token reduction strategy comparison across medical imaging datasets.} X-axis: average token count; y-axis: accuracy (\%, left) and FLOPs reduction (\%, right, purple). Methods: Top-K (red circles), EViT (blue squares), Random (gray crosses), ToMe (green triangles). Gray dashed line: baseline accuracy. (a) PathMNIST: All strategies maintain $>99.7\%$ accuracy at 40 tokens; EViT achieves 99.89\% ($-0.02$pp). (b) INSIGHT: EViT shows superior stability, maintaining 86.3\% at 40 tokens ($-0.9$pp), while Top-K and Random exhibit higher variance. ToMe collapses below 76 tokens.}
    \label{fig:tr}
\end{figure}


\subsection{Lightweight Redundancy Predictor Performance}

Table~\ref{tab:predictor_performance} demonstrates that the lightweight \texttt{Score\_Predictor} effectively approximates spatial redundancy. RetinaMNIST, PathMNIST, and INSIGHT achieve strong performance (MAE $\leq 0.082$, Pearson $R \geq 0.70$), while ISIC2019 and PneumoniaMNIST show weaker correlations (Pearson $R = 0.48$ and $0.31$) due to narrower dynamic ranges in their redundancy distributions. Nevertheless, the predictor remains sufficiently accurate for coarse separation between low- and high-redundancy samples to trigger TR decisions. The calibrated thresholds reveal dataset-specific regimes: RetinaMNIST exhibits the highest threshold ($\theta_{\text{R}} = 0.9431$) to preserve subtle vascular patterns, while PathMNIST and INSIGHT adopt permissive thresholds ($\theta_{\text{R}} = 0.0870$ and $0.1883$) consistent with higher baseline redundancy, enabling dataset-aware efficiency without expensive redundancy estimation at test time. 


\begin{table}[h]
\centering
\caption{\textbf{Lightweight redundancy predictor performance on validation data.} The \texttt{Score\_Predictor} estimates spatial redundancy scores to determine TR activation. Mean absolute error (MAE), Pearson R, and $R^2$ evaluate prediction performance. Optimal thresholds $\theta_{\text{R}}$ are calibrated per dataset to balance computational savings and accuracy.}
\label{tab:predictor_performance}
\hspace{0.07\columnwidth}
\resizebox{0.9\columnwidth}{!}{
    \begin{tabular}{l|cccc}
    \toprule
    \textbf{Dataset} & \textbf{MAE ($\downarrow$)} & \textbf{Pearson R ($\uparrow$)} & $\mathbf{R^2}$ \textbf{($\uparrow$)} & \textbf{Optimal Threshold ($\theta_{\text{R}}$)} \\
    \midrule
    ISIC 2019 & 0.1352 & 0.4756 & 0.2124 & 0.1952 \\
    PneumoniaMNIST & 0.1631 & 0.3055 & 0.0730 & 0.2988 \\
    RetinaMNIST & 0.0804 & 0.7917 & 0.3567 & 0.9431 \\
    PathMNIST & 0.0635 & 0.7039 & 0.4701 & 0.0870 \\
    INSIGHT & 0.0818 & 0.7408 & 0.4932 & 0.1883 \\
    \bottomrule
    \end{tabular}
    }
\end{table}



\subsection{Unified Framework Performance}

Table~\ref{tab:main} compares our unified TR+EE framework against baseline and individual strategies across five datasets. By combining spatial and depth-wise pruning, our approach maintains near-baseline accuracy while processing an average of only 46.0 tokens with an average exit layer of 6.5.

The results highlight distinct behaviors across datasets, validating the need for dataset-specific profiling. PathMNIST achieves 96.0\% accuracy (+1.3pp), as TR removes redundant background tokens enabling EE to focus on salient tissue structures and exit early (avg. layer 3.0). Similarly, the real-world INSIGHT dataset maintains baseline accuracy (86.2\%), exploiting the high spatial redundancy typical of anterior segment imaging (avg. 29.2 tokens).

\FloatBarrier
% Annotation macros for the results tables. Each typesets its argument as a
% small, colored, parenthesized delta that follows a table value:
%   \loss{<text>}  ->  teal "(down-arrow <text>)", used for values below baseline
%   \gain{<text>}  ->  red  "(up-arrow <text>)",   used for values above baseline
% The arrows are written as math-mode commands rather than raw Unicode
% characters so the macros do not depend on the engine or input-encoding setup.
% NOTE(review): the `!` color-mix syntax needs xcolor; presumably the document
% class provides it -- confirm if the preamble changes.
\newcommand*{\loss}[1]{\textcolor{teal!60!black}{\scriptsize\,($\downarrow$\,#1)}}
\newcommand*{\gain}[1]{\textcolor{red!70!black}{\scriptsize\,($\uparrow$\,#1)}}

\begin{table}[h]
\centering
\caption{\textbf{Unified Framework Performance Across Datasets.} All methods use DeiT-S backbone. Baseline: 196 tokens, 12 layers. EE-only: 196 tokens with dynamic EE (dataset-specific $\theta_{\text{EE}}$). TR-only: EViT with 40 tokens across all layers. A-ViT: state-of-the-art adaptive ViT with 196 tokens. TR+EE (Ours, shaded): combines TR and EE. Best results in \textbf{bold}.}
\label{tab:main}
\hspace{0.07\columnwidth}
\resizebox{0.9\columnwidth}{!}{
\begin{tabular}{l|l|ccc}
\toprule
\textbf{Dataset} & \textbf{Strategy} & \textbf{Accuracy (\%)} & \textbf{Avg Tokens} & \textbf{Avg Exit Layer}  \\
\midrule
\multirow{5}{*}{\textbf{ISIC2019}} 
& Baseline & 54.2 & 196 & 12.0  \\
& EE-only & \textbf{56.8} \gain{2.6pp} & 196 & \textbf{9.26} \\
& TR-only & \textbf{56.8} \gain{2.6pp} & 40 & 12.0  \\
& A-ViT & \textbf{56.8} \gain{2.6pp} & 196 & 12  \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}51.7\,\loss{2.5pp} & \cellcolor{blue!8}\textbf{20.8} & \cellcolor{blue!8}10.3 \\
\midrule
\multirow{5}{*}{\textbf{PneumoniaMNIST}} 
& Baseline & 90.1 & 196 & 12.0 \\
& EE-only & 87.8 \loss{2.3pp} & 196 & \textbf{1.86}  \\
& TR-only & \textbf{92.1} \gain{2.0pp} & \textbf{40} & 12.0 \\
& A-ViT & 92.0 \gain{1.9pp} & 196 & 12 \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}88.8\,\loss{1.3pp} & \cellcolor{blue!8}56.3 & \cellcolor{blue!8}4.65 \\
\midrule
\multirow{5}{*}{\textbf{RetinaMNIST}} 
& Baseline & 59.0 & 196 & 12.0 \\
& EE-only & \textbf{61.0}\,\gain{2.0pp} & 196 & \textbf{5.45} \\
& TR-only & 54.5 \loss{4.5pp} & \textbf{40} & 12.0  \\
& A-ViT & 57.0 \loss{2.0pp} & 196 & 12.0  \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}60.8\,\gain{1.8pp} & \cellcolor{blue!8}44.8 & \cellcolor{blue!8}7.7 \\
\midrule
\multirow{5}{*}{\textbf{PathMNIST}} 
& Baseline & 94.7 & 196 & 12.0 \\
& EE-only & 94.4\,\loss{0.3pp} & 196 & 11 \\
& TR-only & 93.5\,\loss{1.2pp} & \textbf{40} & 12.0 \\
& A-ViT & 92.0 \loss{2.7pp} & 196 & 12.0  \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}\textbf{96.0}\,\gain{\textbf{1.3pp}} & \cellcolor{blue!8}79 & \cellcolor{blue!8}\textbf{3.0}  \\
\midrule
\multirow{5}{*}{\textbf{INSIGHT}} 
& Baseline & 86.1 & 196 & 12.0 \\
& EE-only & \textbf{87.4} \gain{1.3pp} & 196 & 7.04  \\
& TR-only & 85.5 \loss{0.6pp} & 40 & 12.0  \\
& A-ViT & 84.9 \loss{1.2pp} & 196 & 12.0  \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}86.2\,\gain{0.1pp} & \cellcolor{blue!8}\textbf{29.2} & \cellcolor{blue!8}\textbf{6.9} \\
\midrule
\midrule
\multicolumn{5}{c}{\textit{\textbf{Average Performance Across All Datasets}}} \\
\midrule
& EE-only & \textbf{$+0.7$pp} & 196 & 7.0  \\
& TR-only & $-0.3$pp & \textbf{40} & 12.0  \\
& A-ViT & $-0.3$pp & 196 & 12.0  \\
& \cellcolor{blue!8}\textbf{TR+EE } & \cellcolor{blue!8}$-0.1$pp & \cellcolor{blue!8}46.0 & \cellcolor{blue!8}\textbf{6.5} \\
\bottomrule
\end{tabular}
}
\vspace{0.3cm}
\end{table}

However, ISIC2019 and PneumoniaMNIST show minor degradation. ISIC2019's 2.5pp loss likely stems from aggressive TR (avg. 20.8 tokens) applied over a deep inference path (avg. exit layer 10.3), which eliminates subtle fine-grained texture that the model may rely on for diagnosis. PneumoniaMNIST's 1.3pp loss indicates premature termination on subtle cases. While TR-only improved accuracy (92.1\%), the unified framework's aggressive EE prevents token-reduced representations from reaching deeper layers where they could recover performance. Both cases demonstrate the inherent trade-off: achieving extreme architectural sparsity requires a careful balance between the speed of termination and the preservation of fine-grained pathological features.

Per-class analysis (Table~\ref{tab:per_class_metrics}) reveals that our framework can improve sensitivity beyond baseline for diagnostically challenging classes, such as PathMNIST's mucus (98.16\%) and cancer-associated stroma (72.92\%), demonstrating that adaptive inference enhances performance on complex tissue types.

We conducted an ablation study on backbone architecture by evaluating ViT-S (Table~\ref{tab:main_vit}). TR+EE achieves 54.1\% to 76.4\% FLOPs reduction across five datasets with accuracy changes ranging from $-4.5$pp to $+0.2$pp relative to baseline. These results suggest that architectural characteristics influence the efficiency-accuracy trade-off, informing model selection for clinical deployment with varying computational and accuracy requirements.

\subsection{Computation Cost Analysis}
Table~\ref{tab:computation} evaluates the computational cost of our unified framework across five datasets. Our approach achieves a significant reduction in algorithmic complexity, averaging a 71\% decrease in GFLOPs across all datasets. This substantially outperforms A-ViT, which only achieves an average reduction of 28.9\%. These results highlight the advantages of combining token reduction to lower the cost per layer with early exiting to reduce the total depth of the network. 

\begin{table*}[h!]
\centering
\caption{\textbf{Computation Cost Analysis Across Datasets.} Comparison of theoretical complexity (FLOPs), real-world latency, and energy consumption. All methods use DeiT-S backbone. Speedup is calculated relative to the Baseline of each dataset. Best results in \textbf{bold}.}
\label{tab:computation}
\hspace{0.01\textwidth}
\begin{minipage}{\textwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{l|l|cccc}
\toprule
\textbf{Dataset} & \textbf{Strategy} & \textbf{FLOPs (G)} & \textbf{Latency (ms)} & \textbf{Energy (mJ)} & \textbf{Speedup} \\
\midrule
\multirow{5}{*}{\textbf{ISIC2019}} 
& Baseline & 4.61 & 1.201 & 9.776 & 1.00$\times$ \\
& EE-only & 2.616 \loss{43.3\%} & 1.252 \gain{4.2\%} & 10.552 \gain{8.0\%} & 0.96$\times$ \\
& TR-only & 2.027 \loss{56.0\%} & \textbf{0.552} \loss{54.0\%} & \textbf{4.654} \loss{52.3\%} & \textbf{2.18$\times$} \\
& A-ViT & 3.742 \loss{18.8\%} & 1.431 \gain{19.2\%} & 11.724 \gain{20.0\%} & 0.84$\times$ \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}\textbf{1.569} \loss{\textbf{66.0\%}} & \cellcolor{blue!8}0.629 \loss{47.6\%} & \cellcolor{blue!8}4.886 \loss{50.0\%} & \cellcolor{blue!8}1.91$\times$ \\
\midrule
\multirow{5}{*}{\textbf{PneumoniaMNIST}} 
& Baseline & 4.61 & 1.188 & 9.404 & 1.00$\times$ \\
& EE-only & \textbf{0.776} \loss{\textbf{83.2\%}} & \textbf{0.396} \loss{\textbf{66.7\%}} & \textbf{3.355} \loss{\textbf{64.3\%}} & \textbf{3.00$\times$} \\
& TR-only & 2.027 \loss{56.0\%} & 0.543 \loss{54.3\%} & 4.324 \loss{54.0\%} & 2.19$\times$ \\
& A-ViT & 3.032 \loss{34.2\%} & 1.414 \gain{19.0\%} & 11.250 \gain{19.6\%} & 0.84$\times$ \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}1.204 \loss{73.9\%} & \cellcolor{blue!8}0.575 \loss{51.6\%} & \cellcolor{blue!8}4.503 \loss{52.1\%} & \cellcolor{blue!8}2.07$\times$ \\
\midrule
\multirow{5}{*}{\textbf{RetinaMNIST}} 
& Baseline & 4.61 & 1.189 & 9.116 & 1.00$\times$ \\
& EE-only & 1.662 \loss{63.9\%} & 0.812 \loss{31.7\%} & 199.592 \gain{2089.4\%} & 1.46$\times$ \\
& TR-only & 2.027 \loss{56.0\%} & \textbf{0.546} \loss{54.1\%} & \textbf{4.415} \loss{51.6\%} & \textbf{2.18$\times$} \\
& A-ViT & 4.320 \loss{6.3\%} & 1.420 \gain{19.4\%} & 353.233 \gain{3774.9\%} & 0.84$\times$ \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}\textbf{1.398} \loss{\textbf{70.0\%}} & \cellcolor{blue!8}0.620 \loss{47.9\%} & \cellcolor{blue!8}4.853 \loss{46.8\%} & \cellcolor{blue!8}1.92$\times$ \\
\midrule
\multirow{5}{*}{\textbf{PathMNIST}} 
& Baseline & 4.61 & 1.225 & 12.401 & 1.00$\times$ \\
& EE-only & 3.049 \loss{33.9\%} & 1.335 \gain{9.0\%} & 2.098 \loss{83.1\%} & 0.92$\times$ \\
& TR-only & 2.027 \loss{56.0\%} & 0.575 \loss{53.1\%} & 4.979 \loss{59.9\%} & 2.13$\times$ \\
& A-ViT & 2.386 \loss{48.2\%} & 1.468 \gain{19.8\%} & 12.611 \gain{1.7\%} & 0.83$\times$ \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}\textbf{1.050} \loss{\textbf{77.2\%}} & \cellcolor{blue!8}\textbf{0.475} \loss{61.2\%} & \cellcolor{blue!8}\textbf{4.243} \loss{65.8\%} & \cellcolor{blue!8}\textbf{2.58$\times$} \\
\midrule
\multirow{5}{*}{\textbf{INSIGHT}} 
& Baseline & 4.61 & 1.196 & 9.776 & 1.00$\times$ \\
& EE-only & 2.061 \loss{55.3\%} & 2.129 \gain{78.0\%} & 9.796 \gain{0.2\%} & 0.56$\times$ \\
& TR-only & 1.633 \loss{64.6\%} & \textbf{0.548} \loss{54.1\%} & \textbf{4.486} \loss{54.1\%} & \textbf{2.18$\times$} \\
& A-ViT & 2.902 \loss{37.0\%} & 1.421 \gain{18.7\%} & 12.561 \gain{28.5\%} & 0.84$\times$ \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}\textbf{1.394} \loss{\textbf{69.8\%}} & \cellcolor{blue!8}0.631 \loss{47.2\%}  & \cellcolor{blue!8}4.568 \loss{53.3\%} & \cellcolor{blue!8}1.90$\times$ \\
\bottomrule
\end{tabular}
}
\end{minipage}
\end{table*}

Although edge device deployment involves distinct engineering such as quantization, we provide GPU latency and energy measurements to demonstrate tangible efficiency gains and to validate improvements in computational efficiency. Our framework achieves a 2.07$\times$ average speedup over the baseline and a decrease in latency of 51.2\%. We also achieve an average of 54.3\% energy reduction while maintaining highly stable consumption of 4.6 mJ across all modalities. This stability is particularly evident in RetinaMNIST, where both EE-only and A-ViT exhibit a sharp increase in energy, reaching 199.6 mJ and 353.2 mJ respectively. 

\subsection{Visualization and Failure Case Analysis}

Figure~\ref{fig:vis} visualizes the adaptive framework behavior on representative samples from INSIGHT and PathMNIST datasets. Figure~\ref{fig:vis_a} demonstrates sequential TR across all checkpoints with 40\% retention rate (keep rate = 0.4), where the model processes all layers for prediction while successfully preserving informative regions around the pupil even at the final checkpoint. Figure~\ref{fig:vis_b} shows a case where TR activates at the first checkpoint followed by EE, as the clear evidence of mature cataract at the pupil enables confident prediction without deeper processing. Similarly, Figure~\ref{fig:vis_c} illustrates a PathMNIST sample where TR activation at the first checkpoint is followed by EE, demonstrating the framework's ability to adaptively combine both efficiency strategies based on sample characteristics.

\begin{figure}[h]
    \centering
    \subfigure[]{
        \includegraphics[width=0.95\textwidth]{figs/fig_4/insight_TR_Max_Late.png}
        \label{fig:vis_a}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_4/insight_TR_Mid_EE.png}
        \label{fig:vis_b}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_4/pathmnist_TR_Mid_EE_1.png}
        \label{fig:vis_c}
    }
    \caption{\textbf{Visualization of adaptive framework on medical imaging samples.} (a) TR-only with sequential token reduction maintaining diagnostic regions across all layers. (b), (c) Combined TR+EE examples from INSIGHT and PathMNIST.}    
    \label{fig:vis}
\end{figure}
Figure~\ref{fig:failure} illustrates the primary failure modes encountered by our unified framework, specifically within the ISIC2019 and PneumoniaMNIST datasets. In ISIC2019, the performance degradation is primarily driven by extreme class imbalance, where the dominant nevus class (50.8\%) biases threshold calibration toward majority features. As shown in Figure~\ref{fig:fail_a}, aggressive token reduction prunes subtle diagnostic textures critical for rare classes like dermatofibroma (0.9\% of the dataset), leading to a 10.78pp specificity gain in majority classes but a sensitivity drop in rare classes. Conversely, PneumoniaMNIST failures are largely attributed to model overconfidence. Figures~\ref{fig:fail_b} and \ref{fig:fail_c} show how the early exit mechanism triggers on erroneous high-confidence predictions at layers 4 or 7. This overconfidence is compounded by the TR and EE interaction, where token-reduced representations (averaging 56.3 tokens) require the deeper architectural processing of all 12 layers to maintain accuracy, yet the EE mechanism terminates inference at an average layer of 4.65.  

\begin{figure}[h]
    \centering
    \subfigure[]{
        \includegraphics[width=0.95\textwidth]{figs/fig_5/isic_derma_fail.png}
        \label{fig:fail_a}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.38\textwidth]{figs/fig_5/pneumonia_3_fail.png}
        \label{fig:fail_b}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.55\textwidth]{figs/fig_5/pneumonia_6_fail.png}
        \label{fig:fail_c}
    }
    \caption{\textbf{Qualitative failure-case analysis of adaptive framework.} (a) ISIC Dermatofibroma sample with misclassification. (b), (c) Overconfidence failures in PneumoniaMNIST where normal scans were misidentified due to premature termination at layers 4 and 7, respectively.}
    \label{fig:failure}
\end{figure}

\section{Conclusion}
\label{sec:conclusion}
This work introduces a unified framework that integrates TR and EE for ViT inference in medical imaging. We calibrate dataset-specific thresholds: prediction confidence for EE ($\theta_{\text{EE}}$) and spatial redundancy via JSD for TR ($\theta_{\text{R}}$). At test time, a lightweight CNN predictor estimates sample-level redundancy to activate TR, while intermediate classifier heads enable EE based on confidence. Across five diverse datasets, our framework achieves 71.4\% average FLOPs reduction while maintaining diagnostic accuracy within 0.1pp of baseline, substantially outperforming individual strategies (EE-only: 55.9\%; TR-only: 57.7\%).

\textit{Clinical Impact:} Medical AI deployment demands both accuracy and efficiency, particularly in resource-constrained and time-sensitive settings. Our framework achieves substantial efficiency gains without compromising performance on both public benchmarks and real-world clinical data with inherent quality variability. This approach can enable broader access to diagnostic AI where hardware resources are scarce but patient need is greatest.

\textit{Limitations:} Our framework requires dataset-specific profiling to calibrate thresholds, but this one-time overhead maximizes test-time efficiency without recurring costs. Beyond theoretical FLOPs reductions, validation on edge devices is necessary to assess actual latency improvements.


\clearpage  
\midlacknowledgments{We acknowledge support from the National Eye Institute (P30EY001765, R21EY034343), VentureWell Propel Award, Microsoft Acceleration Award, Stephen F Raab and Mariellen Brickley-Raab Rising Professorship in Ophthalmology, Johns Hopkins University, and the National Academy of Medicine. In addition, funds to support this AITC study were provided by the Johns Hopkins University AITC under award number P30AG073104. Ji Young Byun was supported in part by a discretionary fund at Johns Hopkins University's Whiting School of Engineering.}
\bibliography{midl26_177}

\clearpage
\appendix

\renewcommand{\thesection}{A\arabic{section}}
\renewcommand{\thefigure}{A\arabic{figure}}
\renewcommand{\thetable}{A\arabic{table}}
\renewcommand{\thealgocf}{A\arabic{algocf}}
\setcounter{figure}{0}
\setcounter{table}{0}
\setcounter{algocf}{0}

\begin{table}[h]
\centering
\caption{Dataset statistics and data split information. The validation set is split into two equal parts: 50\% for dataset-specific profiling and lightweight predictor training, and 50\% for token reduction threshold calibration.}
\label{tab:dataset_splits}
\resizebox{0.6\textwidth}{!}{
    \begin{tabular}{lccc}
    \toprule
    \textbf{Dataset} & \textbf{Train} & \textbf{Validation} & \textbf{Test} \\
    \midrule
    ISIC2019 & 1,791 & 448 & 118 \\
    PneumoniaMNIST & 4,708 & 524 & 624 \\
    RetinaMNIST & 1,080 & 120 & 400 \\
    PathMNIST & 89,996 & 10,004 & 7,180 \\
    INSIGHT & 5,256 & 657 & 657 \\
    \bottomrule
    \end{tabular}
}
\end{table}

\textit{INSIGHT Ethics Statement:}
After obtaining informed consent, community health workers collected smartphone-based images of patients attending community eye screenings. Diagnosis labels for each image were obtained using clinical diagnoses made via pen light examination by ophthalmologists at the same screening. The study was approved by the Institutional Review Boards of Aravind Eye Hospital and the Johns Hopkins University School of Medicine.

\clearpage
\begin{algorithm}[h]
\caption{Unified Adaptive Inference Pipeline (TR + EE)}
\label{alg:unified_inference_visual}
\SetAlgoLined
\DontPrintSemicolon
\KwIn{Input Image $\mathbf{x}$, Redundancy Threshold $\theta_{\text{R}}$, EE Thresholds $\{\theta_{\text{EE}}\}$, \\
\hspace{3.2em} Token Keep Rate $r$}
\KwOut{Prediction $\hat{y}$, Exit Layer $k^*$}
\BlankLine
\tcp{Stage 1: Adaptive Token Reduction (TR) Activation}
$\hat{y}_{\text{red}} \gets \texttt{Score\_Predictor}(\mathbf{x})$ \tcp*{Predict redundancy score}
$\texttt{use\_tr} \gets (\hat{y}_{\text{red}} > \theta_{\text{R}})$ \tcp*{Activate TR if redundant}
$\mathbf{Z} \gets \text{Initial Tokens}$ with $N_0$ tokens\;
\BlankLine
\tcp{Stage 2: Iterative Layer Processing with TR and EE}
\For{$k \gets 0$ \KwTo $11$}{
    $\mathbf{Z} \gets \text{Transformer\_Block}_k(\mathbf{Z})$\;
    
    \If{$k \in \{3, 6, 9\}$}{
        \tcp{Early Exit Check}
        $\mathbf{z}_k \gets \text{CLS\_Token\_Output}(\mathbf{Z})$\;
        $c_k \gets \max(\text{Softmax}(\mathbf{z}_k))$\;
        
        \If{$c_k > \theta_{\text{EE}}$}{
            \Return $\hat{y} \gets \arg\max(\mathbf{z}_k)$, $k^* \gets k$\;
        }
        
        \tcp{Token Reduction (if active)}
        \If{$\texttt{use\_tr} = \text{True}$}{
            $N_{\text{new}} \gets N_{\text{current}} \cdot r$\;
            $\mathbf{Z} \gets \text{Reduce\_Tokens}(\mathbf{Z}, N_{\text{new}})$\;
        }
    }
}
\BlankLine
\tcp{Stage 3: Default (Full Depth)}
$\mathbf{z}_{12} \gets \text{CLS\_Token\_Output}(\mathbf{Z})$\;
\Return $\hat{y} \gets \arg\max(\mathbf{z}_{12})$, $k^* \gets 12$\;
\end{algorithm}






\begin{figure}[t]
    \centering
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_2/retinamnist_EE_tradeoff_1.png}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_2/isic_EE_tradeoff_1.png}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_2/pathmnist_EE_tradeoff_1.png}
    }
    \caption{Performance-efficiency trade-offs of early exiting across confidence thresholds. Validation set profiling sweeps $\theta_{\text{EE}} \in [0.5, 0.95]$ to identify optimal thresholds maximizing FLOPs reduction while constraining accuracy loss to $<1\%$. X-axis shows confidence threshold $\theta_{\text{EE}}$; left y-axis shows accuracy (\%, green circles), right y-axis shows FLOPs reduction (\%, blue squares) and average exit layer (pink triangles). (a) RetinaMNIST: Achieves 41\% FLOPs reduction at $\theta_{\text{EE}}=0.78$ with 66.67\% accuracy ($-0.83$pp from baseline). (b) ISIC2019: Achieves 65\% FLOPs reduction at $\theta_{\text{EE}}=0.65$ with 72.32\% accuracy ($-0.89$pp from baseline). (c) PathMNIST: Achieves 91\% FLOPs reduction at $\theta_{\text{EE}}=0.60$ with 98.98\% accuracy ($-0.91$pp from baseline).}
    \label{fig:ee_append}
\end{figure}


\begin{figure}[t]
    \centering
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_3/Retina_1.png}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_3/ISIC_1.png}
    }
    \hfill
    \subfigure[]{
        \includegraphics[width=0.45\textwidth]{figs/fig_3/Pneumonia_1.png}
    }
    \caption{Token reduction strategy comparison across medical imaging datasets. X-axis shows average token count; y-axis shows accuracy (\%, left) and FLOPs reduction (\%, right, purple line). Methods: Top-K (red circles), EViT (blue squares), Random (gray crosses), ToMe (green triangles). Gray dashed line marks baseline accuracy. (a) RetinaMNIST: EViT maintains consistent performance (60--67\% accuracy) across all token budgets. Top-K performs well at higher token counts (65.8\% at 159 tokens), while ToMe shows degraded performance comparable to random pruning. (b) ISIC2019: High sensitivity to token reduction across all strategies. ToMe achieves best overall performance (peak 62.7\% at 57 tokens). EViT excels at low token budgets while Top-K performs better at high token counts, showing complementary efficiency profiles. (c) PneumoniaMNIST: Highly robust to token reduction across all strategies, maintaining $>97\%$ accuracy down to 16 tokens. EViT, Top-K, and Random show negligible degradation, while ToMe exhibits relative instability below 57 tokens despite maintaining strong absolute performance.}
    \label{fig:tr_append}
\end{figure}

\begin{table*}[h!]
\caption{\textbf{Diagnostic Performance by Strategy and Class.} Comparison of per-class Sensitivity ($\uparrow$) and Specificity ($\uparrow$) across five medical imaging datasets. The performance variations highlight dataset-specific biases. \textbf{Bolding} indicates the best metric across the four strategies for that specific class/metric.}
\label{tab:per_class_metrics}
\resizebox{\textwidth}{!}{
\begin{tabular}{l|l|ccccc|ccccc}
\toprule
\multirow{2}{*}{\textbf{Dataset}} & \multirow{2}{*}{\textbf{Class Name}} & \multicolumn{5}{c|}{\textbf{Sensitivity (\%)} ($\uparrow$)} & \multicolumn{5}{c}{\textbf{Specificity (\%)} ($\uparrow$)} \\
\cmidrule(lr){3-7} \cmidrule(lr){8-12}
& & Baseline & EE-only & TR-only & A-ViT & \cellcolor{blue!8}TR+EE & Baseline & EE-only & TR-only & A-ViT & \cellcolor{blue!8}TR+EE \\
\midrule
% --- ISIC 2019 ---
\multirow{9}{*}{\textbf{ISIC 2019}} {\multirow{9}{*}{}} 
& 0: actinic keratosis & 12.50 & 12.50 & 6.25 & 18.75 & \cellcolor{blue!8}\textbf{31.25} & \textbf{100.00} & \textbf{100.00} & 99.02 & \textbf{100.00} & \cellcolor{blue!8}\textbf{100.00} \\
& 1: basal cell carcinoma & 87.50 & 87.50 & 87.50 & 87.50 & \cellcolor{blue!8}\textbf{93.75} & 94.12 & 95.10 & \textbf{97.06} & 95.10 &\cellcolor{blue!8}87.25 \\
& 2: dermatofibroma & 50.00 & \textbf{56.25} & 43.75 & 50.00 &\cellcolor{blue!8}37.50 & 99.02 & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \cellcolor{blue!8}\textbf{100.00} \\
& 3: melanoma & 12.50 & 6.25 & \textbf{18.75} & 6.25 & \cellcolor{blue!8}\textbf{18.75} & 95.10 & 92.16 & 94.12 & \textbf{96.08} & \cellcolor{blue!8}\textbf{96.08} \\
& 4: nevus & {87.50} & \textbf{100.00} & \textbf{100.00} & 93.75 & \cellcolor{blue!8}{87.50} & 72.55 & 73.53 & 71.57 & 72.55 & \cellcolor{blue!8}\textbf{83.33} \\
& 5: pigmented benign keratosis & 81.25 & \textbf{87.50} & \textbf{87.50} & \textbf{87.50} & \cellcolor{blue!8}81.25 & 89.22 & \textbf{93.14} & 91.18 & 91.18 & \cellcolor{blue!8}82.35 \\
& 6: seborrheic keratosis & 0.00 & 0.00 & 0.00 & 0.00 & \cellcolor{blue!8}0.00 & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \cellcolor{blue!8}\textbf{100.00} \\
& 7: squamous cell carcinoma & 50.00 & 50.00 & \textbf{56.25} & \textbf{56.25} & \cellcolor{blue!8}25.00 & {97.06} & {97.06} & {97.06} & \textbf{98.04} & \cellcolor{blue!8}\textbf{98.04} \\
& 8: vascular lesion & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \cellcolor{blue!8}\textbf{100.00} & \textbf{100.00} & 99.13 & \textbf{100.00} & 97.39 & \cellcolor{blue!8}99.13 \\
\midrule
% --- PneumoniaMNIST ---
\multirow{2}{*}{\textbf{PneumoniaMNIST}} {\multirow{2}{*}{}}
& 0: normal & 73.93 & 67.95 & \textbf{80.77} & 79.06 &\cellcolor{blue!8}70.94 & \textbf{99.74} & \textbf{99.74} & 98.97 & \textbf{99.74} & \cellcolor{blue!8}98.72 \\
& 1: pneumonia & \textbf{99.74} & \textbf{99.74} & 98.97 & \textbf{99.74} & \cellcolor{blue!8}98.72 & 73.93 & 67.95 & \textbf{80.77} & 79.06 & \cellcolor{blue!8}70.94 \\
\midrule
% --- RetinaMNIST ---
\multirow{5}{*}{\textbf{RetinaMNIST}} {\multirow{5}{*}{}} 
& 0: No DR & \textbf{89.66} & 87.93 & 82.76 & 81.03 & \cellcolor{blue!8}85.06 & 57.96 & 67.70 & 71.24 & 68.14 & \cellcolor{blue!8}\textbf{73.45} \\
& 1: Mild DR & 0.00 & 17.39 & \textbf{36.96} & 2.17 & \cellcolor{blue!8}0.00 & 98.59 & 96.61 & 85.88 & \textbf{99.72} & \cellcolor{blue!8}{99.15} \\
& 2: Moderate DR & 41.38 & 40.22 & 31.52 & \textbf{59.78} & \cellcolor{blue!8}{52.17} & 86.04 & 87.66 & \textbf{88.64} & 75.07 & \cellcolor{blue!8}80.84 \\
& 3: Severe DR & 51.47 & 58.82 & 35.29 & 39.71 & \cellcolor{blue!8}\textbf{70.59} & \textbf{95.48} & 92.17 & 92.17 & 94.28 & \cellcolor{blue!8}{90.96} \\
& 4: Proliferative DR & \textbf{35.00} & 30.00 & 20.00 & 20.00 & \cellcolor{blue!8}15.00 & 98.42 & 98.16 & 98.42 & 98.42 & \cellcolor{blue!8}\textbf{99.74} \\
\midrule
% --- PathMNIST ---
\multirow{9}{*}{\textbf{PathMNIST}}{\multirow{9}{*}{}} 
& 0: adipose & \textbf{99.33} & 98.58 & 97.38 & 95.44 &  \cellcolor{blue!8}98.43 & 99.50 & 99.67 & 99.52 & 99.66 & \cellcolor{blue!8}\textbf{99.91} \\
& 1: background & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \cellcolor{blue!8}\textbf{100.00} & 99.42 & 99.07 & 99.27 & 99.01 & \cellcolor{blue!8}\textbf{100.00} \\
& 2: debris & 95.87 & 97.94 & \textbf{99.12} & 95.28 & \cellcolor{blue!8}97.35 & 99.80 & 99.56 & \textbf{99.83} & 98.99 & \cellcolor{blue!8}99.66 \\
& 3: lymphocytes & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} & \cellcolor{blue!8}98.90 & 99.88 & 99.48 & 98.75 & 99.25 & \cellcolor{blue!8}\textbf{99.89} \\
& 4: mucus & 88.50 & 95.07 & 92.08 & 91.98 & \cellcolor{blue!8}\textbf{98.16} & {99.72} & 99.59 & 99.43 & \textbf{99.74} & \cellcolor{blue!8}{99.12} \\
& 5: smooth muscle & 93.24 & 87.16 & 85.47 & 85.98 & \cellcolor{blue!8}\textbf{92.57} & 97.81 & 98.51 & \textbf{98.66} & 97.71 & \cellcolor{blue!8}{98.28} \\
& 6: normal colon mucosa & \textbf{97.98} & 95.95 & 90.01 & 84.35 & \cellcolor{blue!8}93.30 & 98.57 & 98.70 & 98.80 & 99.07 & \cellcolor{blue!8}\textbf{99.72} \\
& 7: cancer-associated stroma & 65.80 & 66.51 & 67.70 & 61.06 & \cellcolor{blue!8}\textbf{72.92} & \textbf{99.87} & 99.73 & 99.39 & 99.75 & \cellcolor{blue!8}\textbf{99.87} \\
& 8: colorectal adenocarcinoma epithelium & 96.76 & 94.16 & 95.94 & 95.70 & \cellcolor{blue!8}\textbf{98.30} & 99.48 & \textbf{99.51} & 99.08 & 97.80 & \cellcolor{blue!8}{99.08} \\
\midrule
% --- INSIGHT ---
\multirow{4}{*}{\textbf{INSIGHT}} {\multirow{4}{*}{}} 
& clear & 88.22 & 90.45 & 82.64 & 89.49 & \cellcolor{blue!8}\textbf{91.40} & 87.17 & 86.88 & \textbf{91.69} & 85.13 & \cellcolor{blue!8}83.67 \\
& immature cataract & 81.17 & 79.37 & \textbf{87.22} & 75.34 & \cellcolor{blue!8}77.13 & 90.55 & 92.17 & 87.10 & 91.01 & \cellcolor{blue!8}\textbf{93.09} \\
& mature cataract & 80.77 & \textbf{92.31} & 82.69 & 80.77 & \cellcolor{blue!8}83.08 & 99.68 & 99.52 & \textbf{99.76} & 99.52 & \cellcolor{blue!8}99.68 \\
& pciol & 92.55 & 94.68 & \textbf{97.34} & 93.62 & \cellcolor{blue!8}93.62 & 99.29 & \textbf{99.82} & 99.29 & 98.93 & \cellcolor{blue!8}99.47 \\
\bottomrule
\end{tabular}
}
\end{table*}


\begin{table*}[h!]
\centering
\caption{\textbf{Unified Framework Performance Across Datasets Using ViT-S backbone.} Baseline: 196 tokens, 12 layers. EE-only: 196 tokens with dynamic EE (dataset-specific $\theta_{\text{EE}}$). TR-only: EViT with 100 tokens across all layers. TR+EE (Ours, shaded): combines TR and EE. Best results in \textbf{bold}.}
\label{tab:main_vit}
\resizebox{\textwidth}{!}{
\begin{tabular}{l|l|ccc|c}
\toprule
\textbf{Dataset} & \textbf{Strategy} & \textbf{Accuracy (\%)} & \textbf{Avg Tokens} & \textbf{Avg Exit Layer} & \textbf{FLOPs (G)} \\
\midrule
\multirow{4}{*}{\textbf{ISIC2019}} 
& Baseline & 60.2 & 196 & 12.0 & 4.61 \\
& EE-only & 56.8 \loss{3.4pp} & 196 & \textbf{3.75} & \textbf{1.236 \loss{73.2\%}} \\
& TR-only & \textbf{61.9} \gain{1.7pp} & 100 & 12.0 & 3.003 \loss{34.8\%} \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8} 56.8\,\loss{3.4pp} & \cellcolor{blue!8}\textbf{68} & \cellcolor{blue!8}12.0 & \cellcolor{blue!8}{2.117}\,\loss{{54.1\%}} \\
\midrule
\multirow{4}{*}{\textbf{PneumoniaMNIST}} 
& Baseline & 94.2 & 196 & 12.0 & 4.61 \\
& EE-only & 93.9 \loss{0.3pp} & 196 & \textbf{3.0} & \textbf{1.05 \loss{77.2\%} }\\
& TR-only & \textbf{95.0} \gain{0.8pp} & \textbf{100} & 12.0 & 3.003 \loss{34.8\%} \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}90.4\,\loss{3.8pp} & \cellcolor{blue!8}{110} & \cellcolor{blue!8}5.53 & \cellcolor{blue!8}{1.429}\,\loss{{69.0\%}} \\
\midrule
\multirow{4}{*}{\textbf{RetinaMNIST}} 
& Baseline & \textbf{66.8} & 196 & 12.0 & 4.61 \\
& EE-only & 65.5 \loss{1.3pp} & 196 & \textbf{6.77} & \textbf{1.992 \loss{56.8\%}} \\
& TR-only & 63.0 \loss{3.8pp} & 100 & 12.0 & 3.003 \loss{34.8\%} \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}62.3\,\loss{4.5pp} & \cellcolor{blue!8}\textbf{68} & \cellcolor{blue!8}12.0 & \cellcolor{blue!8}{2.117}\,\loss{{54.1\%}} \\
\midrule
\multirow{4}{*}{\textbf{PathMNIST}} 
& Baseline & 93.3 & 196 & 12.0 & 4.61 \\
& EE-only & 93.7 \gain{0.4pp} & 196 & \textbf{3.13} & \textbf{1.082 \loss{76.5\%}} \\
& TR-only & \textbf{94.6 }\gain{1.3pp} & \textbf{100} & 12.0 & 3.003 \loss{34.8\%} \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}93.5\,\gain{0.2pp} & \cellcolor{blue!8}{136} & \cellcolor{blue!8}3.22 & \cellcolor{blue!8}{1.086}\,\loss{{76.4\%}} \\
\midrule
\multirow{4}{*}{\textbf{INSIGHT}} 
& Baseline & 86.9 & 196 & 12.0 & 4.61 \\
& EE-only & \textbf{88.1} \gain{1.2pp} & 196 & \textbf{3.38} & \textbf{1.145 \loss{75.2\%}} \\
& TR-only & 87.7 \gain{0.8pp} & 100 & 12.0 & 3.003 \loss{34.8\%} \\
& \cellcolor{blue!8}TR+EE & \cellcolor{blue!8}86.0\,\loss{0.9pp} & \cellcolor{blue!8}\textbf{70} & \cellcolor{blue!8}9.46 & \cellcolor{blue!8}{1.979}\,\loss{{57.1\%}} \\
\bottomrule
\end{tabular}
}
\end{table*}

\makeatletter
\renewcommand*\@jmlrenddoc{\label{jmlrend}}
\makeatother
\end{document}


