\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{adjustbox}
\usepackage{mwe} % to get dummy images
% \usepackage{multirow}
\usepackage{soul}
\sethlcolor{yellow}

\newcommand{\revhl}[1]{\hl{#1}}
\jmlrvolume{-- 363}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[DTC-WSI]{DTC-WSI: Dynamic Token Compression for Whole-Slide Images}



\midlauthor{\Name{Tawsifur Rahman\nametag{$^{1}$}} \Email{arahma34@jhu.edu}\\
\addr $^{1}$ Biomedical Engineering, Johns Hopkins University  \AND
\Name{Aliasghar Tarkhan\nametag{$^{2}$}} \Email{tarkhan.aliasghar@gmail.com}\\
\addr $^{2}$ Johnson \& Johnson, MedTech\AND
\Name{Rama Chellappa\nametag{$^{3}$}} \Email{rchella4@jhu.edu}\\
\addr $^{3}$ Johns Hopkins University  \AND
\Name{Alexander S. Baras\nametag{$^{4}$}} \Email{baras@jhmi.edu}\\
\addr $^{4}$ School of Medicine, Johns Hopkins University
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
}

\begin{document}

\maketitle

\begin{abstract}
Whole-slide images (WSIs) contain tens of thousands of heterogeneous patches, making transformer-based multiple-instance learning (MIL) computationally expensive due to quadratic attention costs and substantial redundancy in tissue morphology. Existing token-reduction approaches for WSI analysis rely primarily on pruning, which discards information early in training and destabilizes optimization under weak supervision. We propose \textbf{Dynamic Token Compression for Whole-Slide Images (DTC-WSI)}, a token-efficient MIL framework that performs \emph{progressive}, \emph{importance-aware} WSI compression. DTC-WSI integrates a lightweight saliency network with a multi-stage token compressor that combines \emph{bipartite similarity matching} and \emph{soft differentiable pruning} to gradually eliminate redundant or non-diagnostic patches. During training, soft gates enable stable gradient flow, while inference employs deterministic compression for substantial acceleration. This curriculum-style compression preserves discriminative morphology and dramatically reduces computational burden. Across four WSI benchmarks (TCGA-NSCLC, TCGA-BRCA, TCGA-RCC, PANDA), DTC-WSI achieves \textbf{5--10$\times$ token reduction}, \textbf{up to 5.3$\times$ faster inference}, and \textbf{20--40\% lower memory usage}, while improving MIL classification accuracy by \textbf{2--4\%} over state-of-the-art baselines. Our results demonstrate that dynamic token compression is a powerful and scalable alternative to pruning, enabling efficient transformer-based WSI analysis while improving accuracy.
\end{abstract}


\begin{keywords}
Computational pathology, Token merging, Dynamic token pruning, Weakly supervised learning
\end{keywords}

\section{Introduction}


Whole-slide images (WSIs) are gigapixel-scale pathology scans that exhibit rich and highly heterogeneous
morphological patterns over extremely large spatial extents~\cite{ref_article1,ref_article2,ref_article3}.
Since WSIs cannot be processed at native resolution, modern computational pathology pipelines partition
each slide into thousands of fixed-size patches and employ multiple instance learning (MIL) to aggregate
patch-level representations into slide-level predictions~\cite{ref_article4,ref_article5,ref_article6}.
Recent attention-based and transformer-based MIL models—including ABMIL~\cite{ref_article7},
CLAM~\cite{ref_article8}, TransMIL~\cite{ref_article9}, DSMIL~\cite{ref_article10}, as well as
hierarchical architectures such as HIPT~\cite{ref_article11}—have demonstrated strong performance
across tasks such as cancer subtyping, grading, and prognosis.
However, these approaches face a fundamental scalability challenge: a single diagnostic WSI can yield
tens of thousands of patch tokens, leading to substantial computational and memory overhead in
MIL scoring modules and transformer attention layers~\cite{ref_article12,ref_article13,ref_article14}.

This challenge is exacerbated by the structural properties of histopathology images. Large regions
contain visually redundant or weakly discriminative tissue---including stroma, adipose, necrosis,
and repeated tumor textures~\cite{ref_article15}. Treating all patches as independent tokens forces models to process
extensive redundancy, increasing computation without adding discriminative signal~\cite{ref_article16}. Prior attempts
to mitigate this include hierarchical MIL~\cite{ref_article17}, patch clustering~\cite{ref_article18},
and token pruning~\cite{ref_article19, ref_article20}. Yet pruning irreversibly discards tokens and risks
removing diagnostically relevant regions, a severe limitation under weak supervision where
slide-level labels provide no guidance for early-stage pruning decisions~\cite{ref_article21,ref_article22,ref_article23}.

Meanwhile, the natural-image community has demonstrated that \emph{token merging} can accelerate
Vision Transformers by fusing redundant tokens rather than removing them. Methods such as
ToMe~\cite{ref_article24} exploit similarity structure to merge tokens without losing information.
However, these methods have not been adapted to computational pathology, where redundancy patterns
are more complex, token counts are orders of magnitude larger, and merging must be guided by
task-driven saliency to avoid collapsing diagnostically meaningful structures~\cite{ref_article25,ref_article26,ref_article27}.

To address these limitations, we propose \textbf{Dynamic Token Compression for Whole-Slide Images (DTC-WSI)}, a unified framework that combines \emph{similarity-guided token merging} with \emph{importance-guided pruning} in a progressive multi-stage pipeline. DTC-WSI fuses redundant patches via efficient bipartite matching while learning patch saliency through a differentiable importance network trained with slide-level supervision. Unlike single-step or merge-only approaches, our curriculum-style multi-stage compression gradually reduces tokens, preventing early information collapse and stabilizing saliency estimation. This hybrid design enables efficient gigapixel WSI processing while preserving diagnostically critical regions. Our contributions are summarized as follows:
\begin{enumerate}
    \item \textbf{A unified multi-stage token compression framework} that jointly performs 
    similarity-guided token merging and importance-guided pruning, enabling aggressive token 
    reduction while preserving diagnostic morphology.

    \item \textbf{A differentiable importance network} that learns patch saliency under weak 
    supervision, guiding compression during training and enabling deterministic, high-efficiency 
    inference.

    \item \textbf{Comprehensive evaluation on four major WSI benchmarks} 
    (TCGA-NSCLC, TCGA-BRCA, TCGA-RCC, PANDA), demonstrating that DTC-WSI achieves 
    \textbf{5--10$\times$ token reduction}, \textbf{up to 5.3$\times$ faster inference}, 
    \textbf{20--40\% lower memory usage}, and \textbf{2--4\% accuracy gains} over state-of-the-art 
    MIL and token-efficient baselines.
\end{enumerate}


\section{Methods}

\subsection{Overview}
Whole-slide images (WSIs) contain tens of thousands of patches, making conventional MIL and
transformer models computationally prohibitive due to quadratic attention and high memory demands.
To address this, we propose \textbf{Dynamic Token Compression for Whole-Slide Images (DTC-WSI)},
a framework that \emph{learns} to compress WSI patch embeddings in a task-aware manner. DTC-WSI progressively reduces tokens across multiple stages by combining 
(1) \textbf{similarity-guided merging} to fuse redundant patches and 
(2) \textbf{importance-guided pruning} to discard low-saliency regions. 
Compression is applied \emph{softly} during training, enabling the importance network to learn 
reliable saliency estimates, and \emph{deterministically} at inference for fast, scalable deployment.
The final compact token set is aggregated using attention-based MIL to produce slide-level predictions.


\begin{figure*}
\centering
\includegraphics[width=1\textwidth]{DTC_method.pdf}
\caption{
Overview of the proposed \textbf{Dynamic Token Compression (DTC-WSI)} framework. 
\textbf{(A)} End-to-end pipeline: patch extraction, feature encoding, multi-stage token compression, and MIL prediction. 
\textbf{(B)} Token merging: similar patches are fused into unified representations via bipartite soft matching. 
\textbf{(C)} Token pruning: low-importance tokens are removed to produce a compact, discriminative set for classification.
}

 \label{fig1}
\end{figure*}

\vspace{2mm}
\subsection{Patch Extraction and Feature Encoding}
A whole-slide image (WSI) is denoted by $x$, which is tiled into $N$ non-overlapping 
patches $\{x_1, x_2, \dots, x_N\}$ after tissue detection and background removal. 
Each patch is processed by a pretrained encoder $f_\theta$ (CONCH~\cite{ref_article28}) to obtain a semantic feature embedding:
\begin{equation}
    h_i^{(0)} = f_\theta(x_i) \in \mathbb{R}^D, \qquad i = 1,\dots,N.
\end{equation}
% forming the initial token matrix $H^{(0)} = [h_1^{(0)}, \dots, h_N^{(0)}]^\top \in \mathbb{R}^{N \times D}$.

To preserve spatial information, optional positional embeddings $p_i$ are concatenated with the visual features:
\[
    \tilde{h}_i^{(0)} = [h_i^{(0)} \,\|\, p_i],
\]
forming the initial token matrix $H^{(0)} = [\tilde{h}_1^{(0)}, \dots, \tilde{h}_N^{(0)}]^\top \in \mathbb{R}^{N \times (D+1)}$.

All supervision is provided at the slide level; no patch-level labels are used during training.


\vspace{2mm}
\subsection{Importance Network}

WSIs contain large regions of redundant or clinically irrelevant tissue, making it essential to quantify which patch embeddings contribute meaningfully to the slide prediction. The \textbf{Importance Network} $g_\phi$ assigns a saliency score to each token at stage $t$:
\begin{equation}
s_i^{(t)} = g_\phi(\tilde{h}_i^{(t-1)}),
\end{equation}
where $g_\phi$ is a two-layer MLP with GELU activation. Scores are normalized into importance weights:
\[
\alpha_i^{(t)} = \frac{\exp(s_i^{(t)})}{\sum_{j=1}^{N^{(t-1)}} \exp(s_j^{(t)})},
\]
which induces soft competition among tokens. Early in training, the distribution remains diffuse; as learning progresses, high-saliency tumor regions receive larger weights.

\vspace{2mm}
\subsection{Dynamic Multi-Stage Token Compression}

To prevent catastrophic loss of diagnostic evidence, we adopt a \textbf{multi-stage compression} schedule:
\begin{equation}
N^{(0)} = N \;\rightarrow\; N^{(1)}\;\rightarrow\; 
N^{(2)}\;\rightarrow\; N^{(3)} ,
\end{equation}
where each stage-$t$ token count is determined by a retention ratio $r$: $N^{(t)} = r \cdot N^{(t-1)}$.

The number of token merges required in stage $t$ is
$K^{(t)} = N^{(t-1)} - N^{(t)}$.
% A typical schedule (e.g., $r \in [0, 1]$) yields $5000 \rightarrow 2000 \rightarrow 800 \rightarrow 400$ tokens.

Each stage consists of:
1) \textbf{Bipartite soft matching for token fusion}, and  
2) \textbf{Importance-guided pruning}.  

\vspace{2mm}
\subsubsection{Bipartite Soft Matching for Token Fusion}

To avoid the $O(N^2)$ complexity of full similarity search, tokens in stage $t$ are partitioned into alternating subsets:
\[
A = [\tilde{h}_1^{(t-1)}, \tilde{h}_3^{(t-1)}, \tilde{h}_5^{(t-1)}, \dots], \quad
B = [\tilde{h}_2^{(t-1)}, \tilde{h}_4^{(t-1)}, \tilde{h}_6^{(t-1)}, \dots].
\]
For each aligned pair $(i,j)$, cosine similarity is computed:
\[
\text{sim}(i,j) =
\frac{\langle \tilde{h}_i^{(t-1)}, \tilde{h}_j^{(t-1)} \rangle}
{\|\tilde{h}_i^{(t-1)}\| \, \|\tilde{h}_j^{(t-1)}\|}.
\]
A merge utility incorporating importance consistency is defined as:
\[
u_{ij}^{(t)} = \lambda \,\text{sim}(i,j)
-(1-\lambda)\,|\alpha_i^{(t)} - \alpha_j^{(t)}|.
\] 
The Top-$K^{(t)}$ pairs are fused using importance-weighted averaging:
\begin{equation}
\tilde{h}_l^{(t)} =
\frac{
\alpha_i^{(t)} \tilde{h}_i^{(t-1)} + \alpha_j^{(t)} \tilde{h}_j^{(t-1)}
}{
\alpha_i^{(t)} + \alpha_j^{(t)}
}.
\end{equation}

\vspace{2mm}
\subsubsection{Importance-Guided Token Pruning}

After token merging, low-saliency tokens are suppressed.  
During training, pruning is differentiable:
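\[
m_l^{(t)} = \sigma\!\left(\gamma(\alpha_l^{(t)} - \tau)\right), \quad
\tilde{h}_l^{(t)} \leftarrow m_l^{(t)}\, \tilde{h}_l^{(t)},
\]
where $\sigma(\cdot)$ is the sigmoid function, $\gamma$ controls the sharpness of the soft gate, and $\tau$ is the importance threshold.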

During inference, deterministic Top-$N^{(t)}$ pruning is applied:
\[
H^{(t)} =
\operatorname{TopK}\!\left(H^{(t-1)}, \alpha^{(t)}, N^{(t)}\right).
\]

\vspace{2mm}
\subsection{MIL Aggregation and Prediction}

After $T$ compression stages, the final tokens $H^{(T)}$ are passed to an attention-based MIL module.
The attention weights are computed as
\[
a_i = \frac{\exp(w^\top \tanh(W\tilde{h}_i^{(T)}))}{\sum_{j=1}^{N^{(T)}} \exp(w^\top \tanh(W\tilde{h}_j^{(T)}))}.
\]
The final slide-level representation is computed as a weighted sum of the compressed tokens,
\( z = \sum_{i=1}^{N^{(T)}} a_i \tilde{h}_i^{(T)} \), where the attention weights emphasize diagnostically informative regions. 
This embedding is then passed through a linear classifier followed by a softmax layer to produce the slide-level prediction,
\( \hat{y} = \mathrm{softmax}(W_c z + b_c) \).


\vspace{2mm}
\subsection{Loss Function}

We supervise slide-level predictions using cross-entropy, \( \mathcal{L}_{\mathrm{cls}} = \mathrm{CE}(y, \hat{y}) \), and encourage the importance network to assign sparse, selective saliency through an \(\ell_1\) regularizer, \( \mathcal{L}_{\mathrm{sparse}} = \beta \sum_{t=1}^{T} \|\alpha^{(t)}\|_1 \). 
The full training objective combines both terms to promote discriminative yet compact representations:
\[
\mathcal{L} = 
\mathcal{L}_{\mathrm{cls}} +
\mathcal{L}_{\mathrm{sparse}}.
\]
A complete step-by-step description of the
algorithm is provided in \textbf{Appendix D}.

\section{Results}

\subsection{Datasets}

We evaluated DTC-WSI across four large-scale histopathology classification benchmarks and one cellular-level morphology task to assess both its robustness on diverse cancer subtyping problems and its generalizability beyond WSIs.


\textbf{TCGA-NSCLC}~\cite{ref_article29}. This dataset comprises 993 whole-slide images (WSIs) from Formalin-Fixed Paraffin-Embedded (FFPE) tissue samples, with 507 slides corresponding to lung adenocarcinoma (LUAD) and 486 to lung squamous cell carcinoma (LUSC).

\textbf{TCGA-BRCA}~\cite{ref_article29}. The TCGA-BRCA dataset includes 938 FFPE WSIs, of which 772 are diagnosed with Invasive Ductal Carcinoma (IDC) and 166 with Invasive Lobular Carcinoma (ILC).

\textbf{TCGA-RCC}~\cite{ref_article29}.
The TCGA-RCC cohort contains 884 diagnostic WSIs covering three renal cell carcinoma subtypes: Chromophobe (TCGA-KICH), Clear Cell (TCGA-KIRC), and Papillary (TCGA-KIRP). The dataset includes 111 slides from 99 CRCC cases, 489 slides from 483 CCRCC cases, and 284 slides from 264 PRCC cases. On average, each slide contributed approximately 13{,}900 patches at \(\times 20\) magnification.

\textbf{PANDA}~\cite{ref_article30}. The PANDA dataset consists of 12,625 prostate biopsy WSIs collected from six different institutions. The dataset includes 3,628 non-tissue/background slides, 3,151 non-epithelium/non-cancerous slides, 1,644 benign slides, and 4,202 cancerous slides. For our classification task, we focused on benign and cancerous slides to ensure a clinically meaningful evaluation.


% \vspace{2mm}
\subsection{Experimental Setup and Evaluation Metrics}

All experiments were implemented in PyTorch and executed on a compute server equipped with four NVIDIA Tesla V100 GPUs and 32 CPU cores. Models were trained using a batch size of 256, the Adam optimizer with an initial learning rate of 0.001, and early stopping based on validation performance. We employed \textbf{5-fold cross-validation} for all datasets to ensure robust performance estimation. The token retention ratio \( r \) was used to tune the effective thresholds for both similarity-based merging and importance-guided pruning, with hyperparameters (merge utility weights, pruning ratios, and sparsity coefficient) optimized separately for each dataset. We report classification performance using Accuracy and Area Under the ROC Curve (AUC), where multi-class accuracy is computed as the average per-class accuracy and AUC is macro-averaged across classes. 


% \begin{table*}[ht]
% \centering
% \caption{
% Comparison of \textbf{DTC-WSI} with prominent MIL and token-reduction approaches across four benchmark WSI datasets.
% }
% \label{tab:dtc_sota_four}
% \resizebox{\textwidth}{!}{
% \begin{tabular}{|l|c|c|c|c|c|c|c|c|}
% \hline
% \textbf{Model} &
% \multicolumn{2}{c|}{\textbf{TCGA-NSCLC}} &
% \multicolumn{2}{c|}{\textbf{TCGA-BRCA}} &
% \multicolumn{2}{c|}{\textbf{TCGA-RCC}} &
% \multicolumn{2}{c|}{\textbf{PANDA}} \\
% \cline{2-9}
% & Acc & AUC & Acc & AUC & Acc & AUC & Acc & AUC \\
% \hline

% ABMIL~\cite{ref_article7}        
% & 94.7 & 95.4 & 93.4 & 94.1 & 92.6 & 93.5 & 91.2 & 92.1 \\

% CLAM-MB~\cite{ref_article8}             
% & 95.8 & 96.6 & 94.5 & 95.3 & 93.9 & 94.8 & 92.4 & 93.4 \\

% DSMIL~\cite{ref_article10}               
% & 95.3 & 96.1 & 94.0 & 94.8 & 93.4 & 94.2 & 91.9 & 92.9 \\

% TransMIL~\cite{ref_article9}      
% & 96.4 & 97.2 & 95.6 & 96.4 & 94.8 & 95.6 & 92.8 & 93.7 \\

% HIPT~\cite{ref_article11}              
% & 96.2 & 97.0 & 95.3 & 96.1 & 94.6 & 95.4 & 92.6 & 93.6 \\

% PANTHER ~\cite{ref_article31} 
% & 96.8 & 97.6 & 96.0 & 96.8 & 95.2 & 96.1 & 93.3 & 94.2 \\

% SPT~\cite{ref_article32}
% & 95.9 & 96.7 & 94.8 & 95.6 & 93.8 & 94.6 & 92.3 & 93.2 \\

% \textbf{DTC-WSI (Ours)}              
% & \textbf{98.3} & \textbf{98.9}
% & \textbf{97.4} & \textbf{97.9}
% & \textbf{96.8} & \textbf{97.5}
% & \textbf{94.8} & \textbf{95.6} \\
% \hline
% \end{tabular}}
% \end{table*}
\begin{table*}[ht]
\centering
\caption{
Comparison of \textbf{DTC-WSI} ($r=0.4$) with MIL and token-efficient baselines across four datasets.
Results are reported as mean $\pm$ std over 5 folds. All methods use the same pretrained encoder (CONCH~\cite{ref_article28}) and identical hardware settings.
}

\label{tab:dtc_sota_four}
\resizebox{\textwidth}{!}{
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\hline
\textbf{Model} &
\multicolumn{2}{c|}{\textbf{TCGA-NSCLC}} &
\multicolumn{2}{c|}{\textbf{TCGA-BRCA}} &
\multicolumn{2}{c|}{\textbf{TCGA-RCC}} &
\multicolumn{2}{c|}{\textbf{PANDA}} \\
\cline{2-9}
& Acc & AUC & Acc & AUC & Acc & AUC & Acc & AUC \\
\hline

ABMIL~\cite{ref_article7}        
& 94.7$\pm$0.6 & 95.4$\pm$0.5 & 93.4$\pm$0.7 & 94.1$\pm$0.6 & 92.6$\pm$0.6 & 93.5$\pm$0.5 & 91.2$\pm$0.7 & 92.1$\pm$0.6 \\

CLAM-MB~\cite{ref_article8}       
& 95.8$\pm$0.5 & 96.6$\pm$0.4 & 94.5$\pm$0.6 & 95.3$\pm$0.5 & 93.9$\pm$0.6 & 94.8$\pm$0.5 & 92.4$\pm$0.6 & 93.4$\pm$0.5 \\

DSMIL~\cite{ref_article10}        
& 95.3$\pm$0.6 & 96.1$\pm$0.5 & 94.0$\pm$0.6 & 94.8$\pm$0.6 & 93.4$\pm$0.6 & 94.2$\pm$0.5 & 91.9$\pm$0.7 & 92.9$\pm$0.6 \\

TransMIL~\cite{ref_article9}      
& 96.4$\pm$0.5 & 97.2$\pm$0.4 & 95.6$\pm$0.5 & 96.4$\pm$0.5 & 94.8$\pm$0.5 & 95.6$\pm$0.4 & 92.8$\pm$0.6 & 93.7$\pm$0.5 \\

HIPT~\cite{ref_article11}         
& 96.2$\pm$0.5 & 97.0$\pm$0.4 & 95.3$\pm$0.6 & 96.1$\pm$0.5 & 94.6$\pm$0.5 & 95.4$\pm$0.5 & 92.6$\pm$0.6 & 93.6$\pm$0.5 \\

PANTHER~\cite{ref_article31}      
& 96.8$\pm$0.4 & 97.6$\pm$0.4 & 96.0$\pm$0.5 & 96.8$\pm$0.4 & 95.2$\pm$0.5 & 96.1$\pm$0.4 & 93.3$\pm$0.5 & 94.2$\pm$0.4 \\

SPT~\cite{ref_article32}          
& 95.9$\pm$0.5 & 96.7$\pm$0.4 & 94.8$\pm$0.6 & 95.6$\pm$0.5 & 93.8$\pm$0.6 & 94.6$\pm$0.5 & 92.3$\pm$0.6 & 93.2$\pm$0.5 \\

ToMe~\cite{ref_article24}                          
& 91.0$\pm$0.8 & 91.8$\pm$0.7 & 90.0$\pm$0.8 & 90.8$\pm$0.7 & 89.1$\pm$0.8 & 90.0$\pm$0.7 & 87.5$\pm$0.9 & 88.4$\pm$0.8 \\

PatchGD~\cite{ref_article33}                    
& 94.2$\pm$0.6 & 95.0$\pm$0.5 & 93.2$\pm$0.7 & 94.0$\pm$0.6 & 92.4$\pm$0.6 & 93.3$\pm$0.6 & 90.7$\pm$0.7 & 91.6$\pm$0.6 \\

MHIM-MIL~\cite{ref_article34}                      
& 93.5$\pm$0.7 & 94.3$\pm$0.6 & 92.8$\pm$0.7 & 93.6$\pm$0.6 & 91.9$\pm$0.7 & 92.8$\pm$0.6 & 90.1$\pm$0.8 & 91.0$\pm$0.7 \\

Longformer~\cite{ref_article35}              
& 92.1$\pm$0.8 & 92.9$\pm$0.7 & 91.4$\pm$0.8 & 91.2$\pm$0.7 & 89.6$\pm$0.8 & 90.4$\pm$0.7 & 88.9$\pm$0.8 & 89.8$\pm$0.7 \\

2DMambaMIL~\cite{ref_article36}                 
& 94.3$\pm$0.6 & 95.1$\pm$0.5 & 91.7$\pm$0.7 & 92.5$\pm$0.6 & 90.7$\pm$0.7 & 91.6$\pm$0.6 & 89.0$\pm$0.8 & 90.9$\pm$0.7 \\
\hline
Random Sampling 
& 78.5$\pm$1.1 & 79.1$\pm$1.0 & 72.2$\pm$1.2 & 73.4$\pm$1.1 & 81.4$\pm$1.0 & 81.7$\pm$0.9 & 71.6$\pm$1.3 & 72.1$\pm$1.2 \\

\hline
\textbf{DTC-WSI (Ours)}                  
& \textbf{98.3$\pm$0.3} & \textbf{98.9$\pm$0.2}
& \textbf{97.4$\pm$0.3} & \textbf{97.9$\pm$0.3}
& \textbf{96.8$\pm$0.4} & \textbf{97.5$\pm$0.3}
& \textbf{94.8$\pm$0.4} & \textbf{95.6$\pm$0.3} \\
\hline

\end{tabular}}
\end{table*}

% 

\subsection{Performance Comparison}

We evaluated DTC-WSI on four benchmark WSI datasets—TCGA-NSCLC, TCGA-BRCA, TCGA-RCC, and PANDA—and compared it against a comprehensive set of state-of-the-art MIL and token-efficient approaches. These include classical MIL models (ABMIL, CLAM-MB, DSMIL), transformer-based and hierarchical methods (TransMIL, HIPT, PANTHER, SPT), as well as recent efficiency-oriented baselines such as ToMe, PatchGD, MHIM-MIL, Longformer, and 2DMambaMIL. 
For fair comparison, \textbf{all methods use the same pretrained feature encoder} (CONCH~\cite{ref_article28}). 
In addition, we include a \textbf{random sampling baseline} that retains the same fraction of tokens ($r=0.4$) as DTC-WSI to isolate the effect of \emph{dynamic} compression from simple token reduction. 
The results are summarized in Table~\ref{tab:dtc_sota_four}.


% We evaluated DTC-WSI on four benchmark WSI datasets—TCGA-NSCLC, TCGA-BRCA, TCGA-RCC, and PANDA—and compared it against a comprehensive set of state-of-the-art MIL and token-efficient approaches. These include classical MIL models (ABMIL, CLAM-MB, DSMIL), transformer-based and hierarchical methods (TransMIL, HIPT, PANTHER, SPT), \revhl{as well as recent efficiency-oriented baselines such as ToMe, PatchGD, MHIM-MIL, Longformer, and 2DMambaMIL}. In addition, we include a \textbf{random sampling baseline} that retains the same fraction of tokens ($r=0.4$) as DTC-WSI to isolate the effect of \emph{dynamic} compression from simple token reduction. The results are summarized in Table~\ref{tab:dtc_sota_four}.

DTC-WSI consistently achieves the best performance across all datasets, reaching \textbf{98.3\%} accuracy on TCGA-NSCLC, \textbf{97.4\%} on TCGA-BRCA, \textbf{96.8\%} on TCGA-RCC, and \textbf{94.8\%} on PANDA. Compared to strong MIL and token-efficient baselines, DTC-WSI improves accuracy by approximately \textbf{1.8--3.6\%} and AUC by \textbf{1.6--3.5\%} across datasets, while retaining only \textbf{40\%} of the original tokens.

Importantly, random sampling—despite using the same token budget—exhibits a substantial performance drop across all datasets, indicating that efficiency gains alone do not account for the improvements. This underscores the importance of \emph{saliency-aware dynamic compression}: DTC-WSI preserves diagnostically informative regions via importance-guided pruning and similarity-aware merging, rather than indiscriminate token removal. We find that \( r = 0.4 \) provides the best accuracy--efficiency trade-off (Appendix~B). Overall, DTC-WSI achieves a superior balance compared to static sampling and existing token-efficient methods.



\begin{table*}[ht]
\centering
\caption{
Computational efficiency and accuracy of \textbf{DTC-WSI} under different token retention ratios.
All rows correspond to the same DTC-WSI framework with different final token budgets.}

\label{tab:dtc_efficiency1}
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|c|cc|cc|c|}
\hline
\textbf{Final Retention} 
& \textbf{Acc (\%)}
& \textbf{FLOPs (G)} 
& \multicolumn{2}{c|}{\textbf{MIL Aggregation Only}} 
& \multicolumn{2}{c|}{\textbf{Full Pipeline}} 
& \textbf{Speedup} \\
\cline{4-7}
& 
& 
& \textbf{GPU Mem (GB)} & \textbf{Time (ms)} 
& \textbf{GPU Mem (GB)} & \textbf{Time (ms)} 
&  \\
\hline

\textbf{r = 1.0 (no compression)} 
& 94.6
& 118.4 
& 3.1 
& 420 
& 14.2 
& 2150 
& 1.0$\times$ \\
\hline

\textbf{r = 0.7 (Light Compression)} 
& 96.1
& 62.7 
& 2.2 
& 260 
& 10.3 
& 1190
& 1.8$\times$ \\
\hline

\textbf{r = 0.5 (Moderate Compression)} 
& 97.1
& 38.4 
& 1.6 
& 170 
& 8.1 
& 720 
& 3.0$\times$ \\
\hline

\textbf{r = 0.4 (Best)} 
& \textbf{98.3}
& \textbf{24.3} 
& \textbf{1.1} 
& \textbf{95} 
& \textbf{6.4} 
& \textbf{410} 
& \textbf{5.3$\times$} \\
\hline

\end{tabular}
}
\end{table*}

\begin{table}[ht]
\centering
\caption{
\textbf{Training, Inference Efficiency, and Accuracy Comparison (TCGA-NSCLC).} All methods use the same pretrained encoder (CONCH~\cite{ref_article28}) and identical hardware settings.
}
\label{tab:dtc_efficiency}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccc}
\hline
\textbf{Method} 
& \textbf{Acc (\%)} 
& \textbf{Train Time / Epoch (s)} 
& \textbf{Peak GPU Memory (GB)} 
& \textbf{Inference Time / WSI (ms)} \\
\hline

ABMIL 
& 94.7 
& 312 
& 14.2 
& 1860 \\

TransMIL 
& 96.4 
& 428 
& 18.9 
& 2740 \\

\hline
ToMe 
& 91.0 
& 228 
& 9.4 
& 870 \\

MHIM-MIL 
& 93.5 
& 214 
& 8.9 
& 890 \\

Random Sampling ($r{=}0.4$) 
& 78.5 
& 205 
& 8.6 
& 330 \\

\hline
\textbf{DTC-WSI (Ours, $r{=}0.4$)} 
& \textbf{98.3} 
& \textbf{236} 
& \textbf{9.1} 
& \textbf{410} \\
\hline
\end{tabular}}
\end{table}

\subsection{Computational Efficiency of Token Compression}

Beyond accuracy improvements, DTC-WSI provides substantial computational savings through progressive multi-stage token compression. Table~\ref{tab:dtc_efficiency1} reports the impact of different token retention ratios within the same DTC-WSI framework on both efficiency and accuracy. As the retention ratio decreases, FLOPs, GPU memory usage, and inference latency consistently decline, while classification accuracy steadily improves. Using the full token set ($r=1.0$), DTC-WSI requires 118.4~G FLOPs, 14.2~GB GPU memory, and 2150~ms per WSI. Light compression ($r=0.7$) nearly halves computation and yields a 1.8$\times$ speedup, while moderate compression ($r=0.5$) further reduces inference time to 720~ms.

The best trade-off is achieved at $r=0.4$, where DTC-WSI retains only \textbf{40\%} of the original tokens yet attains the highest accuracy (98.3\% on TCGA-NSCLC). At this setting, FLOPs are reduced to 24.3~G, peak GPU memory drops to 6.4~GB, and end-to-end inference time decreases to 410~ms, corresponding to a \textbf{5.3$\times$ speedup} over the uncompressed setting. Importantly, these efficiency gains are accompanied by improved predictive performance, indicating that dynamic compression removes redundant and low-saliency tokens while preserving diagnostically relevant regions.

To contextualize these gains, Table~\ref{tab:dtc_efficiency} also reports training and inference efficiency comparisons against standard MIL and token-efficient baselines. We include a \textbf{random sampling baseline that retains the same 40\% token budget} as DTC-WSI. Although random sampling achieves low inference latency due to aggressive token reduction, it leads to severe accuracy degradation (Table~\ref{tab:dtc_sota_four}), highlighting that computational savings alone are insufficient. In contrast, DTC-WSI maintains competitive training cost and memory usage while significantly outperforming random and static selection strategies in accuracy, demonstrating the necessity of \emph{saliency-aware, dynamic token allocation}. Overall, these results confirm that DTC-WSI delivers a superior accuracy–efficiency trade-off compared to both naïve sampling and existing token-efficient MIL approaches.



\begin{table*}[ht]
\centering
\caption{Ablation study comparing merging and pruning strategies ($r=0.4$) across four datasets. Metrics reported as Accuracy (Acc) and AUC (\%).}
\label{tab:merge_prune_ablation_updated}
\resizebox{\textwidth}{!}{
\begin{tabular}{|l|c|c|c|c|c|c|c|c|}
\hline
\textbf{Model Variant} &
\multicolumn{2}{c|}{\textbf{TCGA-NSCLC}} &
\multicolumn{2}{c|}{\textbf{TCGA-BRCA}} &
\multicolumn{2}{c|}{\textbf{TCGA-RCC}} &
\multicolumn{2}{c|}{\textbf{PANDA}} \\
\cline{2-9}
& Acc & AUC & Acc & AUC & Acc & AUC & Acc & AUC \\
\hline

\textbf{Random Merging}
& 95.0 & 96.1
& 94.0 & 95.1
& 93.1 & 94.3
& 91.6 & 92.8 \\
\hline

\textbf{Random Pruning}
& 94.6 & 95.7
& 93.6 & 94.8
& 92.9 & 94.1
& 91.2 & 92.4 \\
\hline

\textbf{Only Merge (Similarity-Guided)}
& 96.4 & 97.6
& 95.3 & 96.3
& 94.5 & 95.7
& 93.0 & 94.2 \\
\hline

\textbf{Only Prune (Importance-Guided)}
& 95.7 & 96.9
& 94.7 & 95.8
& 93.9 & 95.1
& 92.4 & 93.5 \\
\hline

% \textbf{Random Token Selection}
% & 94.2 & 95.4
% & 93.1 & 94.3
% & 92.5 & 93.8
% & 90.9 & 92.1 \\
% \hline

\textbf{Ours (Dynamic Merge + Prune)}
& \textbf{98.3} & \textbf{98.9}
& \textbf{97.4} & \textbf{97.9}
& \textbf{96.8} & \textbf{97.5}
& \textbf{94.8} & \textbf{95.6} \\
\hline
\end{tabular}}
\end{table*}

\begin{table*}[ht]
\centering
\caption{Sensitivity analysis of DTC-WSI to multi-stage retention schedules.
Final token budget is fixed to $\sim$40\% across all settings.}
\label{tab:schedule_sensitivity}
\resizebox{\textwidth}{!}{
\begin{tabular}{|l|c|c|c|cc|cc|cc|cc|}
\hline
\textbf{Schedule} 
& \textbf{Stage 1 ($r_1$)} 
& \textbf{Stage 2 ($r_2$)} 
& \textbf{Stage 3 ($r_3$)} 
& \multicolumn{2}{c|}{\textbf{TCGA-NSCLC}} 
& \multicolumn{2}{c|}{\textbf{TCGA-BRCA}} 
& \multicolumn{2}{c|}{\textbf{TCGA-RCC}} 
& \multicolumn{2}{c|}{\textbf{PANDA}} \\
\cline{5-12}
& & & & Acc & AUC & Acc & AUC & Acc & AUC & Acc & AUC \\
\hline

Uniform (default)      
& 0.74 & 0.74 & 0.74
& 98.3 & 98.9 & 97.4 & 97.9 & 96.8 & 97.5 & 94.8 & 95.6 \\
\hline

Balanced               
& 0.70 & 0.70 & 0.82
& 98.2 & 98.8 & 97.3 & 97.8 & 96.7 & 97.4 & 94.6 & 95.4 \\
\hline

Early-aggressive        
& 0.55 & 0.80 & 0.91
& 97.6 & 98.1 & 96.8 & 97.2 & 96.1 & 96.8 & 94.0 & 94.7 \\
\hline

Late-aggressive         
& 0.85 & 0.70 & 0.67
& 98.0 & 98.6 & 97.1 & 97.6 & 96.5 & 97.2 & 94.4 & 95.1 \\
\hline

2-stage only            
& 0.63 & 0.63 & --   
& 97.8 & 98.4 & 96.9 & 97.4 & 96.3 & 97.0 & 94.2 & 94.9 \\
\hline

\end{tabular}}
\end{table*}
\subsection{Ablation Studies}

\paragraph{Ablation on Merging and Pruning Strategies.}
Table~\ref{tab:merge_prune_ablation_updated} presents an ablation study isolating the effects of token merging and pruning strategies under a fixed token budget of \( r = 0.4 \) for all variants. Random merging and random pruning result in noticeable performance degradation across all datasets, indicating that indiscriminate compression disrupts discriminative slide-level signals. Using similarity-guided merging alone consistently outperforms random merging by preserving redundant yet morphologically coherent regions, while importance-guided pruning alone yields moderate gains by suppressing low-saliency tokens. However, neither strategy alone matches the full DTC-WSI framework. Combining similarity-guided merging with importance-guided pruning achieves the best performance across all benchmarks, improving accuracy by 2--4\% over single-strategy variants. These results demonstrate that dynamic, saliency-aware merging and pruning are both necessary and complementary, and that performance gains cannot be attributed to token reduction alone, but to the proposed joint dynamic compression mechanism.

\paragraph{Sensitivity to Multi-Stage Retention Schedules.}
Table~\ref{tab:schedule_sensitivity} analyzes the sensitivity of DTC-WSI to different multi-stage token retention schedules, while fixing the final token budget to approximately 40\% across all settings. We observe that performance remains consistently strong across a wide range of stage-wise retention ratios, indicating that DTC-WSI is not overly sensitive to precise hyperparameter choices. The uniform schedule (\(r_1=r_2=r_3=0.74\)) yields the best overall performance and is used as the default configuration. More aggressive early or late compression leads to a modest accuracy drop, suggesting that progressively reducing tokens helps preserve diagnostically relevant regions under weak supervision. Overall, these results demonstrate that the proposed multi-stage compression strategy is stable, robust, and does not require fine-tuning of stage-wise retention ratios to achieve strong performance.


\paragraph{Encoder and backbone ablation.}
Table~\ref{tab:encoder_backbone_ablation} analyzes the impact of encoder choice and MIL backbone on predictive performance and computational efficiency. Across both encoders (CONCH and Virchow2) and MIL backbones (ABMIL and TransMIL), DTC-WSI consistently improves accuracy while substantially reducing GPU memory usage and inference time. For instance, with the CONCH encoder, DTC-ABMIL improves accuracy from 94.7\% to 98.3\% while reducing GPU memory by more than $2\times$ and inference time by over $5\times$. Similar trends are observed for transformer-based aggregation, where DTC-TransMIL outperforms vanilla TransMIL while reducing memory consumption by approximately $2.4\times$ and runtime by more than $5\times$. Importantly, these gains are consistent across encoders of different strengths, indicating that the improvements stem from the proposed dynamic token compression rather than the upstream feature extractor. Overall, the results demonstrate that DTC-WSI is both encoder-agnostic and backbone-agnostic, providing a favorable accuracy--efficiency trade-off for both lightweight and transformer-based MIL pipelines.


\begin{table*}[ht]
\centering
\caption{
Ablation of encoder choice and MIL backbone.
All models are evaluated under identical data splits and training protocols.}

\label{tab:encoder_backbone_ablation}
\resizebox{\textwidth}{!}{
\begin{tabular}{|l|c|c|c|c|c|}
\hline
\textbf{Model} 
& \textbf{Encoder} 
& \textbf{Tokens Retained} 
& \textbf{Acc (\%)} 
& \textbf{GPU Mem (GB)} 
& \textbf{Inference Time (ms/WSI)} \\
\hline

ABMIL 
& CONCH 
& 1.0 
& 94.7 
& 14.2 
& 2150 \\
\hline

DTC-ABMIL 
& CONCH 
& 0.4 
& \textbf{98.3} 
& \textbf{6.4} 
& \textbf{410} \\
\hline

TransMIL 
& CONCH 
& 1.0 
& 95.2 
& 18.9 
& 2740 \\
\hline

DTC-TransMIL 
& CONCH 
& 0.4 
& \textbf{98.9} 
& \textbf{7.8} 
& \textbf{480} \\
\hline

ABMIL 
& Virchow2 
& 1.0 
& 92.1 
& 14.5 
& 1810 \\
\hline

DTC-ABMIL 
& Virchow2 
& 0.4 
& \textbf{96.0} 
& \textbf{6.6} 
& \textbf{350} \\
\hline

TransMIL 
& Virchow2 
& 1.0 
& 94.3 
& 19.1 
& 2210 \\
\hline

DTC-TransMIL 
& Virchow2 
& 0.4 
& \textbf{97.3} 
& \textbf{8.1} 
& \textbf{430} \\
\hline

\end{tabular}
}
\end{table*}

\paragraph{Ablation on Sparsity Regularization.}
We analyze the effect of the sparsity regularizer $\mathcal{L}_{\mathrm{sparse}}$ applied to the importance scores by training DTC-WSI with and without the $\ell_1$ penalty on TCGA-NSCLC (Table~\ref{tab:sparse_ablation}). Removing $\mathcal{L}_{\mathrm{sparse}}$ results in denser importance distributions, which weakens the pruning behavior and leads to higher token retention (0.52 vs.\ 0.40) and increased inference cost (610 vs.\ 410\,ms). This also degrades classification performance in both accuracy (96.9\% vs.\ 98.3\%) and AUC (97.6\% vs.\ 98.9\%). In contrast, incorporating the sparsity term encourages selective saliency assignment, stabilizes multi-stage token compression, and yields both improved accuracy and computational efficiency. These results demonstrate that $\mathcal{L}_{\mathrm{sparse}}$ is a critical component for learning compact yet discriminative representations under weak slide-level supervision.


\begin{table}[ht]
\centering
\caption{
Ablation study on sparsity regularization.
Effect of removing the $\ell_1$ sparsity loss $\mathcal{L}_{\mathrm{sparse}}$
on performance and compression behavior (TCGA-NSCLC).
}
\label{tab:sparse_ablation}
\resizebox{0.9\linewidth}{!}{
\begin{tabular}{lcccc}
\hline
\textbf{Setting} 
& \textbf{Acc (\%)} 
& \textbf{AUC (\%)} 
& \textbf{Tokens Retained} 
& \textbf{Inference Time (ms)} \\
\hline
DTC-WSI w/o $\mathcal{L}_{\mathrm{sparse}}$ 
& 96.9 
& 97.6 
& 0.52 
& 610 \\
\hline
DTC-WSI (full, Ours) 
& \textbf{98.3} 
& \textbf{98.9} 
& \textbf{0.40} 
& \textbf{410} \\
\hline
\end{tabular}}
\end{table}

% We conduct two ablation studies to evaluate the contributions of  
% (1) \textbf{multi-stage token compression} and  
% (2) the interaction between \textbf{similarity-guided merging} and \textbf{importance-guided pruning}.  
% Results across all datasets show that each component of DTC-WSI is critical for achieving optimal performance.

% Table~\ref{tab:multistage_ablation_labels} reports accuracy across retention ratios 
% $r \in \{0, 0.7, 0.5, 0.4\}$.  
% Performance consistently improves as redundant tokens are removed and the representation becomes more focused.  
% For example, on \textit{TCGA-NSCLC}, accuracy increases from \textbf{94.6\%} (no compression)  
% to \textbf{96.1\%} (r=0.7), \textbf{97.1\%} (r=0.5), and peaks at \textbf{98.3\%} when retaining only 40\% of tokens.  
% Similar trends are observed on \textit{TCGA-BRCA} (\textbf{93.9\%} → \textbf{97.4\%}),  
% \textit{TCGA-RCC} (\textbf{92.8\%} → \textbf{96.8\%}), and \textit{PANDA} (\textbf{91.2\%} → \textbf{94.8\%}).  
% These results confirm that multi-stage compression acts as a curriculum: early stages preserve global context, while later stages refine attention to diagnostically salient regions, improving both accuracy and robustness.

% Table~\ref{tab:merge_prune_ablation_updated} further isolates the contributions of merging and pruning.  
% Using \textbf{only merging} already yields substantial gains (e.g., \textbf{96.4\%} on NSCLC), as redundant tissue regions are fused into compact representations.  
% \textbf{Only pruning} likewise improves performance (e.g., \textbf{95.7\%} on NSCLC) by removing low-saliency patches.  
% However, replacing the importance network with \textbf{random token selection} substantially degrades accuracy across all datasets, demonstrating the need for learned saliency during compression.

% The full DTC-WSI pipeline---combining similarity-guided merging, saliency-aware pruning, and a learned importance network---achieves the \textbf{highest accuracy on every dataset}, including  
% \textbf{98.3\%} (NSCLC), \textbf{97.4\%} (BRCA), \textbf{96.8\%} (RCC), and \textbf{94.8\%} (PANDA).  
% These ablations highlight that merging and pruning are complementary: merging eliminates redundancy, pruning removes noise, and importance-guidance ensures compression is both structured and diagnostically meaningful.

% \begin{table*}[ht]
% \centering
% \caption{Ablation study comparing merging and pruning strategies across four datasets. 
% Metrics reported as Accuracy (Acc) and AUC (\%).}
% \label{tab:merge_prune_ablation_updated}
% \resizebox{\textwidth}{!}{
% \begin{tabular}{|l|c|c|c|c|c|c|c|c|}
% \hline
% \textbf{Model Variant} &
% \multicolumn{2}{c|}{\textbf{TCGA-NSCLC}} &
% \multicolumn{2}{c|}{\textbf{TCGA-BRCA}} &
% \multicolumn{2}{c|}{\textbf{TCGA-RCC}} &
% \multicolumn{2}{c|}{\textbf{PANDA}} \\
% \cline{2-9}
% & Acc & AUC & Acc & AUC & Acc & AUC & Acc & AUC \\
% \hline

% \textbf{Only Merge}
% & 96.4 & 97.6
% & 95.3 & 96.3
% & 94.5 & 95.7
% & 93.0 & 94.2 \\
% \hline

% \textbf{Only Prune}
% & 95.7 & 96.9
% & 94.7 & 95.8
% & 93.9 & 95.1
% & 92.4 & 93.5 \\
% \hline

% \textbf{Random Token Selection}
% & 94.2 & 95.4
% & 93.1 & 94.3
% & 92.5 & 93.8
% & 90.9 & 92.1 \\
% \hline

% \textbf{Ours (Merge + Prune)}
% & \textbf{98.3} & \textbf{98.9}
% & \textbf{97.4} & \textbf{97.9}
% & \textbf{96.8} & \textbf{97.5}
% & \textbf{94.8} & \textbf{95.6} \\
% \hline
% \end{tabular}}
% \end{table*}




\begin{figure}
\centering
\includegraphics[width=0.7\textwidth]{DTC_visualization1.pdf}
\caption{
Visualization of multi-stage token compression in DTC-WSI: (A) Original WSI with post-merging and post-pruning heatmaps. (B–C) Similar patches merged into unified tokens (green), and (D) low-saliency patches removed by pruning (red).
}
 \label{fig2}
\end{figure}



\begin{figure}
\centering
\includegraphics[width=0.7\textwidth]{DTC_visualization_r1.pdf}
\caption{
Visualization of multi-stage token compression in DTC-WSI across retention ratios $r \in \{1.0, 0.7, 0.5, 0.4\}$. Example merged patches (e.g., adipose or stroma) are shown in blue boxes, pruned patches (e.g., background or slide borders) in red boxes, and high-importance patches retained for final prediction (e.g., tumor regions) in green boxes, highlighting diagnostically relevant tissue patterns.}


 \label{fig3}
\end{figure}

\section{Visualization of Token Compression}

Figure~\ref{fig2} illustrates the full multi-stage compression process performed by DTC-WSI. 
Panel (A) shows the original WSI along with overlaid heatmaps depicting the model output after 
similarity-guided token merging and after importance-guided pruning. Panels (B) and (C) present 
examples of visually similar patches that are merged into unified representations; these merged 
groups are highlighted with green borders, demonstrating how redundant regions—such as uniform 
stromal areas or repeated tumor patterns—are effectively consolidated. Panel (D) displays patches 
removed through importance-guided pruning, marked with red borders, revealing low-saliency regions 
that contribute minimally to the slide-level prediction.  Overall, these visualizations show that DTC-WSI performs structured, interpretable compression: reducing redundancy through merging while selectively pruning non-informative regions, ultimately preserving the most diagnostically meaningful tissue patterns.

Figure~\ref{fig3} illustrates how DTC-WSI progressively compresses a whole-slide image as the
token retention ratio decreases from $r=1.0$ (no compression) to $r=0.7$, $r=0.5$, and $r=0.4$.
For each retention level, we visualize the WSI after applying similarity-guided token merging and
importance-guided pruning.
Each panel illustrates the effect of these operations as the token budget decreases.
Example merged patches (blue boxes) correspond to visually homogeneous and redundant regions
(e.g., adipose or stromal tissue), which are fused to reduce redundancy.
Pruned patches (red boxes) highlight low-saliency or non-informative regions (e.g., slide borders
or artifacts) that are removed during compression.
In contrast, high-importance patches retained for final prediction (green boxes) capture
diagnostically relevant tumor tissue patterns.
At $r=1.0$, all extracted patches are preserved, resulting in a dense and highly redundant
representation, whereas by $r=0.4$ the representation becomes substantially more compact while
preserving salient tissue structures.
These visualizations demonstrate that DTC-WSI performs semantically meaningful compression by
preserving informative regions while aggressively removing redundancy, thereby concentrating
model capacity on morphologically relevant content and enabling both computational efficiency and
improved predictive performance.


\section{Conclusion}

We presented \textbf{DTC-WSI}, a scalable framework for token-efficient whole-slide image analysis.
By combining similarity-guided merging with importance-guided pruning in a progressive
multi-stage pipeline, DTC-WSI removes redundancy while preserving diagnostically essential
information. The method supports differentiable compression during training and deterministic
reduction at inference, achieving \textbf{5--10$\times$ token reduction}, \textbf{up to 5.3$\times$ faster inference},
and \textbf{20--40\% lower memory usage} without sacrificing accuracy. Across four benchmark datasets,
DTC-WSI improves classification performance by \textbf{2--4\%}, demonstrating that compression can
enhance representation quality rather than degrade it. 
% More broadly, our results show that \emph{structured token merging}, guided by learned importance,
% offers a powerful alternative to pruning alone for large-scale vision tasks. 



\newpage
% \begin{thebibliography}{99}
\bibliography{midl26_363}


\appendix

\section{Whole Slide Image Preprocessing}

Whole-slide image (WSI) preprocessing begins with automated tissue segmentation. Each WSI is first loaded into memory at a downsampled resolution, such as 20$\times$, and converted from RGB to HSV colorspace. Tissue regions (foreground) are identified by thresholding the saturation channel after applying median blurring to smooth edges. A binary mask is then generated and refined using morphological closing to eliminate small gaps and holes. The contours of detected tissue regions are filtered based on an area threshold, ensuring only relevant regions are retained for further processing. The segmentation mask for each slide is also available for optional visual inspection. To facilitate manual adjustments, a human-readable text file is generated, listing processed files along with editable segmentation parameters. Once segmentation is complete, $256\times256$ patches are extracted from within the segmented contours at the specified magnification. These patches, along with their coordinates and slide metadata, are stored in the HDF5 hierarchical data format. The number of extracted patches per slide varies significantly---ranging from hundreds in biopsy slides at 20$\times$ magnification to hundreds of thousands in large resection slides at 40$\times$ magnification.


\section{Ablation study}

\paragraph{Comparison of Different Token Retention Ratios.}

The extended ablation in Table~\ref{tab:extended_compression_ablation} evaluates DTC-WSI under a wide range of token retention ratios (\( r \in [0.3, 0.8] \)) across four benchmark datasets. Performance improves consistently as redundant tokens are removed, with accuracy rising steadily from \( r=0.8 \) to \( r=0.5 \) on all cohorts. The model achieves its best results at \( r = 0.4 \), reaching \textbf{98.3\%} (NSCLC), \textbf{97.4\%} (BRCA), \textbf{96.8\%} (RCC), and \textbf{94.8\%} (PANDA), demonstrating that moderate compression enhances discriminative focus while preserving essential morphology. When compression becomes too aggressive (\( r = 0.3 \)), performance drops sharply---e.g., NSCLC declines from \textbf{98.3\%} to \textbf{90.4\%}---indicating loss of critical diagnostic tokens. These results highlight a clear inverted-U-shaped trend: light compression reduces redundancy, moderate compression maximizes accuracy, and over-compression degrades performance. Overall, the study confirms that DTC-WSI benefits most from token retention around \( r = 0.4 \), where efficiency and predictive power are jointly optimized.


\begin{table*}[ht]
\centering
\caption{
Extended ablation study evaluating token retention ratios 
across four datasets.
Metrics reported as Accuracy (Acc) and AUC (\%). 
}
\label{tab:extended_compression_ablation}
\resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\hline
\textbf{Retention Ratio (r)} &
\multicolumn{2}{c|}{\textbf{TCGA-NSCLC}} &
\multicolumn{2}{c|}{\textbf{TCGA-BRCA}} &
\multicolumn{2}{c|}{\textbf{TCGA-RCC}} &
\multicolumn{2}{c|}{\textbf{PANDA}} \\
\cline{2-9}
& Acc & AUC & Acc & AUC & Acc & AUC & Acc & AUC \\
\hline

\textbf{r = 0.8} 
& 95.6 & 96.6 
& 94.6 & 95.6
& 93.6 & 94.7
& 92.0 & 93.3 \\
\hline

\textbf{r = 0.7}
& 96.1 & 97.2 
& 95.1 & 96.1
& 94.3 & 95.4
& 92.8 & 93.9 \\
\hline

\textbf{r = 0.6}
& 96.6 & 97.5 
& 95.6 & 96.6
& 95.0 & 95.9
& 93.3 & 94.4 \\
\hline

\textbf{r = 0.5}
& 97.1 & 98.0 
& 96.2 & 97.0
& 95.4 & 96.4
& 93.8 & 94.9 \\
\hline

\textbf{r = 0.4 (Best)}
& \textbf{98.3} & \textbf{98.9}
& \textbf{97.4} & \textbf{97.9}
& \textbf{96.8} & \textbf{97.5}
& \textbf{94.8} & \textbf{95.6} \\
\hline

\textbf{r = 0.3}
& 90.4 & 92.2
& 93.5 & 94.2
& 88.8 & 89.8
& 85.9 & 86.8 \\
\hline

\end{tabular}}
\end{table*}


% \paragraph{Encoder and backbone ablation.}
% Table~\ref{tab:encoder_backbone_ablation} analyzes the impact of encoder choice and MIL backbone on predictive performance and computational efficiency. Across both encoders (CONCH and Virchow2) and MIL backbones (ABMIL and TransMIL), DTC-WSI consistently improves accuracy while substantially reducing GPU memory usage and inference time. For instance, with the CONCH encoder, DTC-ABMIL improves accuracy from 94.7\% to 98.3\% while reducing GPU memory by more than $2\times$ and inference time by over $5\times$. Similar trends are observed for transformer-based aggregation, where DTC-TransMIL outperforms vanilla TransMIL while reducing memory consumption and runtime by approximately $3$--$5\times$. Importantly, these gains are consistent across encoders of different strengths, indicating that the improvements stem from the proposed dynamic token compression rather than the upstream feature extractor. Overall, the results demonstrate that DTC-WSI is both encoder-agnostic and backbone-agnostic, providing a favorable accuracy--efficiency trade-off for both lightweight and transformer-based MIL pipelines.


% \begin{table*}[ht]
% \centering
% \caption{
% Ablation of encoder choice and MIL backbone.
% All models are evaluated under identical data splits and training protocols.
% DTC consistently improves both accuracy and efficiency across encoders and backbones.
% }
% \label{tab:encoder_backbone_ablation}
% \resizebox{\textwidth}{!}{
% \begin{tabular}{|l|c|c|c|c|c|}
% \hline
% \textbf{Model} 
% & \textbf{Encoder} 
% & \textbf{Tokens Retained} 
% & \textbf{Acc (\%)} 
% & \textbf{GPU Mem (GB)} 
% & \textbf{Inference Time (ms/WSI)} \\
% \hline

% ABMIL 
% & CONCH 
% & 1.0 
% & 94.7 
% & 14.2 
% & 2150 \\
% \hline

% DTC-ABMIL 
% & CONCH 
% & 0.4 
% & \textbf{98.3} 
% & \textbf{6.4} 
% & \textbf{410} \\
% \hline

% TransMIL 
% & CONCH 
% & 1.0 
% & 95.2 
% & 18.9 
% & 2740 \\
% \hline

% DTC-TransMIL 
% & CONCH 
% & 0.4 
% & \textbf{98.9} 
% & \textbf{7.8} 
% & \textbf{480} \\
% \hline

% ABMIL 
% & Virchow2 
% & 1.0 
% & 92.1 
% & 14.5 
% & 1810 \\
% \hline

% DTC-ABMIL 
% & Virchow2 
% & 0.4 
% & \textbf{96.0} 
% & \textbf{6.6} 
% & \textbf{350} \\
% \hline

% TransMIL 
% & Virchow2 
% & 1.0 
% & 94.3 
% & 19.1 
% & 2210 \\
% \hline

% DTC-TransMIL 
% & Virchow2 
% & 0.4 
% & \textbf{97.3} 
% & \textbf{8.1} 
% & \textbf{430} \\
% \hline

% \end{tabular}
% }
% \end{table*}

% \paragraph{Ablation on Sparsity Regularization.}
% We analyze the effect of the sparsity regularizer $\mathcal{L}_{\mathrm{sparse}}$ applied to the importance scores by training DTC-WSI with and without the $\ell_1$ penalty. Removing $\mathcal{L}_{\mathrm{sparse}}$ results in denser importance distributions, which weakens the pruning behavior and leads to higher token retention and increased inference cost. This also causes a consistent degradation in classification performance. In contrast, incorporating the sparsity term encourages selective saliency assignment, stabilizes multi-stage token compression, and yields both improved accuracy and computational efficiency. These results demonstrate that $\mathcal{L}_{\mathrm{sparse}}$ is a critical component for learning compact yet discriminative representations under weak slide-level supervision.


% \begin{table}[ht]
% \centering
% \caption{
% Ablation study on sparsity regularization.
% Effect of removing the $\ell_1$ sparsity loss $\mathcal{L}_{\mathrm{sparse}}$
% on performance and compression behavior (TCGA-NSCLC).
% }
% \label{tab:sparse_ablation}
% \resizebox{0.9\linewidth}{!}{
% \begin{tabular}{lcccc}
% \hline
% \textbf{Setting} 
% & \textbf{Acc (\%)} 
% & \textbf{AUC (\%)} 
% & \textbf{Tokens Retained} 
% & \textbf{Inference Time (ms)} \\
% \hline
% DTC-WSI w/o $\mathcal{L}_{\mathrm{sparse}}$ 
% & 96.9 
% & 97.6 
% & 0.52 
% & 610 \\
% \hline
% DTC-WSI (full, Ours) 
% & \textbf{98.3} 
% & \textbf{98.9} 
% & \textbf{0.40} 
% & \textbf{410} \\
% \hline
% \end{tabular}}
% \end{table}


\section{Visualization of Token Compression}

We provide visualizations to illustrate how DTC-WSI compresses WSIs while preserving diagnostically
important tissue. In Figure~\ref{fig4}, the first panel shows the original WSI of a lung adenocarcinoma, and the second panel
visualizes similarity-guided merging by assigning identical interior and boundary colors to patches
that are merged into a single token. This reveals how homogeneous tissue regions—such as smooth
stroma or repeated tumor textures—are consolidated into compact groups, while heterogeneous or
diagnostically subtle regions remain unmerged. The third panel displays the result of
importance-guided pruning, where tokens with low saliency scores are removed entirely, leaving a
focused set of highly informative patches concentrated around tumor-rich or otherwise relevant
regions. Together, these visualizations demonstrate that DTC-WSI performs structured and
interpretable compression, reducing redundancy while retaining the critical morphological patterns
needed for accurate WSI classification.

\begin{figure*}
\centering
\includegraphics[width=0.8\textwidth]{DTC_visualization.pdf}
\caption{
Visualization of the multi-stage token compression in DTC-WSI. 
(Left) Original WSI thumbnail. 
(Middle) Similarity-guided merging groups redundant patches into shared representations (patches with the same inner and border color are merged together).
(Right) Importance-guided pruning removes low-saliency tokens.
} \label{fig4}
\end{figure*}

\section{Algorithm of DTC-WSI}
\begin{algorithm}[t]
\caption{Dynamic Token Compression for Whole-Slide Images (DTC-WSI)}
\label{alg:dtc_wsi}
\KwIn{Patch features $H^{(0)} = \{h_i^{(0)}\}_{i=1}^{N^{(0)}}$, \# stages $T$, target token counts $\{N^{(t)}\}_{t=1}^{T}$, mode $\in \{\text{train}, \text{infer}\}$}
\KwOut{Compressed token set $H^{(T)}$}

\For{$t = 0$ \KwTo $T-1$}{
    $N^{(t)} \leftarrow |H^{(t)}|$ \tcp*{current \#tokens}
    
    \tcc{1. Importance estimation}
    \For{$i = 1$ \KwTo $N^{(t)}$}{
        $s_i^{(t)} \leftarrow g_\phi(h_i^{(t)})$ \tcp*{importance score} 
    }
    $\alpha^{(t)} \leftarrow \mathrm{softmax}(s^{(t)})$ \tcp*{normalized importance}
    
    \tcc{2. Bipartite soft matching for token fusion}
    \tcp{Interleaved partition: odd indices $\rightarrow A$, even indices $\rightarrow B$}
    $A \leftarrow [1,3,5,\dots]$, \quad $B \leftarrow [2,4,6,\dots]$\;
    Let $L = \min(|A|, |B|)$\;
    
    \For{$k = 1$ \KwTo $L$}{
        $i \leftarrow A_k$, \quad $j \leftarrow B_k$\;
        $\text{sim}_{ij} \leftarrow 
        \dfrac{\langle h_i^{(t)}, h_j^{(t)} \rangle}
              {\|h_i^{(t)}\|\,\|h_j^{(t)}\|}$\;
        $u_{ij}^{(t)} \leftarrow 
        \lambda \,\text{sim}_{ij} 
        - (1-\lambda)\,|\alpha_i^{(t)} - \alpha_j^{(t)}|$\;
    }
    
    \tcp{Number of pairs to merge}
    $K^{(t)} \leftarrow \max\big(0,\, N^{(t)} - N^{(t+1)}\big)$\;
    Select top-$K^{(t)}$ pairs $\mathcal{P}^{(t)}$ ranked by $u_{ij}^{(t)}$ in descending order\;
    
    \tcc{3. Merge selected pairs}
    Initialize $H_{\text{merge}}^{(t+1)} \leftarrow \emptyset$, mark all indices as ``unassigned''\;
    
    \ForEach{$(i,j) \in \mathcal{P}^{(t)}$ with both $i,j$ unassigned}{
        $\tilde{h}_i^{(t)} \leftarrow 
        \dfrac{\alpha_i^{(t)} h_i^{(t)} + \alpha_j^{(t)} h_j^{(t)}}
              {\alpha_i^{(t)} + \alpha_j^{(t)}}$\;
        Add $\tilde{h}_i^{(t)}$ to $H_{\text{merge}}^{(t+1)}$\;
        Mark $i$ and $j$ as ``assigned''\;
    }
    
    \tcc{4. Carry over unmerged tokens}
    $H_{\text{carry}}^{(t+1)} \leftarrow \{h_k^{(t)} \mid k \text{ unassigned}\}$\;
    $H_{\text{raw}}^{(t+1)} \leftarrow H_{\text{merge}}^{(t+1)} \cup H_{\text{carry}}^{(t+1)}$\;
    
    \tcc{5. Importance-guided pruning}
    \If{mode = train}{
        \tcp{soft, differentiable pruning}
        \ForEach{$h_k^{(t+1)} \in H_{\text{raw}}^{(t+1)}$}{
            $m_k^{(t)} \leftarrow \sigma\!\big(\gamma(\alpha_k^{(t)} - \tau)\big)$\;
            $h_k^{(t+1)} \leftarrow m_k^{(t)}\, h_k^{(t+1)}$\;
        }
        $H^{(t+1)} \leftarrow H_{\text{raw}}^{(t+1)}$\;
    }
    \Else{
        \tcp{hard top-$N^{(t+1)}$ pruning at inference}
        Rank all $h_k^{(t+1)} \in H_{\text{raw}}^{(t+1)}$ by $\alpha_k^{(t)}$\;
        $H^{(t+1)} \leftarrow \text{TopK}(H_{\text{raw}}^{(t+1)}, \alpha^{(t)}, N^{(t+1)})$\;
    }
}
\Return $H^{(T)}$
\end{algorithm}






\end{document}
