\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{comment}
\usepackage{adjustbox}
\usepackage{caption}
\usepackage{xcolor}
\jmlrvolume{-- 137}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[PRIME]{Training-Free Dataset Pruning for Polyp Segmentation via Community Detection in Similarity Networks}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
  \midlauthor{\Name{Md Mostafijur Rahman} \Email{mostafijur.rahman@utexas.edu} %\and
  \\ \Name{Radu Marculescu} \Email{radum@utexas.edu}\\
   \addr The University of Texas at Austin}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
%\midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
%\addr $^{1}$ Address 1 \\
%\addr $^{2}$ Address 2 \AND
%\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
%\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
%\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
%\addr $^{3}$ Address 3 \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
%}

\begin{document}

\maketitle

\begin{abstract}
Recent advances in deep learning have been driven by the availability of larger datasets and more complex models; however, this progress comes at the expense of substantial computational and annotation costs. To address these issues, we introduce a new, training-free dataset pruning method, \textit{PRIME}, targeting polyp segmentation in medical imaging. To this end, \textit{PRIME} constructs a similarity network among images in the target dataset and then applies community detection to retain a much smaller, yet representative subset of images from the original dataset. Unlike existing methods that require model training for dataset pruning, our \textit{PRIME} completely avoids model training, thus significantly reducing computational demands. Pruning the training dataset reduces data annotation costs by 56.2\% and enables 2.3$\times$ faster training of polyp segmentation models compared to training on the entire annotated dataset, with only a 0.5\% drop in the DICE score. Consequently, our \textit{PRIME} enables efficient training, fine-tuning, and domain adaptation across medical centers, thus offering a cost-effective solution for deep learning in polyp segmentation. Our implementation is available at \url{https://github.com/SLDGroup/PRIME}.

\end{abstract}

\begin{keywords}
Training-free, Dataset pruning, Polyp segmentation, Community detection
\end{keywords}
\vspace{-0.2cm}
\section{Introduction}
\label{sec:introduction}

Polyp segmentation plays a pivotal role in the early detection and prevention of colorectal cancer, one of the leading causes of cancer-related mortality worldwide. Indeed, accurate segmentation enables clinicians to precisely locate and characterize polyps in colonoscopy images, thus guiding therapeutic decisions and follow-up strategies. Recent advances in deep learning \cite{ronneberger2015u,zhou2018unet++,fan2020pranet,dong2021polyp,Rahman_2023_WACV,wang2022stepwise,Rahman_2024_CVPR} have brought remarkable improvements in segmentation performance, but these gains come at the cost of increasingly larger datasets with pixel-level annotations. Such annotations demand substantial time and expertise from medical professionals, thus making data curation expensive and labor-intensive. In addition, the heterogeneous appearance of polyps, varying in size, shape, texture, and contrast, exacerbates the challenge of gathering representative training images to ensure robust model generalization.


While efforts in image segmentation research have focused primarily on improving model performance, less attention has been paid to reducing the annotation burden through efficient data selection. Some earlier efforts address the selection of informative data samples under various learning paradigms, particularly in active learning \cite{settles1995active}. In these methods, scoring functions, such as Shannon’s entropy \cite{shannon1948mathematical}, variation ratio \cite{linton1965elementary}, and Monte Carlo (MC) dropout \cite{gal2016dropout}, are used to identify unlabeled images that maximize the informativeness of the annotated dataset under a limited annotation budget. While these approaches have shown promise in classification tasks \cite{gal2017deep}, less effort has been put into semantic segmentation \cite{gorriz2017cost}. One notable contribution addressing data selection for segmentation is KnowWhatToLabel \cite{dawoud2023knowing}, which introduces a scoring function to construct a set of support samples for few-shot microscopy image cell segmentation. Although this method outperforms traditional scoring functions like Shannon's entropy and MC-dropout, it relies on model training to compute scores, which leads to high computational costs. Additionally, data selection remains underexplored in medical image segmentation tasks like polyp segmentation. As polyp segmentation continues to grow in importance for clinical practice, efficient strategies for curating high-quality datasets are increasingly necessary.

\begin{figure}[t]
\vspace{-0.5cm}
\begin{center}
\includegraphics[width=0.9\linewidth]{images/clinicdb_network_550nodes_1510edges_226clusters0.843mo.png}
\end{center}
\vspace{-0.8cm}
\caption{Communities identified with Structural Similarity Index (SSIM) threshold of 0.92 in the CVC-ClinicDB dataset \cite{bernal2015wm}. Gephi’s algorithm \cite{blondel2008fast} identifies 226 communities with a modularity of 0.843. The degree distribution shows the number of nodes with varying connectivity (0 to 30, left-skewed), while the community distribution highlights the number of nodes per community (0 to $<$50). By considering top $\lceil 10\% \rceil$ nodes from each community, we select only 245 (44.5\%) representative nodes and prune the rest of the nodes (55.5\%). More results are shown in Appendix \ref{asec:network_analysis}.} 
\label{fig:network_with_communities}
\vspace{-0.55cm}
\end{figure}

To address these limitations, we propose what is, to the best of our knowledge, the first \textit{training-free dataset pruning} method for polyp segmentation, namely Pruning by Representation, Image-based Modeling, and Evaluation (\textit{PRIME}). Rather than relying on model-based metrics, \textit{PRIME} constructs a similarity network to quantify the similarity among images in a target dataset. \textit{PRIME} then exploits community detection \cite{blondel2008fast} (see Fig. \ref{fig:network_with_communities}) to retain a diverse and representative subset of the original dataset, thus effectively removing redundant data. Critically, our \textit{PRIME} does not require model training, thus reducing the computational costs associated with data reduction. \textit{PRIME} ultimately reduces annotation effort, enables faster segmentation model training, and reduces domain adaptation costs across medical centers. Our main contributions are as follows:

\begin{itemize}
\vspace{-0.3cm}
    \item \textbf{Training-Free Dataset Pruning:} We introduce a new, training-free dataset pruning method, \textit{PRIME}, that first constructs a similarity network among images in the target dataset; then detects communities to select a much smaller, yet diverse and representative subset of the original data, thus eliminating redundant images.
    \vspace{-0.3cm}
    \item \textbf{Robust Generalizability Across Datasets:} By retaining a diverse and representative subset while pruning redundant images, our \textit{PRIME} consistently achieves high DICE scores across multiple datasets and similarity metrics. This robustness underscores the broad applicability of \textit{PRIME} to polyp segmentation.
    \vspace{-0.3cm}
    \item \textbf{Reduction in Annotation and Computational Costs:} Our \textit{PRIME} achieves up to 56.2\% data reduction, thus lowering the annotation effort while maintaining high segmentation performance with only a 0.5\% drop in the DICE score. The training-free design of our pruning algorithm reduces overall computational costs and enables 2.3$\times$ faster training of segmentation models compared to training on the entire dataset.
    \vspace{-0.3cm}
\end{itemize}

The remainder of the paper is organized as follows: Section~\ref{sec:related_work} discusses related work on polyp segmentation and dataset pruning methods. Section~\ref{sec:method} describes our method. Section~\ref{sec:experiments} presents the experimental evaluations. Section~\ref{sec:conclusion} summarizes our main contributions.

\vspace{-0.2cm}    
\section{Related Work}
\label{sec:related_work}

\subsection{Polyp Segmentation}
Polyp segmentation has been extensively studied in medical imaging due to its paramount importance in the diagnosis and prevention of colorectal cancer. Deep learning, particularly U-shaped convolutional neural networks (CNN) and vision transformers such as U-Net \cite{ronneberger2015u}, Attention UNet \cite{oktay2018attention}, UNet++ \cite{zhou2018unet++}, UNet 3+ \cite{huang2020unet}, DeepLabv3+ \cite{chen2018encoder}, PraNet \cite{fan2020pranet}, PolypPVT \cite{dong2021polyp}, CASCADE \cite{Rahman_2023_WACV}, SSFormer \cite{wang2022stepwise}, G-CASCADE \cite{Rahman_2024_WACV}, and EMCAD \cite{Rahman_2024_CVPR}, have demonstrated remarkable performance for polyp segmentation. However, these models typically require large-scale pixel-level annotated datasets, which leads to substantial annotation costs. Furthermore, the variability in polyp size, shape, and appearance further exacerbates the need for diverse training data, which can be prohibitively expensive in terms of both labeling and computational resources.

\vspace{-0.2cm}
\subsection{Dataset Pruning and Data Selection}
\label{subsec:dataset_pruning}
Dataset pruning and data selection aim to retain the most representative samples from a dataset while removing redundancies, an objective that is crucial for polyp segmentation due to the high cost of pixel-level annotations. Existing methods primarily target classification tasks, relying on training-based metrics or scalar scores like compactness~\cite{castro2018end, yang2022dataset}, diversity~\cite{aljundi2019gradient}, or forgetfulness~\cite{toneva2018empirical} to identify the most informative samples. Another line of work focuses on synthesizing smaller yet informative datasets via distillation~\cite{such2020generative,wang2018dataset} or condensation~\cite{zhao2021dataset}; however, these methods also involve complex training processes.

In the realm of semantic segmentation, especially medical image segmentation, dataset pruning remains comparatively underexplored. Most existing pruning or data selection methods have been developed and validated on classification benchmarks, with only a few adaptations to segmentation tasks. For instance, KnowWhatToLabel~\cite{dawoud2023knowing} introduces a consistency-based method to select training samples to reduce annotation costs in few-shot microscopy image cell segmentation. Nevertheless, this method still depends on model training and has not been extended to polyp segmentation. The unique characteristics of polyp segmentation, including the heterogeneous appearance of polyps and the need for meticulous pixel-level labeling, further underscore the need for specialized and efficient data selection/pruning methods.
\vspace{-0.2cm}
\subsection{Existing Knowledge Gaps and the Need for a Training-Free Approach}
Despite significant advances, a key knowledge gap persists: Existing solutions either require model training at some stage or focus on other domains without specifically addressing polyp segmentation. Given the high cost of annotations and logistical constraints in collecting and sharing medical data across institutions, computationally lightweight methods become crucial. Training-heavy pipelines exacerbate these challenges, particularly when dealing with large-scale, high-resolution datasets under strict privacy regulations.

We address these limitations by introducing a \emph{training-free dataset pruning} method designed explicitly for polyp segmentation. Rather than relying on gradient-based or generative scoring, we construct a similarity network among images and then apply community detection to isolate a diverse yet compact subset of the dataset. Our method not only substantially reduces the computational cost, but also alleviates the annotation burden, thus paving the way for faster and more cost-effective polyp segmentation workflows.

\vspace{-0.2cm}
\section{Method}
\label{sec:method}
In this section, we formally define the problem of dataset pruning and describe the two key components of our \textit{PRIME}: similarity network construction and sample selection.

\vspace{-0.2cm}
\subsection{Dataset Pruning: Problem Definition}
Let $\mathcal{D} = \{x_1, x_2, \dots, x_n\}$ represent the original training dataset, where each $x_i$ is an image, and $n$ is the total number of images in the dataset. Our goal is to select a subset $\mathcal{D'} \subset \mathcal{D}$, such that $|\mathcal{D'}| = m$ and $m \ll n$, while preserving the generalizability of the model. Specifically, we want to minimize the performance difference between a model trained on the original dataset $\mathcal{D}$ and the pruned dataset $\mathcal{D'}$, i.e.: \vspace{-0.2cm}
\begin{equation}
\min_{\mathcal{D'}} \mathbb{E}_{x \in \mathcal{D}} \left[ \mathcal{L}(f_{\mathcal{D}}(x), f_{\mathcal{D'}}(x)) \right]
\vspace{-0.2cm}
\end{equation}
where $f_{\mathcal{D}}$ and $f_{\mathcal{D'}}$ represent models trained on the original dataset and the pruned dataset, respectively, and $\mathcal{L}$ denotes the loss function (e.g., cross-entropy or DICE). Our goal is to select $\mathcal{D'}$ such that it is significantly smaller than $\mathcal{D}$, but retains its diversity and representativeness, thus minimizing performance degradation compared to the dataset $\mathcal{D}$.

\vspace{-0.2cm}
\subsection{Similarity Network Construction}
For efficient pruning, we first construct a similarity network $\mathcal{G} = (\mathcal{V}, \mathcal{E})$ from the training images, where each node $v_i \in \mathcal{V}$ represents an image $x_i \in \mathcal{D}$, and the edges $\mathcal{E}$ capture the similarity between image pairs. The similarity between any image pair ($x_i$,$x_j$) is quantified using a metric such as structural similarity index (SSIM)\footnote{SSIM similarity achieves slightly better pruning results than PCC as shown in Figs. \ref{fig:clinicdb_prune_dice_time} and \ref{fig:kvasir_prune_dice_time}.} or Pearson's correlation coefficient (PCC). More precisely, the $SSIM \in [0,1]$ of two images is calculated as in Eq. \ref{eq:ssim}: \vspace{-0.15cm}
%\[
%\text{SSIM}(x_i, x_j) = \frac{(2 \mu_i \mu_j + C_1)(2 \sigma_{ij} + C_2)}{(\mu_i^2 + \mu_j^2 + C_1)(\sigma_i^2 + \sigma_j^2 + C_2)},
%\]
\small
\begin{equation}
SSIM(x_i, x_j) = 
\frac{(2 \mu_i \mu_j + C_1)(2 \sigma_{ij} + C_2)}
{(\mu_i^2 + \mu_j^2 + C_1)(\sigma_i^2 + \sigma_j^2 + C_2)}
\label{eq:ssim}
\vspace{-0.15cm}
\end{equation}
\normalsize
where \(\mu_i\) and \(\mu_j\) are mean intensities (average pixel values) of images \(x_i\) and \(x_j\), \(\sigma_i^2\) and \(\sigma_j^2\) are their variances, and \(\sigma_{ij}\) is their covariance. Small positive constants \(C_1\) and \(C_2\) prevent instability in the division when variances or covariances are close to zero.

The $PCC \in [-1,1]$ between a pair of images is computed as in Eq. \ref{eq:pcc}: \vspace{-0.15cm}
\small
\begin{equation}
\label{eq:pcc}
PCC(x_i, x_j) = \frac{\sum (x_i - \bar{x}_i)(x_j - \bar{x}_j)}{\sqrt{\sum (x_i - \bar{x}_i)^2} \sqrt{\sum (x_j - \bar{x}_j)^2}}
\end{equation}
\vspace{0.15cm}
\normalsize
where $\bar{x}_i$ and $\bar{x}_j$ are mean intensities (average pixel values) of images $x_i$ and $x_j$.
 
For each image pair \((x_i, x_j)\), an edge \(e_{ij} \in \mathcal{E}\) is created between nodes \(v_i\) and \(v_j\) if their SSIM or PCC similarity meets or exceeds a threshold \(\tau\) (set based on the range of similarities in the target dataset). This results in an undirected graph $\mathcal{G}$, where pairs of images with sufficient SSIM or PCC similarities get connected. The adjacency matrix $\mathbf{A}$ of $\mathcal{G}$ is defined in Eq. \ref{eq:adjacency_matrix}: \vspace{-0.15cm}
\begin{equation}
\label{eq:adjacency_matrix}
\mathbf{A}_{ij} =
\begin{cases} 
1 & \text{if } \text{similarity}(x_i, x_j) \geq \tau, \\
0 & \text{otherwise}.
\end{cases}
\end{equation}
\vspace{-0.2cm}

Fig. \ref{fig:workslow_diagram} (left box) illustrates this process. For example, in Fig. \ref{fig:network_with_communities}, the similarity network of the CVC-ClinicDB dataset is shown for an SSIM threshold of $\tau = 0.92$; this network has $550$ nodes and $1510$ edges. More details on this network are provided in Appendix~\ref{asec:network_analysis}.

\vspace{-0.2cm}
\subsection{Sample Selection via Community Detection}
Our pruning method exploits community detection \cite{blondel2008fast} within the similarity network $\mathcal{G}$. Communities in the network correspond to clusters of highly similar images (Fig. \ref{fig:network_with_communities}) which can help us to select a much smaller, yet representative subset from each community. The steps of our pruning method (Fig. \ref{fig:workslow_diagram}, right box) are described next.

\begin{figure}%[t]
\vspace{-0.3cm}
\begin{center}
\includegraphics[width=1\linewidth]{images/workflow_diagram.png}
\end{center}
\vspace{-0.7cm}
  \caption{Proposed workflow diagram (left - network construction of images, right - pruning network to select a subset of images).}
\label{fig:workslow_diagram}
\vspace{-0.55cm}
\end{figure}

\vspace{-0.2cm}
\subsubsection{Community detection}
We use a community detection algorithm, namely the Louvain method \cite{blondel2008fast}, to identify communities in the similarity network. Our goal is to maximize the modularity $Q$ of the network, defined as in Eq. \ref{eq:modularity}: \vspace{-0.2cm}
\small
\begin{equation}
\label{eq:modularity}
Q = \frac{1}{2|\mathcal{E}|} \sum_{i,j} \left[ A_{ij} - \frac{k_i k_j}{2|\mathcal{E}|} \right] \delta(c_i, c_j)
\vspace{-0.2cm}
\end{equation}
\normalsize
where \( |\mathcal{E}| \) is the cardinality of the edge set \( \mathcal{E} \), \( A_{ij} \) denotes adjacency as in Eq. \ref{eq:adjacency_matrix} (1 if nodes \( i \) and \( j \) are connected, 0 otherwise), \( k_i \) and \( k_j \) are the node degrees of nodes \( i \) and \( j \), and \( c_i \) and \( c_j \) indicate the communities of nodes \( i \) and \( j \). The indicator \( \delta(c_i, c_j) = 1 \) if \( c_i = c_j \) (same community), and 0 otherwise. This process identifies the network communities\footnote{We note that, in networks with a weaker structure, these groupings may be less distinct; this issue can be mitigated by using a smaller similarity threshold \( \tau \).} \( \mathcal{C}_1, \mathcal{C}_2, \dots, \mathcal{C}_K \), where each community \( \mathcal{C}_k \) consists of a set of highly similar images.

\vspace{-0.2cm}
\subsubsection{Sample selection}
For each detected community $\mathcal{C}_k$, we retain only a representative subset of nodes (i.e., images) based on the size of the community:
\textit{i)} If $|\mathcal{C}_k| = 1$, we retain the single node, as it represents a unique image in the dataset; \textit{ii)} If $|\mathcal{C}_k| > 1$, we retain the top $\lceil p\% \rceil$ of nodes based on their (higher) node degree within the community; the remaining nodes in the community are pruned. Finally, the retained subset $\mathcal{D'}$ can be represented as in Eq. \ref{eq:retained_dataset}: \vspace{-0.2cm}
\small
\begin{equation}
\label{eq:retained_dataset}
\mathcal{D'} = \bigcup_{k=1}^{K} \mathcal{C}_k^{\lceil p\% \rceil}
\vspace{-0.2cm}
\end{equation} 
\normalsize
where $\mathcal{C}_k^{\lceil p\% \rceil}$ represents the top $\lceil p\% \rceil$ nodes selected from community $\mathcal{C}_k$, and $K$ is the total number of detected communities.

Intuitively, the similarity threshold \(\tau\) controls the network density, with lower values forming fewer, but denser communities and thus enabling higher pruning. A smaller \(\lceil p\% \rceil\) retains fewer nodes per community, which ensures that each community is represented by its most representative images, thus reducing redundancy while preserving diversity.

\section{Experiments}
\label{sec:experiments}
\subsection{Segmentation Network Architectures}  
We use the PVT-v2-b2 (PVT) \cite{wang2022pvt} and ResNet50 (R50) \cite{he2016deep} encoders (which are hierarchical backbones) and extract features from four stages. Then, we use the CASCADE decoder\footnote{Other decoders such as EMCAD \cite{Rahman_2024_CVPR} can also be used, as shown in Appendix \ref{assec:emcad_clinicdb}.} \cite{rahman2023medical} (which is a local attention-based cascaded decoder) to decode and obtain the segmentation outputs of different stages. Finally, we use the output from the last stage to obtain the final segmentation map. We adopt the multi-stage loss aggregation for training as in \cite{rahman2023medical}.


\vspace{-0.2cm}
\subsection{Implementation Details}
We implement all our experiments in PyTorch 1.11.0 and train all models on a single NVIDIA RTX A6000 GPU with 48GB of memory. We use different similarity thresholds ($\tau$) to construct the similarity network. We do not use any data augmentations in our experiments.

We use the AdamW optimizer \cite{loshchilov2017decoupled} with a learning rate and weight decay of 1e-4 in our experiments. We use the combined weighted IoU and weighted Binary Cross Entropy (BCE) loss function for polyp segmentation on the CVC-ClinicDB and Kvasir datasets. We train the model for 200 epochs with a batch size of 16. We also utilize the pre-trained weights on ImageNet for backbone networks. Finally, we report the average DICE score (\%) over five runs for evaluation.

\begin{figure}[h]
\vspace{-0.5cm}
\begin{center}
\includegraphics[width=0.55\linewidth]{images/samples_from_community_vs_dice.png}
\end{center}
\vspace{-0.8cm}
  \caption{Pruning (\%) and DICE scores (\%) vs. images (\%) selected from each community in CVC-ClinicDB dataset (total of 550 images) with PVT-CASCADE model. We use SSIM with a median threshold $\tau$=0.5 for a denser network construction with fewer communities.}
\label{fig:samples_from_community_vs_prune_dice}
\vspace{-0.3cm}
\end{figure}

\begin{table}[h]
\centering
\caption{Experimental results (DICE \%) on the CVC-ClinicDB dataset using PVT-CASCADE and ResNet50 (R50)-CASCADE. $\tau$ is a similarity threshold that controls the density (i.e., pruning rate) of the similarity network. We select the top $\lceil10\%\rceil$ images, ranked by node degree, from each community in SSIM-based similarity networks. }
\vspace{-0.2cm}
\begin{adjustbox}{width=0.9\textwidth}
\begin{tabular}{lccccc}
\hline
\multicolumn{1}{l}{\textbf{Architectures}} & \multicolumn{1}{c}{\textbf{Methods/($\tau$, pruning \%)}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.92, \\55.5\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.95,\\31.3\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.97,\\15.1\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=1, \\0\% pruning)\end{tabular}} \\
\hline
PVT-CASCADE & Entropy \cite{shannon1948mathematical} & 89.82$\pm$1.4 & 91.77$\pm$1.0 & 92.63$\pm$0.8 & \multirow{4}{*}{94.29$\pm$0.5}  \\
PVT-CASCADE & MC-dropout \cite{gal2016dropout} & 90.18$\pm$1.3 & 92.25$\pm$0.9 & 92.81$\pm$0.7 & \\ 
PVT-CASCADE & KnowWhatToLabel \cite{dawoud2023knowing} & 90.34$\pm$1.4 & 92.62$\pm$0.9 & 92.87$\pm$0.8 & \\
PVT-CASCADE & Random & 89.94$\pm$3.3 & 91.99$\pm$2.1 & 92.24$\pm$1.3 &  \\
PVT-CASCADE & \textbf{PRIME (Ours)} & \textbf{92.85$\pm$1.3} & \textbf{94.18$\pm$0.7} & \textbf{94.48$\pm$0.5} &  \\ \hline
R50-CASCADE & Entropy \cite{shannon1948mathematical} & 89.37$\pm$1.5 & 91.43$\pm$0.9 & 92.21$\pm$0.7 &  \multirow{4}{*}{93.97$\pm$0.4} \\             
R50-CASCADE & MC-dropout \cite{gal2016dropout} & 89.96$\pm$1.3 & 91.94$\pm$0.8 & 92.42$\pm$0.7 & \\    R50-CASCADE & KnowWhatToLabel \cite{dawoud2023knowing} & 90.10$\pm$1.3 & 92.29$\pm$0.9 & 92.51$\pm$0.6 &  \\             
R50-CASCADE & Random & 89.53$\pm$3.1 & 91.61$\pm$1.9 & 91.86$\pm$1.4 &  \\               
R50-CASCADE & \textbf{PRIME (Ours)} & \textbf{92.45$\pm$1.1} & \textbf{93.82$\pm$0.8} & \textbf{94.08$\pm$0.4} &  \\ \hline
\end{tabular}
\end{adjustbox}
\label{tab:cvc_clinicdb}
\vspace{-0.5cm}
\end{table}


\begin{figure}[h]
\vspace{-0.5cm}
\begin{center}
\includegraphics[width=0.6\linewidth]{images/clinicdb_prunevsdice_time.png}
\end{center}
\vspace{-0.85cm}
  \caption{Pruning (\%) vs. DICE (left axis) and Time (right axis) on the CVC-ClinicDB dataset \cite{bernal2015wm} (550 images) with the PVT-CASCADE model. SSIM thresholds $\tau$=[1, 0.97, 0.95, 0.92, 0.9, 0.88, 0.85] are used to construct similarity networks, while PCC thresholds are adjusted to achieve similar pruning rates. Training time per epoch is averaged over 200 epochs. Our \textit{PRIME} prunes $55.5\% ^{(\textcolor{blue}{D})}$ of data with only a $1.4\% ^{(\textcolor{blue}{A-D})}$ drop in DICE and achieves 2.3$\times$ faster training compared to training on the entire dataset. Notably, we can prune $15\% ^{(\textcolor{blue}{B})}$ of images with a $0.2\% ^{(\textcolor{blue}{B-A})}$ increase in DICE.}
\label{fig:clinicdb_prune_dice_time}
\vspace{-0.5cm}
\end{figure}

\begin{figure}[t]
\begin{center}
\includegraphics[width=0.6\linewidth]{images/kvasir_prunevsdice_time.png}
\end{center}
\vspace{-0.85cm}
  \caption{Pruning (\%) vs. DICE score (left axis) and Time (right axis) on the Kvasir dataset \cite{jha2020kvasir} (900 images) with the PVT-CASCADE model. Training time per epoch is averaged over 200 epochs. SSIM thresholds $\tau$=[1, 0.97, 0.90, 0.85, 0.8, 0.75, 0.7] are used to construct similarity networks, while PCC thresholds are adjusted to achieve similar pruning rates. Our \textit{PRIME} prunes $56.2\% ^{(\textcolor{blue}{D})}$ of data with only a $0.5\% ^{(\textcolor{blue}{A-D})}$ drop in DICE and achieves 2.3$\times$ faster training compared to training on the entire dataset.}
\label{fig:kvasir_prune_dice_time}
\vspace{-0.7cm}
\end{figure}

\vspace{-0.2cm}
\subsection{Results}
\textbf{Impact of $\lceil p\% \rceil$ community representative:} 
We conducted an ablation study on the CVC-ClinicDB dataset with the PVT-CASCADE model to see the impact of selecting different percentages of representatives from each community (Fig. \ref{fig:samples_from_community_vs_prune_dice}). We can conclude that the DICE score increases only marginally when we select more than $\lceil 10\% \rceil$ from each community. Hence, we select the top $\lceil 10\% \rceil$ samples from each community for model training purposes and prune/remove the rest of the community nodes in all of our experiments.

\noindent\textbf{Generalizability in multiple models:}
Table \ref{tab:cvc_clinicdb} shows the effectiveness of our \textit{PRIME} in improving generalizability across the PVT-CASCADE and R50-CASCADE models. At a similarity threshold of $\tau$=0.92 (i.e., 55.5\% pruning), PVT-CASCADE achieves a DICE score of 92.85\% vs. 89.94\% with random selection, and R50-CASCADE scores 92.45\% compared to 89.53\%. As pruning decreases, our \textit{PRIME} consistently outperforms the random selection. \textit{PRIME} also outperforms KnowWhatToLabel, Entropy, and MC-dropout pruning methods. Even at $\tau$=0.97 (15.1\% pruning), our \textit{PRIME} maintains high DICE scores in both models, closely matching the results obtained with the full dataset. This demonstrates our \textit{PRIME}’s ability to preserve data diversity and ensure robust segmentation performance with significantly reduced data. More results on the Kvasir dataset are shown in Appendix \ref{assec:kvasir_results}.

%\noindent \textbf{Generalizability in multiple datasets and similarity metrics:} From Fig. \ref{fig:clinicdb_prune_dice_time}, we see that our pruning method can prune prune 55.5\% data leading to 2.3$\times$ training speedup with 1.45\% DICE score drop. We see a better trend in Fig. \ref{fig:kvasir_prune_dice_time}, 3.6$\times$ training speedup with only 1.55\% DICE score drop. In general, we can conclude that pruning in our SSIM-based similarity network shows the best performance outperforming both our Pearson correlation coefficient-based pruning and random pruning methods.
%Here, different values of $\tau$ are used for each pruning rate (i.e., smaller $\tau$ larger pruning).

\noindent\textbf{Generalizability on multiple datasets (centers) and similarity metrics:} 
Figs. \ref{fig:clinicdb_prune_dice_time} and \ref{fig:kvasir_prune_dice_time} show the efficacy of our \textit{PRIME} on the CVC-ClinicDB and Kvasir datasets, using both PCC and SSIM similarity metrics. Our \textit{PRIME} consistently yields higher DICE scores than random pruning, even at higher pruning rates. For instance, at 31.3\% pruning on the CVC-ClinicDB (Fig. \ref{fig:clinicdb_prune_dice_time}), our PCC-based pruning achieves 94.02\% DICE score, and SSIM-based pruning reaches 94.2\%, compared to only 92.0\% with random pruning. At 71.9\% pruning on the Kvasir dataset (Fig. \ref{fig:kvasir_prune_dice_time}), our \textit{PRIME} maintains DICE scores of 91.1\% (PCC) and 91.6\% (SSIM), while random pruning drops to 89.6\%. Our \textit{PRIME} also consistently outperforms KnowWhatToLabel, Entropy, and MC-dropout pruning methods. Finally, the training time per epoch decreases from about 100 secs at 0\% pruning to 28 secs at 71.9\% pruning, thus validating the efficiency and robustness of our \textit{PRIME} across datasets or imaging centers.


\begin{table}[t]
    \centering
    
    \caption{Results on a large video polyp dataset (SUN-SEG) \cite{ji2022video}. We use the PVT-CASCADE network and run each model for 30 epochs. Our PRIME prunes 69.7\% of the data with an SSIM similarity threshold of 0.7, while PCC thresholds are adjusted to achieve similar pruning rates. We report the average DICE score (\%) over three runs.}
    \label{tab:sunseg_results}
    \vspace{-0.2cm}
    \begin{adjustbox}{width=0.9\textwidth}
    \begin{tabular}{l|r|r|r|r}
            \toprule
        \textbf{Pruning Method} & \textbf{Easy Seen (\%)} & \textbf{Easy Unseen (\%)} & \textbf{Hard Seen (\%)} & \textbf{Hard Unseen (\%)} \\
        \midrule
        Full Dataset   & 92.47  & 80.65  & 87.77  & 80.80  \\
        PRIME w/ SSIM (\textbf{Ours})   & 91.55  & 80.06  & 86.95  & 80.51  \\
        PRIME w/ PCC (\textbf{Ours})    & 91.12  & 79.43  & 86.64  & 79.86  \\
        Random Pruning & 90.23  & 77.86  & 85.34  & 77.67  \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \vspace{-0.2cm}
\end{table}

\noindent\textbf{Scalability on large video polyp segmentation dataset:}
To demonstrate the scalability of our PRIME method, we evaluate it on SUN-SEG (Table \ref{tab:sunseg_results}), a large-scale video polyp segmentation dataset with 19,544 training images, significantly larger than standard polyp datasets. Handling such large datasets is computationally expensive and increases annotation costs, making dataset pruning crucial for practical deployment. Our method effectively reduces dataset size by 69.7\% while maintaining segmentation performance comparable to training on the full dataset. PRIME (SSIM) achieves less than a 1\% drop in DICE scores on seen test sets, whereas random pruning results in up to a 3.13\% performance drop, demonstrating that our structured community-based selection strategy retains essential information while significantly reducing data redundancy. In addition, PRIME reduces storage and computational costs, making large-scale dataset management more efficient without sacrificing segmentation quality. These results confirm that PRIME is a scalable, cost-effective solution for large medical imaging datasets.

\begin{table}[t]
    \centering
    \caption{Effects of augmentation on training PVT-CASCADE model using the full dataset, our PRIME, and random pruning. We apply random rotation and flips as augmentations. We report the average DICE score (\%) over three runs.}
    \label{tab:augmentation_effect}
    \vspace{-0.2cm}
    \begin{adjustbox}{width=0.9\textwidth}
    \begin{tabular}{llccc}
        \toprule
        \textbf{Dataset} & \textbf{Training Data} & \textbf{No Augmentation} & \textbf{With Augmentation} & \textbf{Improvement} \\
        \midrule
        \multirow{3}{*}{CVC-ClinicDB} & Full Dataset   & 94.29\% & 94.63\% & +0.34\% \\
        & PRIME w/ SSIM (Ours) & 92.85\% & 93.72\% & +0.87\% \\
        & Random Pruned  & 89.94\% & 90.76\% & +0.82\% \\
        \midrule
        \multirow{3}{*}{SUN-SEG}      & Full Dataset   & 92.47\% & 92.91\% & +0.44\% \\
        & PRIME w/ SSIM (Ours) & 91.55\% & 92.57\% & +1.02\% \\
        & Random Pruned  & 90.23\% & 90.98\% & +0.75\% \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \vspace{-0.5cm}
\end{table}

\noindent\textbf{Generalizability on unseen testset:}
To assess the generalizability of our PRIME method, we evaluate segmentation performance on the unseen testsets from SUN-SEG (Table \ref{tab:sunseg_results}). Despite pruning 69.7\% of the dataset, PRIME (SSIM) maintains segmentation DICE scores within 0.6\% of the full dataset, demonstrating that our method preserves diverse and informative samples essential for robust model learning. In contrast, random pruning leads to a performance drop of up to 3.13\%, indicating that naive selection disrupts essential feature distribution, negatively affecting generalization. These results confirm that PRIME effectively balances dataset reduction with performance retention, thus ensuring strong generalization to unseen cases while significantly reducing annotation cost.

\noindent\textbf{Effect of augmentation during training:} As shown in Table \ref{tab:augmentation_effect}, augmentation has minimal impact ($<$0.5\%) when using the full dataset. However, for PRIME-pruned datasets, augmentation improves the DICE scores by 0.87--1.02\%, suggesting that augmentations compensate for reduced dataset size.

\vspace{-0.2cm}
\section{Conclusion and Future Work}
\label{sec:conclusion}
In this paper, we have introduced \textit{PRIME}, a training-free dataset pruning method to minimize image annotation (labeling) efforts and enable efficient training of segmentation models. Experiments on multiple medical image segmentation datasets show its potential to maintain high DICE scores while reducing computational and data annotation costs. 

Future work will focus on developing more advanced metrics with critical shape information, a deeper analysis of similarity networks, and expanding experiments to diverse datasets (including 3D segmentation). Our \textit{PRIME} holds promise in accelerating research in continual learning, active learning, contrastive learning, and few-shot learning by enhancing data efficiency in resource-intensive applications.

%% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work is supported in part by the NSF grant CNS 2007284, in part by the iMAGiNE Consortium (\url{https://imagine.utexas.edu/}), and in part by the Texas Health Catalyst award.}

\bibliography{midl25_137}


\newpage

\appendix
%This is a boring technical proof of
%\begin{equation}\label{eq:example}
%\cos^2\theta + \sin^2\theta \equiv 1.
%\end{equation}

%\section{Proof of Theorem 2}

%This is a complete version of a proof sketched in the main text.

\section{Datasets Details}
CVC-ClinicDB \cite{bernal2015wm} contains 612 images, which are extracted from 31 colonoscopy videos. Kvasir includes 1,000 polyp images, which are collected from the polyp class in the Kvasir-SEG dataset
\cite{jha2020kvasir}. Following the settings in CASCADE \cite{rahman2023medical}, we adopt the
same 550 and 900 images from CVC-ClinicDB and Kvasir datasets as the training set, and the remaining 62 and 100 images, respectively, are used as testsets.

The SUN-SEG dataset \cite{ji2022video, misawa2021development} is a large-scale video polyp segmentation benchmark consisting of 49,136 polyp frames and 109,554 non-polyp frames, with 19,544 training images and 29,592 test images. It provides diverse annotations, including pixel-wise masks, boundaries, scribbles, and polygons, supporting both fully and weakly supervised learning. The test set is further categorized into easy and hard cases, with 4,719 easy seen, 12,351 easy unseen, 3,882 hard seen, and 8,640 hard unseen images, ensuring a rigorous evaluation of model generalization. As one of the most well-annotated and high-quality polyp segmentation datasets, SUN-SEG serves as a critical benchmark for real-world endoscopic applications.

\section{Detailed Analysis of Similarity Network Properties and Community Detection}
\label{asec:network_analysis}

The structural properties of the constructed similarity network (900 nodes, 8922 edges) in Fig. \ref{fig:network_with_communities_kvasir} provide compelling evidence for the utility of community detection in training-free dataset pruning. Below, we analyze key metrics and their implications:

\begin{figure}[h]
\begin{center}
\includegraphics[width=1\linewidth]{images/kvasir_network_900nodes_8922edges_189clusters0.512mo.png}
\end{center}
\vspace{-0.5cm}
\caption{Communities identified with Structural Similarity Index (SSIM) threshold of 0.80 in the Kvasir dataset \cite{jha2020kvasir}. Gephi’s community detection \cite{blondel2008fast} identifies 189 communities with a modularity of 0.512. The degree distribution shows the number of nodes with varying connectivity (0 to $>$175, left-skewed), while the community distribution highlights the number of nodes per community (0 to $>$175). By considering top $\lceil 10\% \rceil$ nodes from each community, we select only 253 (28.1\%) representative nodes and prune the rest of the nodes (71.9\%).}
\label{fig:network_with_communities_kvasir}
\vspace{-0.2cm}
\end{figure}

\subsection{Network Connectivity and Cohesion}

\begin{itemize}
  \item \textbf{Average Degree (19.827):} Each node is connected to $\sim$20 others, on average, indicating robust pairwise similarity relationships. This density ensures that communities are well-anchored by hubs (high-degree nodes) while retaining fine-grained connections between niche samples.
  
  \item \textbf{Graph Density (0.022):} The sparsity of the network (only 2.2\% of possible edges exist) reflects a carefully calibrated similarity threshold, filtering out weak or noisy relationships. This sparsity enhances the discriminative power of detected communities, as retained edges likely correspond to semantically meaningful similarities.
\end{itemize}
\subsection{Small-World Characteristics}
\begin{itemize}
  \item \textbf{Average Path Length (3.348):} The short average distance between nodes (3.348 hops) suggests a small-world topology, where tightly knit communities are interconnected by a few bridging nodes \cite{watts1998collective}. This property enables efficient traversal of the network during pruning, thus ensuring that representative samples can be selected without exhaustive search.
  
  \item \textbf{Network Diameter (10):} The longest shortest path spans 10 edges, indicating that even the most dissimilar images are relatively proximate in the feature space. This compactness supports the hypothesis that the dataset contains latent hierarchical structures resolvable via community detection.
\end{itemize}
\subsection{Community Detection Efficacy}

\begin{itemize}
  \item \textbf{Modularity (0.512):} A modularity score $>$0.3 confirms the statistically significant community structure. The modularity value of 0.512 indicates strong separation between groups, where intra-community edges significantly outnumber inter-community edges. This ensures that the detected clusters are cohesive and distinct, aligning with visually or pathologically meaningful subgroups.
  
  \item \textbf{Connected Components (184 $\rightarrow$ 189 Communities):} The network initially contains 184 connected components, but community detection resolves 189 clusters, demonstrating that Gephi's algorithm \cite{blondel2008fast} successfully identifies substructures within connected components. This granularity is critical for capturing fine-grained variations (e.g., polyp subtypes or imaging artifacts).
\end{itemize}

\subsection{Local Clustering and Redundancy Reduction}
\begin{itemize}
  \item \textbf{Average Clustering Coefficient (0.597):} The high clustering coefficient indicates that the nodes tend to form tightly connected triads, a hallmark of homophilic networks where similar nodes cluster together \cite{watts1998collective}. This property ensures communities are internally homogeneous, reducing redundancy and enabling the selection of representative samples without oversampling.
  
  \item \textbf{Hub-Driven Cohesion:} Hubs (high-degree nodes) act as central coordinators, linking disparate regions of the network. By prioritizing hubs during pruning, our method retains images that anchor multiple communities, thus preserving the global dataset structure while minimizing information loss.
\end{itemize}


\subsection{Implications for Training-Free Dataset Pruning}
\label{subsec:implications}

The network’s small-world compactness (short average path of 3.348), scale-free topology (evidenced by hubs with degree $>$100 and average degree 19.827), and strong modularity (0.512) collectively validate community detection as a principled framework for dataset pruning. By leveraging these properties, our \textit{PRIME} achieves:
\begin{itemize}
  \item \textit{Efficiency:} Short paths and hierarchical communities reduce computational overhead.
  \item \textit{Representativeness:} Cohesive clusters preserve clinical diversity.
  \item \textit{Interpretability:} Communities align with domain-specific patterns (e.g., pathology, imaging protocols).
\end{itemize}
Our method is particularly advantageous for polyp segmentation, where resource limitations (e.g., expert humans available to do annotation or computing resources to run intense computations) demand strategies that balance performance, efficiency, and clinical relevance.

\begin{figure}[h]
\begin{center}
\includegraphics[width=0.95\linewidth]{images/reatained_pruned_samples_clinicdb_ccth0.85_images474.png}
\end{center}
\vspace{-0.4cm}
\caption{Our PRIME retained and pruned samples from different communities in CVC-ClinicDB dataset (pruning around 15\% highly similar images). Although more than one image can be retained or pruned depending on the community size, here we report pairs of retained and pruned samples from the same community.}
\label{fig:retained_pruned_smaples_visualization}
\vspace{-0.2cm}
\end{figure}

\begin{table}[h]
\centering
\caption{Experimental results (DICE \%) on the Kvasir dataset \cite{jha2020kvasir} using PVT-CASCADE and ResNet50 (R50)-CASCADE. $\tau$ is a similarity threshold that controls the density (i.e., pruning rate) of the similarity network. We select the top $\lceil10\%\rceil$ images from each community, ranked by node degree, in SSIM-based similarity networks.}
\vspace{-0.2cm}
\begin{adjustbox}{width=1\textwidth}
\begin{tabular}{lccccc}
\hline
\multicolumn{1}{l}{\textbf{Architectures}} & \multicolumn{1}{c}{\textbf{Methods/($\tau$, pruning \%)}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.83, \\56.2\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.85,\\41.8\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.88,\\19.1\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=1, \\0\% pruning)\end{tabular}} \\
\hline
PVT-CASCADE & Entropy \cite{shannon1948mathematical} & 90.93$\pm$1.2 & 91.25$\pm$0.9 & 91.34$\pm$0.6 & \multirow{4}{*}{93.17$\pm$0.4} \\
PVT-CASCADE & MC-dropout \cite{gal2016dropout} & 90.81$\pm$1.3 & 91.46$\pm$0.9 & 91.75$\pm$0.7 & \\ 
PVT-CASCADE & KnowWhatToLabel \cite{dawoud2023knowing} & 91.57$\pm$1.3 & 91.75$\pm$0.7 & 91.98$\pm$0.7 &  \\
PVT-CASCADE & Random & 90.69$\pm$2.8 & 91.21$\pm$1.9 & 91.52$\pm$1.2 & \\
PVT-CASCADE & \textbf{PRIME (Ours)} & \textbf{92.67$\pm$1.1} & \textbf{93.01$\pm$0.6} & \textbf{93.22$\pm$0.5} & \\ \hline
R50-CASCADE & Entropy \cite{shannon1948mathematical}  & 90.58$\pm$1.4 & 91.06$\pm$1.0 & 91.22$\pm$0.7 &  \multirow{4}{*}{93.02$\pm$0.5} \\             
R50-CASCADE & MC-dropout \cite{gal2016dropout} & 90.63$\pm$1.2 & 91.41$\pm$0.9 & 91.62$\pm$0.6 & \\     R50-CASCADE & KnowWhatToLabel \cite{dawoud2023knowing} & 91.41$\pm$1.4 & 91.62$\pm$0.8 & 91.84$\pm$0.7 & \\             
R50-CASCADE & Random & 90.53$\pm$2.9 & 91.12$\pm$1.7 & 91.45$\pm$1.5 & \\             
R50-CASCADE & \textbf{PRIME (Ours)} & \textbf{92.01$\pm$1.3} & \textbf{92.73$\pm$0.9} & \textbf{92.96$\pm$0.6} &  \\ \hline
\end{tabular}
\end{adjustbox}
\label{tab:kvasir_results}
\vspace{-0.4cm}
\end{table}

\section{Additional Experiments}

\subsection{Visualization of retained and pruned samples from different communities}
To further illustrate how our PRIME pruning method retains diverse and representative samples while removing redundant ones, we present a qualitative analysis in Figure \ref{fig:retained_pruned_smaples_visualization}. This figure presents examples of retained and pruned images from different communities in the CVC-ClinicDB dataset, where approximately 15\% of highly similar images were removed.

Our pruning strategy ensures that visually similar, yet redundant, images are removed while preserving the most informative and diverse samples within each community. As seen in Figure \ref{fig:retained_pruned_smaples_visualization}, the retained images effectively represent polyp characteristics, mucosal textures, and variations in lighting, while the pruned images contain nearly identical structural features with minimal additional information. By eliminating these redundant samples, our method reduces annotation costs without compromising dataset diversity, leading to improved segmentation performance.

Although multiple images can be retained or pruned depending on the community size, for clarity, we present pairs of retained and pruned samples from the same community. This visualization reinforces that PRIME does not randomly discard images, but rather strategically selects a subset that maximally preserves diversity while minimizing redundancy.


\subsection{Results on the Kvasir dataset}
\label{assec:kvasir_results}
Table \ref{tab:kvasir_results} shows the efficacy of \textit{PRIME} in balancing dataset pruning and segmentation performance. While existing methods (e.g., entropy-based selection, MC-dropout) exhibit performance degradation under aggressive pruning (e.g., 56.2\% reduction at $\tau=0.83$), \textit{PRIME} consistently outperforms baselines, achieving higher DICE scores with narrower standard deviations (e.g., \textbf{92.67\% vs.~90.93\%} for PVT-CASCADE at $\tau=0.83$).

Notably, at $\tau=0.88$ (19.1\% pruning), \textit{PRIME} matches the full-dataset baseline (93.22\% vs.~93.17\%), highlighting its ability to retain critical samples through community detection in similarity networks. This contrasts with random pruning, which suffers from high variance (e.g., $\pm2.8$ at $\tau=0.83$), emphasizing the non-triviality of sample selection.

Our PRIME’s robustness across architectures (PVT vs.~ResNet50) further validates its generalizability, though PVT’s superior performance suggests architectural advantages in capturing polyp features. These findings position \textit{PRIME} as a computationally efficient alternative for resource-constrained medical imaging tasks.

\subsection{Results on the CVC-ClinicDB dataset with the EMCAD decoder}
\label{assec:emcad_clinicdb}

Table \ref{tab:cvc_clinicdb_emcad} shows that \textit{PRIME} effectively optimizes data efficiency for polyp segmentation using the recent EMCAD decoder \cite{Rahman_2024_CVPR} as well, particularly in data-rich regimes. Under aggressive pruning ($\tau=0.92$, 55.5\%), \textit{PRIME} achieves \textbf{92.87\% DICE} for PVT-EMCAD, outperforming entropy-based selection by \textbf{+3.14\%} and random pruning by \textbf{+3.02\%}, with substantially lower variability ($\pm1.5$ vs.~$\pm3.6$ for random). This underscores its ability to retain diagnostically critical samples even with significant dataset reductions. As pruning relaxes ($\tau=0.97$, 15.1\%), \textit{PRIME}’s performance nears the full-dataset baseline (94.61\% vs.~94.65\% for PVT-EMCAD), suggesting diminishing returns for retaining additional data. Notably, PVT-EMCAD consistently surpasses ResNet50-EMCAD (e.g., \textbf{94.61\% vs.~94.15\%} at $\tau=0.97$), likely due to its hierarchical attention mechanisms better capturing polyp boundaries. These insights position \textit{PRIME} as an architecture-agnostic dataset pruning method.

\begin{table}[h]
\centering
\caption{Experimental results (DICE \%) on the CVC-ClinicDB dataset using PVT-EMCAD and ResNet50 (R50)-EMCAD. $\tau$ controls the similarity threshold and pruning rate. SSIM similarity achieves better DICE scores than PCC at similar pruning rates.}
\vspace{-0.2cm}
\begin{adjustbox}{width=1\textwidth}
\begin{tabular}{lccccc}
\hline
\multicolumn{1}{l}{\textbf{Architectures}} & \multicolumn{1}{c}{\textbf{Methods/($\tau$, pruning \%)}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.92, \\55.5\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.95,\\31.3\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=0.97,\\15.1\%) \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}($\tau$=1, \\0\% pruning)\end{tabular}} \\
\hline
PVT-EMCAD & Entropy \cite{shannon1948mathematical} & 89.73$\pm$1.7 & 91.83$\pm$1.4 & 92.85$\pm$1.2 & \multirow{4}{*}{94.65$\pm$0.6} \\
PVT-EMCAD & MC-dropout \cite{gal2016dropout} & 90.15$\pm$1.6 & 92.31$\pm$1.3 & 93.05$\pm$1.1 & \\ 
PVT-EMCAD & KnowWhatToLabel \cite{dawoud2023knowing} & 90.36$\pm$1.7 & 92.81$\pm$1.1 & 93.14$\pm$1.0 & \\
PVT-EMCAD & Random & 89.85$\pm$3.6 & 91.87$\pm$2.4 & 92.43$\pm$1.7 & \\
PVT-EMCAD & \textbf{PRIME w/ PCC (Ours)} & \textbf{92.58$\pm$1.4} & \textbf{94.23$\pm$0.9} & \textbf{94.34$\pm$0.7} & \\ 
PVT-EMCAD & \textbf{PRIME w/ SSIM (Ours)} & \textbf{92.87$\pm$1.5} & \textbf{94.47$\pm$1.0} & \textbf{94.61$\pm$0.8} & \\ \hline
R50-EMCAD & Entropy \cite{shannon1948mathematical} & 89.31$\pm$1.9 & 91.51$\pm$1.5 & 92.25$\pm$1.3 & \multirow{4}{*}{94.26$\pm$0.5} \\             
R50-EMCAD & MC-dropout \cite{gal2016dropout} & 89.90$\pm$1.7 & 92.01$\pm$1.2 & 92.49$\pm$1.1 & \\    
R50-EMCAD & KnowWhatToLabel \cite{dawoud2023knowing} & 90.14$\pm$1.7 & 92.39$\pm$1.1 & 92.62$\pm$1.0 & \\  
R50-EMCAD & Random & 89.46$\pm$3.5 & 91.72$\pm$2.3 & 91.97$\pm$1.8 & \\             
R50-EMCAD & \textbf{PRIME w/ PCC (Ours)} & \textbf{92.25$\pm$1.5} & \textbf{93.72$\pm$0.9} & \textbf{94.02$\pm$0.6} & \\ 
R50-EMCAD & \textbf{PRIME w/ SSIM (Ours)} & \textbf{92.41$\pm$1.4} & \textbf{93.91$\pm$1.0} & \textbf{94.15$\pm$0.7} & \\ \hline
\end{tabular}
\end{adjustbox}
\label{tab:cvc_clinicdb_emcad}
\vspace{-0.4cm}
\end{table}


\section{Computational Complexity of PRIME}

\begin{table}[t]
    \centering
    
    \caption{Computational time for similarity matrix construction across datasets of increasing size. Here, we report the time (s) needed for preprocessing (data loading, grayscale conversion, and resizing to $352 \times 352$) and for computing the $N(N-1)/2$ pair-wise similarity metrics ($N$ is the number of images in a dataset) on NVIDIA RTX A6000 (Ada) GPUs with 48GB memory.}
    \label{tab:comp_time}
    \vspace{-0.2cm}
    \begin{adjustbox}{width=0.85\textwidth}
    \begin{tabular}{l|r|r|r}
        \toprule
        \textbf{Dataset} & \textbf{SSIM (1 GPU) (s)} & \textbf{SSIM (8 GPUs) (s)} & \textbf{PCC (1 GPU) (s)} \\
        \midrule
        CVC-ClinicDB (550) & 73.17 & 37 & 8.01 \\
        Kvasir (900)       & 190.55 & 72 & 15.80 \\
        SUN-SEG (19,544)   & 89,852.13 ($\sim$25 hrs) & 10,920 ($\sim$3 hrs) & 563.85 \\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \vspace{-0.1cm}
\end{table}

\subsection{Theoretical Computational Complexity Analysis}

The computational complexity of our PRIME is primarily driven by similarity matrix construction, which involves computing pairwise similarities for a dataset of \(N\) images. This step requires \(N(N-1)/2\) comparisons, leading to an overall complexity of \(O(N^2HW)\), where \(H \times W\) represents image dimensions. Given the quadratic scaling, computing the similarity matrix for large datasets is computationally demanding, but highly parallelizable, thus allowing multi-GPU acceleration to make it feasible even at large scales.

The Louvain community detection algorithm, used to select representative samples from the similarity graph, is significantly more efficient than similarity computation, operating in \(O(N \log N)\) for sparse graphs. Since our similarity networks are inherently sparse, the computational overhead of this step remains minimal even for large datasets. Additionally, as dataset pruning is a one-time preprocessing step, its cost is offset by the substantial savings in storage and annotation efforts, making PRIME a scalable and efficient solution for large-scale medical image curation.

\subsection{Empirical Computational Efficiency}

To validate the scalability of PRIME, we measure the similarity matrix computation time on datasets of increasing sizes. Table~\ref{tab:comp_time} summarizes our findings, comparing the computational costs of SSIM on single and multi-GPU setups and PCC on a single GPU.

Table~\ref{tab:comp_time} demonstrates that multi-GPU acceleration significantly reduces computational overhead, enabling SSIM-based similarity computation for SUN-SEG (19,544 images) in $\sim$3 hours on 8 GPUs compared to $\sim$25 hours on a single GPU. In addition, the PCC similarity matrix can be computed within only 563.85 sec (562 sec preprocessing and 1.85 sec similarity matrix computation) on a single GPU. This ensures that dataset pruning remains computationally feasible even at large scales, making PRIME a practical solution to reduce annotation costs and storage burdens in medical imaging.

While similarity computation follows quadratic scaling, our results confirm that multi-GPU parallelization enables PRIME to efficiently process large datasets. Future optimizations, such as batch comparisons, could further extend the scalability to datasets containing millions of images, strengthening PRIME’s role as an effective cost-saving and storage-efficient solution for large-scale medical image curation.

\begin{table}[t] 
\centering 
\caption{Comparison of PRIME with fixed-iteration random sampling.} \label{tab:prime_vs_random} 
\vspace{-0.2cm}
\begin{adjustbox}{width=1\textwidth}
\begin{tabular}{l|c|c} 
\toprule 
\textbf{Performance Metric} & \textbf{PRIME (Pruned Training)} & \textbf{Fixed-Iteration Random Sampling} \\ 
\midrule 
\textbf{Storage Cost} & Reduced (Only pruned dataset stored) & Full dataset stored \\ 
\textbf{Annotation Cost} & Reduced (Only pruned subset labeled) &  Full dataset annotated \\ 
\textbf{Training Speed} & Comparable (Reduced dataset) & Comparable (but full dataset overhead) \\ \textbf{DICE Score} & Minimal drop ($<$0.5\%) & Minimal to no drop \\ 
\bottomrule 
\end{tabular} 
\end{adjustbox}
\vspace{-0.2cm}
\end{table}

\section{PRIME vs. Fixed-Iteration Random Sampling}
Random sampling with a fixed number of iterations per epoch can achieve training speed comparable to training on our PRIME-pruned dataset. However, our primary goal is not just faster training, but also reducing annotation and storage costs, with training efficiency being a byproduct of dataset pruning.

Unlike random sampling, which selects a subset of images per iteration, but still requires storing and accessing the entire dataset, PRIME systematically prunes redundant images while preserving dataset diversity, thus reducing annotation effort without compromising segmentation performance. This distinction is crucial in medical imaging, where expert annotation is costly. Even with fixed-iteration random sampling, the need to maintain and retrieve the full dataset creates unnecessary storage and computational overhead, whereas PRIME significantly reduces both. Table \ref{tab:prime_vs_random} provides a structured comparison between PRIME and fixed-iteration random sampling.

\section{Current Limitations and Future Work}
Our dataset pruning method balances data reduction with high segmentation accuracy across models, datasets, and similarity metrics. Additionally, as a training-free method, our pruning method minimizes the annotation costs and effort by effectively identifying representative samples, which is particularly advantageous in medical imaging where data annotation (labeling) is time-intensive and costly. 

However, the DICE scores decline at extreme pruning levels, indicating a potential loss of critical data diversity. Furthermore, the method requires similarity threshold tuning, which varies by dataset. Future work will focus on exploring more advanced metrics (such as shape-aware and boundary-sensitive similarity measures), extending our method to other medical imaging tasks, and incorporating adaptive similarity thresholding to enhance robustness and generalizability across diverse medical imaging scenarios.


\end{document}
