\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{adjustbox}
\usepackage{graphicx}
% \usepackage{showframe} 

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}

\editors{Accepted for publication at MIDL 2026}

\title[Testicular Tubule Representation Learning]{Learning Structure-Aware Foundational Representation of Rat Testicular Tubules Using Multiple Instance Learning}

% iBOT-MIL: Self-Supervised Masked Instance Modelling for Robust Classification of Testicular Pathologies
 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Vedang Kshirsagar\midljointauthortext{Contributed equally}}\Email{vedang.kshirsagar@airamatrix.com}\\
\Name{Saketh Juturu\midlotherjointauthor} \Email{saketh.juturu@airamatrix.com}\\
\Name{Geetank Raipuria} \Email{geetank.raipuria@airamatrix.com}\\
\Name{Nitin Singhal} \Email{nitin.singhal@airamatrix.com}\\
\addr Mumbai, India
}

\begin{document}

\maketitle

\begin{abstract}
Testicular toxicity is a critical factor in preclinical drug safety assessment, yet automated modelling of testicular abnormalities remains largely unexplored. 
Unlike liver or kidney tissue, the testis tissue is organized into tubules that vary substantially in size and structure, making fixed-resolution patch classification ineffective. We first demonstrate that resizing tubules significantly degrades performance, particularly for larger tubules, and that a Multiple Instance Learning (MIL) model offers substantial improvements. Building on this, we introduce TBA-MIL, a transformer-based aggregation model with learnable positional embeddings that encodes the structure of tubules and is pre-trained using a self-supervised Masked Instance Modelling (MIM-MIL) framework, learning tubule representations from large-scale unlabeled data. Across four tubule types, TBA-MIL with MIM-MIL outperforms state-of-the-art MIL models and establishes a strong baseline for automated testicular toxicity assessment. Additionally, we evaluate the proposed framework on an independent toxicological study and show that the predicted abnormality distributions significantly differentiate control and treated animal tissues, consistent with expert pathologists' assessment.%Additionally, we evaluate the effectiveness of the proposed method in distinguishing abnormalities between control and treated animal tissues within a toxicological study, demonstrating a statistically significant difference between the two dosage groups.
\end{abstract}
% The accurate classification of testicular histological abnormalities—such as Sertoli Cell Only (SCO) syndrome, maturation arrest, and hypospermatogenesis—is essential for assessing male infertility and reproductive toxicity. However, seminiferous tubules present a unique computer vision challenge: they exhibit drastic morphological heterogeneity, varying significantly in size and shape due to pathological atrophy or tangential sectioning. This variability renders standard fixed-size deep learning inputs ineffective, as resizing introduces distortion and cropping loses context. Consequently, a Multiple Instance Learning (MIL) paradigm is necessary to treat each tubule as a flexible "bag" of constituent patches, preserving global structure regardless of dimensions.

% We introduce iBOT-MIL, a self-supervised framework that adapts Masked Image Modelling to this MIL setting. By employing a Student-Teacher distillation architecture, iBOT-MIL learns to predict the semantic identity of masked patches based on their intra-tubular neighbors. This objective forces the model to internalize the rules of cellular co-occurrence (e.g., determining if a tubule lacks germ cells) without requiring slide-level labels. Experiments demonstrate that iBOT-MIL effectively handles variable-sized inputs and learns robust, pathology-discriminative representations, significantly outperforming generic foundation models in classifying complex tubular abnormalities.
% \end{abstract}

\begin{keywords}
Histopathology, Toxicologic Pathology, Testicular Toxicity, Multiple Instance Learning, Foundation Models, Self-Supervised Learning, Masked Image Modelling
\end{keywords}

\section{Introduction}

% introduce drug safety assessment and testicular toxicty
Histopathology-based identification of drug-induced tissue injury is a critical step in the preclinical safety assessment of a potential therapeutic agent. A pathologist identifies, characterizes, and grades any observed microscopic changes
on tissue samples from dosed animals. The process provides the tissue morphological data that helps the toxicologic pathologist understand and predict a drug's potential toxic effects. The assessment allows potential risks to be translated to the human clinical setting and informs appropriate dosing strategies and monitoring requirements.



% introduce Testicular Toxicity
With advancements in digital pathology and Whole Slide Images (WSI), deep learning models have shown promising results for identifying a wide variety of tissue injuries \cite{zingman2024learning, jaume2024deep, juturu2025unsupervised, zehnder2022multiscale, linmans2024diffusion, pocevivciute2025out}, assisting pathologists in determining drug-induced toxicity. However, most of these works have focused primarily on liver (hepatic) or kidney (renal) toxicity, as these are the major metabolic and excretory organs. While liver and kidney failure are immediately life-threatening, testicular toxicity is critical for reasons that are often program-ending, including the unique and irreplaceable function of the organ, the low tolerance for risk in non-life-saving drugs, and the secondary impact on hormone production. This makes testicular toxicity findings a decisive factor in terminating a drug program early, particularly when the drug is intended for a non-life-threatening or chronic condition in a broad patient population.



% whats different about testes modelling
Models for liver and kidney injury detection have been developed on tissue patches extracted from WSIs, as these tissue injuries can be identified without requiring larger tissue context. However, in the case of testicular tissue, the organ consists of tubules as sub-structures, which can reflect varying degrees of toxicity. Figure~\ref{fig:figure_1} provides samples of normal and drug-affected abnormal tubules. Creating patches from the tissue does not provide sufficient context to identify the injury. Furthermore, tubules in a WSI vary significantly in size and aspect ratio, as seen in Figures~\ref{fig:figure_1} and~\ref{fig:hist_tubule_size}. This makes modelling tubules for identification of drug-induced toxicity non-trivial.

\begin{figure}[htbp!]
        \centering
        \includegraphics[width=0.75\textwidth]{images/figure1_testes.drawio_compressed.png}
        \caption{Sample images of testicular tubules, resized proportionally to maintain relative size. a-b) Normal tubule (No injuries, intact) c) Degeneration Tubular d) Degeneration Germ Cell e) Dilation Tubular }
        \label{fig:figure_1}
    \end{figure}


In this work, we first investigate the performance of state-of-the-art convolution- and transformer-based feature extractors for classifying testicular tubules into normal and injury classes. For this, the tubules are resized to a fixed size before being passed to the model. We evaluate this approach extensively by ablating over different input image sizes. The analysis shows that resizing tubule images significantly degrades performance on large tubules.
Next, we model a tubule as a bag of patch instances, i.e., using Multiple Instance Learning (MIL): features are extracted from non-overlapping patches of a tubule and then aggregated. The analysis shows that the MIL model outperforms the resized image-based classifier for all tubule sizes.

Based on this evidence, we propose a MIL model based on the transformer architecture that incorporates positional encoding to retain the relative position of the patch instances, uses a foundational feature extractor trained on testes data, and, importantly, is itself pre-trained using a self-supervised loss on large-scale tubular data to learn tubular representations. We compare the proposed framework with state-of-the-art MIL models and show that it outperforms them on tubular injury classification. We also perform an ablation study to show the significance of pre-training and positional embedding in the MIL model. Figure~\ref{fig:main} provides an overview of the proposed representation learning for testicular tubules.

To the best of our knowledge, this work is the first to extensively evaluate testicular injury classification in Wistar rats. Our main contributions are as follows:
\begin{enumerate}
    \item We demonstrate that modelling each tubule as a bag of features using Multiple Instance Learning (MIL) outperforms fixed-size image classification approaches.
    \item We benchmark state-of-the-art MIL models for tubule injury classification.
    \item We present a foundational self-supervised pretraining strategy for tubular representation learning - Masked Instance Modelling (MIM-MIL), using a new transformer-based aggregation MIL model (TBA-MIL), that outperforms all MIL models.
    \item We evaluate the utility of the tubule injury detection using the proposed approach on a toxicological study. 
\end{enumerate}


% propose a MIL model based on the transformer architecture that using 

% Further we, experiemnt with different MIL SOTA models. And introduce a new MIL model that out-performs the state-of-the-art. 

% Furthremore, we show that using pre-trianed a self-supervised feature extractor on testes data, further improves the results of MIL mode. Lastly, we propose a new , and show that the pre-training out-performs all models. 



\vspace{-2mm}
\section{Related Work}
\vspace{-1mm}
\subsection{Drug-Induced Injury Detection}
Recent works on drug-induced injury detection in digital histopathology emphasize domain-specific representation learning and out-of-distribution (OOD) modelling to address the scarcity and heterogeneity of lesion annotations in toxicology pipelines. \cite{zingman2024learning} and \cite{dippel2024ai} employ a supervised patch classification task to train a feature extractor, and then detect patches that are anomalous with respect to the normal tissue representation. Other works leverage self-supervised foundational models to extract features for supervised injury classification \cite{jaume2024deep} and for unsupervised anomaly detection \cite{juturu2025unsupervised} using neighbourhood density (K-Nearest Neighbours) in the latent space of a foundation model.

Another direction of work uses generative models to learn the normal tissue representation via a reconstruction task, and identifies anomalous tissue injury as tissue with high reconstruction error. \cite{zehnder2022multiscale} trained a generative adversarial network (GAN) with multi-scale patches as input to enhance regional interpretation, whereas \cite{linmans2024diffusion} trained a denoising diffusion probabilistic model with a partial diffusion process to learn the in-distribution image space, and found it to outperform GAN-based models. Lastly, \cite{pocevivciute2025out} and \cite{juturu2025unsupervised} compared reconstruction-based approaches with methods based on the latent space of a foundation model. All of the above approaches work at the patch level, such that each patch is classified as anomalous or as a specific tissue injury type. In the case of testicular abnormalities, creating tiles loses the context of the tubular structure, making these approaches unfit for injury detection.

%  Both generative adversarial network \cite{} use 
%  regularized skip-connections to detect 

\vspace{-2mm}
\subsection{Multiple Instance Learning}
\vspace{-1mm}
Multiple-instance learning (MIL) has become the dominant computational modelling approach for weakly supervised histopathology, as it allows models to learn directly from slide-level labels of WSIs, without needing patch- or pixel-level labels that are expensive to obtain. A feature encoder is used to produce patch embeddings, which are then pooled to produce a bag score for the entire WSI. %\cite{ilse2018attention} presented a attention-based MIL that learns a self-attention weighted instance embeddings as representation for the WSI. 

The encoder choice strongly affects MIL performance; histology-tailored pretraining yields superior embeddings that are more sensitive to subtle morphological changes and less sensitive to stain variation \cite{wolflein2024good}. \cite{shaomultiple} and \cite{wolflein2024good} compared various publicly available foundational models for patch feature extraction and MIL aggregation architectures on diverse tasks, validating the utility and robustness of the MIL approach for WSI feature representation learning. Both works show that ABMIL \cite{ilse2018attention} outperforms other aggregation methods.

While effective, most widely used MIL aggregators operate on unordered sets of patch embeddings and summarize the bag into a single global representation, implicitly discarding spatial relationships and higher-order structural organization within the tissue. As a result, such approaches may be limited in their ability to capture biologically meaningful structures that depend on relative spatial arrangement.


\vspace{-2mm}
\subsection{Whole Slide Foundation Models}
\vspace{-1mm}
Numerous works have proposed slide-level pre-training to learn WSI representations, allowing transferability to weakly supervised regimes with limited data. These methods can be divided into two sub-types: unimodal \cite{chen2022scaling, lazard2023giga, xu2024whole, lenz2025unsupervised, shaomultiple} and multi-modal \cite{jaume2024multistain, shaikovski2024prism, wang2024pathology}. Early works \cite{chen2022scaling, lazard2023giga} demonstrated that MIL models trained with self-supervision can extract useful WSI representations for downstream tasks.
\cite{chen2022scaling} exploit the pyramid structure of WSIs to learn representations that capture both fine-grained morphology and coarse spatial context, through hierarchical training with a self-distillation loss. \cite{lazard2023giga} adapted the self-supervised contrastive loss to gigapixel images by creating multiple representations of the same WSI. \cite{xu2024whole} adopted a transformer with dilated attention \cite{ding2023longnet} as the slide encoder to handle the massive sequences of image patches from a WSI and generate contextualized embeddings.
%\cite{lenz2025unsupervised}integrate tile embeddings from multiple FMs, 
Other methods \cite{wang2024pathology, shaomultiple} employ supervised pre-training to learn aggregation layers that can be used as slide foundational models.


% \subsection{Self-Supervised Learning}
\vspace{-2mm}
\section{Dataset}
\vspace{-1mm}
The dataset used for this work consists of in-house Wistar rat testicular histopathology slides collected from pre-clinical toxicology studies. A total of 648 WSIs were available, of which 148 WSIs were used to create the supervised annotated dataset, whereas the remaining 500 were used for self-supervised training. Additionally, a toxicological study consisting of 8 Control and 16 Treated tissue WSIs is used to evaluate the effectiveness of the proposed testes tubule modelling framework.
The following subsections describe the dataset preparation in detail.

\subsection{{Tubule Data Preparation}}
This work models testicular tubules as the fundamental building blocks of the tissue instead of using equally sized patches. To enable this, we train a U-Net segmentation model for tubule detection, which is used to extract tubule structures from the WSIs at $10\times$ magnification. Appendix~\ref{sec:appendix-segmentation} provides details of the training setup and data used to train the tubule segmentation model. Each extracted tubule is oriented along the diagonal; that is, we consider each tubule as an ellipse and align its major axis with the diagonal of an imaginary box. This ensures geometric consistency. The region around the tubule is padded with a white background using a pixel value of 250, which is similar to the background pixel values on the slide.

% We use a segmentation mask algorithm to generate tubule masks (BWL masks) for each WSI. Each tubule is mapped to an unique integer as represented in the mask which is used to further extract and dump these tubules at 10X magnification. We then orient each tubule at 45 degrees along their major axis. After this, we pad and make the background white using pixel value 250. These two steps are done so that geometric and background consistency is maintained throughout the analysis. 

\vspace{-2mm}
\subsection{Supervised Tubular Classification Data}
\vspace{-1mm}
The supervised dataset is a collection of tubules of varying sizes, sampled and annotated by a panel pathologist from 148 WSIs. It consists of a total of 10,880 tubules, assigned to one of four classes — Normal, Degeneration Germ Cell, Degeneration Tubular, and Dilation Tubular — the latter three being common injury types. The Normal class has 5,599 samples, Degeneration Germ Cell has 1,893, Degeneration Tubular has 1,584, and Dilation Tubular has 1,804. The WSIs are split approximately 70:30 for training and testing, with 100 WSIs used for training and 48 WSIs for testing. The training set is further split 75:25 into training and validation subsets, stratified by the number of samples in each class and by tubule size. All results are reported on the test dataset.

% Some additional information on our classes is as follows; Normal class, has tubules which have no prescence of any kind of injuries in them. On a tubular level we can see mainly 3 types of Complex Injuries. First is Germ Cell Degeneration which is mainly seen when the cells in the tubules start becoming Degenerate and Necrotic without disturbing much of the tissue structure. Second class is Tubular Degeneration which is when the entire tubule becomes injured. It ranges from tubules having mild disintegration to extreme disintegration. Finally we have Tubular Dilation which is an extreme type of injury wherein the tubules expand from the center outwards creating  a hollow and tissueless center. 


%The tubules are split 70:30 for training and testing, stratified by the number of samples in each class. We also ensure that the distribution of tubule height and width is consistent across the train and test sets. The training set is further split 75:25 into training and validation subsets. All the results are reported on test dataset.




% stratified by the number of samples in each class. We also ensure that the distribution of tubule height and width is consistent across the train and test sets. The training set is further split 75:25 into training and validation subsets. 


% sampled from these WSIs which belong to 4 classes. The 4 classes are Normal, Germ Cell Degeneration, Tubular Degeneration and Tubular Dilation. Apart from Normal class all the other classes are Not Normal wherein the tubule is affected by that kind of abnormality.  Normal class has 5599 samples, Germ Cell Degeneration has 1893 samples, Tubular Degeneration has 1584 samples and Tubular Dilation has 1804. These tubules are then split into 7543 train and 3337 test samples (approx 70-30 split). During training, we further split train samples into 5709 train and 1834 validation samples (apimplprox 75-25 split) whereas the test set is kept for blind testing. Splitting is carried out in two ways, first we split the Normal class samples in the aforementioned ratios startified on their Staging metadata, next we split the Not Normal classes' samples based on Stratification since they do not belong to any stage. We also ensure that the distribution of tubules based on their height and width is consistent across train and test samples. 

\vspace{-2mm}
\subsection{Unsupervised data}
\vspace{-1mm}
From the 648 WSIs, 500 are used to generate two datasets: \textbf{1) Unsupervised tubular dataset}, consisting of about 400,000 tubules including both normal and injury classes. This dataset is used to pre-train the MIL model using self-supervision to learn tubular representations; \textbf{2) Unsupervised patch dataset}, consisting of about 12 million tissue patches sampled at $5\times$, $10\times$, and $20\times$ magnification, used for training a foundational model for patch-level feature extraction.

% For training our SSL model we use a total of 60k tubules preprocessed in the same way mentioned in the data preparation section. We sample these tubules from WSIs from 4 labs. 

\vspace{-2mm}
\section{Classifier vs Multiple Instance Learning}
\vspace{-1mm}
\label{ssec:mil_better}
The initial modelling of testicular tubules is based on a naive approach of resizing all tubules to a fixed size, close to the median. The resized tubules are used for training and evaluating the model. Figure~\ref{fig:conv_vs_vit} provides results for ConvNextV2 \cite{woo2023convnext} and ViT \cite{dosovitskiy2020image} models, across different model capacities. ConvNext models perform better than ViT models, which can be attributed to the limited availability of training data. ConvNext-Tiny performs best, achieving 90.37\% balanced accuracy.

We train the best-performing model (ConvNextV2-Tiny) with different input image sizes (512, 768, and 1024) and bin the results into categories based on actual tubule size, as seen in Figure~\ref{fig:tubule_size_bin}. For example, a tubule of size greater than 1280 would be resized to 512 for the ConvNext-512 model, and so on. It can be observed that the performance of all models remains similar or degrades for large tubules. This may be due to the models' inability to capture larger contextual information. Transformer models, which are better able to build context over the entire image, are limited by the amount of training data.

\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.95\textwidth]{images/vit_vs_convNext.png}
        \caption{Performance of ConvNext~\cite{woo2023convnext} and ViT~\cite{dosovitskiy2020image} models for tubule injury classification.}
        \label{fig:conv_vs_vit}
\end{figure}



%We experiment with different image sizes for the best performing model , 

%To better understand the results, a further analyse of the test data was performed to identify the false predictions and acreatain is the resizing stratergy causes mis-prediction on tubules sized sifferently than median tubule size. Figure \ref{} provides the performance of the best performing model, binned by actual tubules size. It was can be observed that the performance on the small tubules is the highest for the model with smallest image size, as the input image size increase, this performance reduces, possibly due to upsampling adding blur artefact. On other hand, the performance on large sized tubules improves with increase in input image size. 

%Show evidence that classifier mis-classifies - some samples

\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.85\textwidth]{images/tubule_size_ablation.png}
        \caption{Analysis of the performance of different models on tubules binned by their size. All ConvNext models are ConvNext-Tiny, and the number represents the size of the input image used for training and evaluation.}
        \label{fig:tubule_size_bin}
\end{figure}

% ConvNext vs ViT (different model sizes, median image size)
% Best model with different image sizes (512, 768, 1024, 1536, 2048)
% UniV2 vs Virchow ()

%\subsection{Do we need Multiple Instance Learning?}
Since the tubule resizing approach gives poor performance on large tubules, a modelling approach that does not require resizing is needed. We hypothesize that each tubule can be modelled as a bag of patches, following the multiple instance learning paradigm \cite{ilse2018attention, shaomultiple}. For this, we use UNIV2 \cite{chen2024towards} as the feature extractor and ABMIL for feature aggregation, since it has been shown to outperform state-of-the-art models across multiple histopathology tasks \cite{wolflein2024good} and to transfer well across tasks \cite{shaomultiple}.
%Figure \ref{fig:} shows the results for different tubule sizes and overall balanced accuracy. 


As seen in Figure~\ref{fig:tubule_size_bin}, the MIL model significantly improves the performance for all tubule sizes. We believe that this is due to several advantages: 1) the MIL approach does not reduce the spatial dimension of the tubule images, but rather creates tiles which are passed through the feature extractor, followed by feature aggregation, which allows better attention to the finer morphological features of the tubule; 2) the aggregation layers use self-attention, which can better extract global context; and 3) better features are obtained from a feature extractor pre-trained on large-scale histopathology data.

% as it does not reduce the spatial dimension of the tubule images, rather creates tiles which are passed through a feature extactor which 

%Compare classifier (re-sized) vs MIL approach (no-resize)

% As seen in figure \ref{fig:tubule_size_bin}, MIL model significantly performance for the large tubles. MIL model does not reduce the spatial dimension of the tubule images, rather creates tiles which are passed through to featur extactor. This would allow better attention to finer morphological features of the tubule.  Interesting, the performance is also improved on smaller sized tubules. We attribute this gain to better features obatined from pre-trained extractor trained on large scale histopathology data (UNIV2) and attention aggregation. 

\vspace{-2mm}
\subsection{Implementation Details}
\vspace{-1mm}
\label{ssec:implementation}
% \begin{itemize}
All the supervised models were trained on \textbf{two NVIDIA A100 GPUs}; multiple GPUs were used for efficient distributed training and an increased effective batch size. We experiment with learning rates [1e-3, 1e-6] depending upon the training type and model, and report the best metrics accordingly. Due to the data imbalance between the Normal class and the other classes, we use a weighted cross-entropy loss for better performance. We use an effective batch size of 32 with gradient accumulation for all training runs, which allows training even for large image sizes. Mixed precision was utilized for all runs. We employ the {AdamW} optimizer with a standard cosine learning rate scheduler.
We use a mix of geometric (rotation, flipping) and color (brightness, contrast, HSV jitter, grayscaling) augmentations during the training phase. All results are reported as the average of three runs.
% \end{itemize}

\section{Methodology}
\label{sec:methodology}

% \subsection{Problem Formulation: The Necessity of Multiple Instance Learning (MIL)}
% \label{ssec:problem_formulation}
% The accurate classification of testicular pathology is challenged by the \textbf{geometric heterogeneity} of the seminiferous tubules. Tubules vary drastically in size and shape due to pathological conditions (e.g., atrophy) and tangential sectioning. This morphological variability renders fixed-size input deep learning architectures ineffective, as resizing distorts cellular details, and cropping loses global context.

\begin{figure}[htbp]
        \centering
        \includegraphics[width=1.0\textwidth]{images/MIDL 2026_compressed.png}
        \caption{Overview of the proposed framework - Representation Learning for Testicular Tubules. Each tubule is decomposed into a variable-length bag of patch instances of size $224 \times 224$ and encoded using a foundational ViT feature extractor (Testes-SSL). The resulting features are aggregated using the transformer-based TBA-MIL model with learnable positional embeddings that capture the underlying tubule structure. The aggregation layers are pretrained using Masked Instance Modelling (MIM-MIL), a self-supervised learning strategy that employs student-teacher distillation to learn contextual and morphology-aware representations from large-scale unlabeled tubular data.}
        \label{fig:main}
    \end{figure}


Based on the evidence obtained in Section~\ref{ssec:mil_better}, we adopt the {Multiple Instance Learning (MIL)} paradigm, defining a single tubule ROI as a variable-length bag $X = \{x_1, x_2, \ldots, x_N\}$, where $x_i$ is a constituent patch. Our objective is to learn a system that processes this dynamic collection of patches to capture the collective structural integrity necessary for accurate abnormality classification. Figure~\ref{fig:main} provides an overview of the proposed representation learning for testicular tubules.

\subsection{Transformer Based Aggregation: TBA-MIL}

\textbf{Feature Encoder Testes-SSL:} A {Vision Transformer (ViT-Small)}, trained using DINO \cite{caron2021emerging} self-supervised learning, extracts high-dimensional feature embeddings for each patch $x_i$. \\
\textbf{Structure-Aware Learnable Positional Embeddings:} To capture the essential {radial context} of the tubule, we incorporate {learnable positional embeddings} $P$. A fixed bank of embeddings $P \in \mathbb{R}^{L_{\max} \times D}$ is learned, where $D$ is the embedding dimension and $L_{\max}=25$ is chosen based on the largest tubule size. The learned positional embedding $P_i$ is added to the feature embedding $z_i$ of each patch before it enters the Transformer Aggregator:
\[
z'_i = z_i + P_i .
\]
For bags shorter than $L_{\max}$, only the first $N$ embeddings are utilized, imparting necessary spatial awareness. \\
\textbf{Patch Aggregator:} A Transformer Encoder with 4 layers, 8 attention heads, and an embedding dimension of 384 is used to process the sequence of patch features together with the \texttt{[CLS]} token.


\subsection{Masked Instance Modelling (MIM-MIL)}
\label{ssec:architecture}
We propose a self-supervised framework for pre-training the MIL model, based on knowledge distillation and Masked Instance Modelling (MIM) and inspired by previous work on patch-level foundational models \cite{zhou2021ibot, he2022masked}. The setup uses two identical networks, a {Student} ($S$) and a {Teacher} ($T$), both sharing the TBA-MIL architecture described in the previous subsection. Additionally, projection heads are added to both the teacher and the student.

\textbf{Projection Heads:} The output features from the Aggregator are passed to two distinct projection heads, following \cite{zhou2021ibot, caron2021emerging}, that map the high-dimensional features to the prototype space $\mathbb{R}^{D_{\text{proto}}}$ (where $D_{\text{proto}}$ is the dimension of the prototypes, e.g., 8192):
    \begin{itemize}
        \item \textbf{Global Head ($h_g$):} Processes the \texttt{[CLS]} token (the bag representation) for the global loss $\mathcal{L}_{global}$.
        \item \textbf{Local Head ($h_l$):} Processes the patch tokens for the local loss $\mathcal{L}_{local}$.
    \end{itemize}


\subsubsection{Student-Teacher Distillation}
The Teacher network ($\theta_t$) provides stable targets for the Student ($\theta_s$). The Teacher's weights are updated as an Exponential Moving Average (EMA) of the Student's weights, ensuring stability during training:
$$
\theta_t \leftarrow \lambda \theta_t + (1 - \lambda) \theta_s
$$
where $\lambda$ follows a cosine schedule, typically starting at $0.996$.

% \subsubsection{Structure-Aware Learnable Positional Embeddings}
% To capture the essential \textbf{radial context} of the tubule, we incorporate \textbf{learnable positional embeddings} $P$. A fixed bank of embeddings $P \in \mathbb{R}^{L_{max} \times D}$ is learned, where $L_{max}=50$. This learned encoding is added to the feature embedding $z_i$ of each patch before it enters the Transformer Aggregator:
% $$
% z'_i = z_i + P_i
% $$
% For bags shorter than $L_{max}$, only the first $N$ embeddings are utilized, imparting necessary spatial awareness.

% \subsection{MIM-MIL : Foundational Pre-training using Masked Instance Modelling}
% \label{ssec:pretraining_objective}

% Our core self-supervised objective is {Masked Instance Modelling}, which forces the Student to learn the rules of tissue organization by prediction.

% \subsubsection{Stochastic Instance Masking}
% We apply a stochastic binary mask $M \in \{0, 1\}^N$ to the bag $X$. The Student receives a corrupted view $\tilde{X}$, where the patches are masked randomly with mask ratio ranging from 0 to 0.3 and replaced by a learnable \texttt{[MASK]} token $e_{mask}$. The Teacher processes the original, uncorrupted bag $X$.\textcolor{red}{Following \cite{zhou2021ibot, he2022masked}, all tokens including masked token retain positional embedding.}

% \subsubsection{Semantic Distillation Loss}
% We utilize a loss to match the student's prediction to the teacher's semantic assignment (prototypes), with the Teacher acting as an online tokenizer. The total loss ($\mathcal{L}$) combines a Global Loss (CLS token) and a Local Loss (masked patches):
% $$
% \mathcal{L} = \mathcal{L}_{global} + \frac{1}{2} (\mathcal{L}_{local}^1 + \mathcal{L}_{local}^2)
% $$
% The critical {Local Loss} ($\mathcal{L}_{local}$) minimizes the Cross-Entropy between the Student's predicted distribution ($p_s$) and the Teacher's sharpened distribution ($p_t$), calculated only on the masked tokens:
% $$
% \mathcal{L}_{local} = - \sum_{i \in \text{Masked}} p_t(x_i)^{\tau_t} \cdot \log p_s(\tilde{x}_i)^{\tau_s}
% $$
% where $\tau_t$ and $\tau_s$ are the temperature parameters. This Cross-Entropy formulation ensures the model learns to identify structural components, optimizing the latent space for downstream classification of abnormalities. \\

% \textcolor{red}{The TBA-MIL and MIM-MIL methods are closely coupled and mutually reinforcing. TBA-MIL incoporates positional encoding which are essential for stochastic instance masking \cite{he2022masked}, without which the the model would have no information about the location of the masked tokens, also, TBA-MIL outputs both a CLS token and patch-level tokens allowing the model to learn from both global \& local self-supervised losses.}

\subsection{MIM-MIL: Foundational Pre-training using Masked Instance Modelling}
\label{ssec:pretraining_objective}

Our core self-supervised objective is \textit{Masked Instance Modelling}, which forces the Student to learn the rules of tissue organization by prediction.

\subsubsection{Stochastic Instance Masking}
We apply a stochastic binary mask $M \in \{0, 1\}^N$ to the bag $X$. The Student receives a corrupted view $\tilde{X}$, where the patches are masked randomly with mask ratio ranging from 0 to 0.3 and replaced by a learnable \texttt{[MASK]} token $e_{mask}$. The Teacher processes the original, uncorrupted bag $X$. Following standard protocols \cite{zhou2021ibot, he2022masked}, positional embeddings are added to all tokens, including the masked tokens, to ensure the model retains spatial context despite the corruption.

\subsubsection{Semantic Distillation Loss}
We utilize a loss to match the student's prediction to the teacher's semantic assignment (prototypes), with the Teacher acting as an online tokenizer. We employ a cross-view strategy where the Student and Teacher process two different augmented views, $u$ and $v$, of the same image bag. The total loss ($\mathcal{L}$) combines a symmetrized Global Loss (on CLS tokens) and Local Loss (on masked patches):
$$
\mathcal{L} = \frac{1}{2}(\mathcal{L}_{global}^1 + \mathcal{L}_{global}^2) + \frac{1}{2} (\mathcal{L}_{local}^1 + \mathcal{L}_{local}^2)
$$
The \textit{Global Loss} terms enforce consistency between the global representations (CLS token) of the two views. The \textit{Local Loss} ($\mathcal{L}_{local}$) minimizes the Cross-Entropy between the Student's predicted distribution ($p_s$) for the masked view $u$ and the Teacher's sharpened distribution ($p_t$) for the clean view $u$:
$$
\mathcal{L}_{local} = - \sum_{i \in \text{Masked}} p_t(u_i) \cdot \log p_s(\tilde{u}_i)
$$
where $p_t$ and $p_s$ are the softmax probabilities sharpened by temperature parameters $\tau_t$ and $\tau_s$ respectively. This Cross-Entropy formulation ensures the model learns to identify structural components, optimizing the latent space for downstream classification of abnormalities.

The TBA-MIL architecture and MIM-MIL framework are closely coupled and mutually reinforcing. TBA-MIL incorporates positional encodings which are essential for stochastic instance masking \cite{he2022masked}; without them, the model would lack information regarding the spatial location of the masked tokens. Furthermore, TBA-MIL explicitly outputs both a global CLS token and local patch-level tokens, enabling the model to effectively learn from both global and local self-supervised distillation losses.


% \subsection{Self-Supervised Pre-training - Masked Image Modeeling}
% \label{ssec:architecture}
% We propose \textbf{iBOT-MIL}, a self-supervised framework based on knowledge distillation and Masked Instance Modelling (MIM). The system uses two identical networks: a \textbf{Student} ($S$) and a \textbf{Teacher} ($T$).

% \subsubsection{Hierarchical Encoder-Aggregator and Projection Heads}
% Both networks share the same architecture, composed of three sequential stages:
% \begin{enumerate}
%     \item \textbf{Feature Encoder:} A \textbf{Vision Transformer (ViT-Small)}, initialized with DINO weights, extracts high-dimensional feature embeddings for each patch $x_i$.
%     \item \textbf{Bag Aggregator:} A standard \textbf{Transformer Encoder} processes the sequence of patch features and the \texttt{[CLS]} token.
%     \item \textbf{Projection Heads:} The output features from the Aggregator are passed to two distinct, linear projection heads that map the high-dimensional features to the prototype space $\mathbb{R}^{D_{\text{proto}}}$ (where $D_{\text{proto}}$ is the dimension of the prototypes, e.g., 1024):
%     \begin{itemize}
%         \item \textbf{Global Head ($h_g$):} Processes the \texttt{[CLS]} token (the bag representation) for the global loss $\mathcal{L}_{global}$.
%         \item \textbf{Local Head ($h_l$):} Processes the patch tokens for the local loss $\mathcal{L}_{local}$.
%     \end{itemize}
% \end{enumerate}

% \subsubsection{Student-Teacher Distillation}
% The Teacher network ($\theta_t$) provides stable targets for the Student ($\theta_s$). The Teacher's weights are updated as an Exponential Moving Average (EMA) of the Student's weights, ensuring stability during training:
% $$
% \theta_t \leftarrow \lambda \theta_t + (1 - \lambda) \theta_s
% $$
% where $\lambda$ follows a cosine schedule, typically starting at $0.996$.

% \subsubsection{Structure-Aware Learnable Positional Embeddings}
% To capture the essential \textbf{radial context} of the tubule, we incorporate \textbf{learnable positional embeddings} $P$. A fixed bank of embeddings $P \in \mathbb{R}^{L_{max} \times D}$ is learned, where $L_{max}=50$. This learned encoding is added to the feature embedding $z_i$ of each patch before it enters the Transformer Aggregator:
% $$
% z'_i = z_i + P_i
% $$
% For bags shorter than $L_{max}$, only the first $N$ embeddings are utilized, imparting necessary spatial awareness.

% \subsection{Pre-training Objective: Masked Instance Modelling (MIM)}
% \label{ssec:pretraining_objective}

% Our core self-supervised objective is \textbf{Masked Instance Modelling}, which forces the Student to learn the rules of tissue organization by prediction.

% \subsubsection{Stochastic Instance Masking}
% We apply a stochastic binary mask $M \in \{0, 1\}^N$ to the bag $X$. The Student receives a corrupted view $\tilde{X}$, where masked patches are replaced by a learnable \texttt{[MASK]} token $e_{mask}$. The Teacher processes the original, uncorrupted bag $X$.

% \subsubsection{Semantic Distillation Loss}
% We utilize a probabilistic loss to match the Student's prediction to the Teacher's semantic assignment (prototypes), with the Teacher acting as an "Online Tokenizer." The total loss ($\mathcal{L}$) combines a Global Loss (CLS token) and a Local Loss (masked patches):
% $$
% \mathcal{L} = \mathcal{L}_{global} + \frac{1}{2} (\mathcal{L}_{local}^1 + \mathcal{L}_{local}^2)
% $$
% The critical \textbf{Local Loss} ($\mathcal{L}_{local}$) minimizes the Cross-Entropy between the Student's predicted distribution ($p_s$) and the Teacher's sharpened distribution ($p_t$), calculated only on the masked tokens:
% $$
% \mathcal{L}_{local} = - \sum_{i \in \text{Masked}} p_t(x_i)^{\tau_t} \cdot \log p_s(\tilde{x}_i)^{\tau_s}
% $$
% where $\tau_t$ and $\tau_s$ are the temperature parameters. This Cross-Entropy formulation ensures the model learns to identify structural components, optimizing the latent space for downstream classification of abnormalities.



\subsection{Implementation Details}
\label{ssec:implementation}

The MIM-MIL pre-training was conducted on four NVIDIA A100 GPUs for 200 epochs, using the AdamW optimizer with a base learning rate of $1\times 10^{-4}$ and a batch size of 128. We utilized a cosine decay learning rate scheduler and employed automatic mixed precision to enhance memory efficiency. Following the implementation of \cite{zhou2021ibot}, a stochastic masking strategy is employed with the masking ratio uniformly sampled from the range $[0, 0.3]$. For the semantic distillation loss, the student temperature $\tau_s$ is fixed at $0.1$, while the teacher temperature $\tau_t$ follows a linear warm-up schedule from $0.04$ to $0.07$ over the first 30 epochs to ensure stable convergence.


%\textcolor{red}{Following the official implementation of \cite{zhou2021ibot} for the ViT-S/16 architecture. Specifically, we employed a stochastic masking strategy where the prediction ratio is sampled uniformly from the range $[0, 0.3]$. This reduced masking range was selected to preserve sufficient semantic context given the smaller model capacity. For the semantic distillation loss, we fixed the student temperature $\tau_s$ at $0.1$, while the teacher temperature $\tau_t$ followed a linear warm-up schedule from $0.04$ to $0.07$ during the first 30 epochs to ensure stable convergence of the online tokenizer.}\\
    
\textbf{Supervised Fine-tuning:} The pre-trained features are used to initialize the final classification model, where the CLS token is passed through a linear layer and classified using standard supervised techniques. All results are reported as the average of three runs.


\begin{table}[htbp!]
\centering
\begin{adjustbox}{width=1\textwidth}
\small
\begin{tabular}{cccc}
\hline
\textbf{MIL Model}                                   & \textbf{Feature Extractor} & \textbf{Pre-Training}  & \textbf{Balanced Accuracy}    \\ \hline
\multicolumn{1}{c|}{ABMIL\cite{ilse2018attention}}                           & UNIV2\cite{chen2024towards}                      & None                   & 91.27                         \\
\multicolumn{1}{c|}{ABMIL\cite{ilse2018attention}}                           & UNIV2                      & Feather\cite{shaomultiple}                & 91.74                         \\
\multicolumn{1}{c|}{DTFD\cite{zhang2022dtfd}}                            & UNIV2                      & None                   & 84.38                         \\
\multicolumn{1}{c|}{DSMIL\cite{li2021dual}}                           & UNIV2                      & None                   & 91.73                         \\

\multicolumn{1}{c|}{TransMIL\cite{shao2021TransMIL}}                        & UNIV2                      & None                   & 91.71                         \\
\multicolumn{1}{c|}{TBA-MIL} & UNIV2                      & None                   & 91.28 \\
\multicolumn{1}{c|}{ABMIL}                           & Testes-SSL                      & None                   & 92.14                         \\
\multicolumn{1}{c|}{DSMIL}                           & Testes-SSL                      & None                   & 92.46                         \\
\multicolumn{1}{c|}{TransMIL}    & Testes-SSL          & None                & 92.54                         \\
\multicolumn{1}{c|}{TBA-MIL} & Testes-SSL                 & None                   & 92.19                         \\
\multicolumn{1}{c|}{\textbf{TBA-MIL}} & \textbf{Testes-SSL}                 & \textbf{MIM-MIL}  & \textbf{94.64}                         \\ \hline
\end{tabular}
\end{adjustbox}
\caption{Balanced accuracy of various MIL models for tubular injury classification on the Wistar rat test set.}
\label{table:MIL_results_new}
\end{table}


\vspace{-2mm}
\section{Results and Discussion}
\vspace{-1mm}

% We investigate the performance of state-of-the-art MIL models for supervised classification of tubular injuries, including ABMIL\cite{ilse2018attention}, DFTD\cite{zhang2022dtfd}, DSMIL\cite{li2021dual}, TRANSMIL\cite{shao2021transmil} and our proposed TBA-MIL. We also, compare the impact of self-supervised pretraining of ABMIL model using Feather\cite{shaomultiple} and our proposed MIM-MIL. Table \ref{table:MIL_results_new} provides the results. TBA-MIL outperforms all MIL models, using Testes-SSL as the feature extractor and transform based aggregation layers, even without using MIM-MIL pre-training. The MIL pre-training further increases the performance, learning from a large amount of unlabelled tubular data. Due to data imbalance, we use Balanced Accuracy as our comparison metric across all classes.

We investigate the performance of state-of-the-art MIL models for supervised classification of tubular injuries, including ABMIL \cite{ilse2018attention}, DTFD \cite{zhang2022dtfd}, DSMIL \cite{li2021dual}, TransMIL \cite{shao2021TransMIL} and our proposed TBA-MIL, using UNIV2 and Testes-SSL weights. We also compare self-supervised pre-training of the ABMIL model using Feather \cite{shaomultiple} with our proposed MIM-MIL pre-training of TBA-MIL. Table~\ref{table:MIL_results_new} provides the results.

% \textcolor{red}{ The following observations can be derived, Testes-SSL enhances the performance of all MIL models learning from large scale domain specific patch data. TBA-MIL achives similar performance to ABMIL, however, TRANSMIL outpeforms all MIL models using Testes SSL. However, the TBA-MIL truely outshines when coupled with MIM-MIL pre-training on large scale unlabelled data, for which it was designed. TBA-MIL and MIM-MIL also outperforms ABMIL with  Feather pre-trained weights, which is itself trained on large scale histopathology data.}

Testes-SSL consistently improves the performance of all evaluated MIL aggregators, highlighting the benefit of learning features from large-scale, domain-specific patch data. Notably, ABMIL equipped with Testes-SSL outperforms ABMIL using Feather pre-trained features, despite the latter being trained on large-scale, heterogeneous histopathology datasets. This indicates that domain-specific representation learning is particularly beneficial for testicular histopathology. When trained with Testes-SSL features, TBA-MIL achieves performance comparable to ABMIL, while TransMIL attains marginally higher accuracy, suggesting that architectural differences alone contribute limited gains in the absence of additional pre-training. In contrast, coupling TBA-MIL with MIM-MIL pre-training on large-scale unlabeled tubule data yields the strongest performance overall. This improvement is enabled by TBA-MIL’s ability to model token-level representations with learnable positional embeddings, which are essential for stochastic instance masking and joint global–local distillation.
%\textcolor{red}{Testes-SSL enhances the performance of all MIL models based on features learned from large-scale, domain-specific patch data. In case of ABMIL, using Testes-SSL even outperforms ABMIL with Feather pre-trained weights, which were trained on large-scale histopathology WSI datasets. This demonstrates that a domain specific feature extractor significantly benefits the MIL performance.TBA-MIL achieves performance comparable to ABMIL, while TransMIL marginally outperforms TBA-MIL models when trained with Testes-SSL. Notably, MIM-MIL pre-training on large-scale unlabeled tubule data, enabled by TBA-MIL that provides learnable positional encoding for stochastic encoding and local \& global representation, demonstrates its strongest performance. } %\footnote{State-of-the-art MIL achitectures do not provide positional encoding and local (token) representation, that are essenatial for MIM-MIL training}. }

We also perform an ablation on the MIM-MIL pre-training strategy to evaluate the importance of masking and positional embedding. As seen in Table~\ref{table:MIL_abla}, both techniques aid in learning better feature representations. Testicular tubules have a radial structure, as seen in Figure~\ref{fig:figure_1}, which can explain the utility of positional embeddings, as they allow the MIL model to localize the patches. On the other hand, masking helps the model learn diverse features by compressing redundant visual patterns, thereby enforcing global tissue understanding.




\begin{table}[htbp!]
\centering
\begin{tabular}{cccc}
Masking & Pos Embedding & Feature Extractor & \multicolumn{1}{l}{Balanced Accuracy} \\ \hline
Yes     & No              & Testes-SSL        & 90.6                                  \\
No        & Yes           & Testes-SSL        & 91.19                                 \\
\textbf{Yes}     & \textbf{Yes}           & \textbf{Testes-SSL}        & \textbf{94.64}    \\   \hline                          
\end{tabular}
\caption{Ablation of Masked Instance Modelling and Positional Embedding in MIM-MIL pre-training. The table reports balanced accuracy on the Wistar rat test set.}
\label{table:MIL_abla}
\end{table}


\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.80\textwidth]{images/attention_maps.drawio_compressed.png}
        \caption{Patch attention scores for tubules of different classes and sizes,  obtained using TBA-MIL \& MIM-MIL framework. \textbf{a-b}: Normal Tubules, all patches except majority background patches get high attention scores; \textbf{c}: Degeneration Tubular, patches exhibiting signs of degeneration  get higher attention; \textbf{d-e}: Degeneration Germ Cell, patches with cellular level injury obtain a high attention score; \textbf{f}: Dilation Tubular, the attention focuses on thinning of epithelium and reduced germ cell layers}
        \label{fig:attention_maps}
    \end{figure}


Finally, we visualize the attention scores of patches within individual tubules, obtained from the TBA-MIL model pre-trained using the MIM-MIL framework on a large-scale unlabeled tubule dataset, in Figure~\ref{fig:attention_maps}. It is observed that low-information regions, such as background-dominated patches, are consistently assigned lower attention weights. In normal tubules, attention is broadly distributed across most patches, whereas in injured tubules, patches corresponding to pathological regions receive higher attention scores. These observations provide further evidence that the model learns relevant features by focusing on pathology-informative regions while effectively down-weighting background noise.


\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.80\textwidth]{images/boxplots4.png}
        \caption{Evaluation of proposed framework on a Wistar Rat toxicology study, to detect testicular tubule injury, on administration of a test drug. The figure shows box plots of percentage of tubules exhibiting injury in control and drug-treated WSI.}
        \label{fig:boxplot}
    \end{figure}


\vspace{-2mm}
\subsection{Analysis on Toxicological study}
\vspace{-1mm}
\label{sec:study}
We evaluate the utility of the TBA-MIL and MIM-MIL framework in assessing the toxicological effects of an administered compound. Specifically, we analyze the distribution of four tubule classes in a toxicological study by comparing the percentage of injured tubules in control and drug-treated tissues. An increased proportion of tubule injury in the treated group is indicative of compound-induced toxicity. The box plot in Figure~\ref{fig:boxplot} shows a statistically significant increase in the percentage of tubules exhibiting degeneration germ cell and degeneration tubular in the treated group compared to the control group. These findings are consistent with independent assessments by a panel of expert pathologists, who confirmed that the compound induces male reproductive toxicity characterized predominantly by these two injury patterns, thereby validating the proposed framework.

\vspace{-2mm}
\section{Conclusion}
\vspace{-1mm}
This work shows that fixed-size tubule classification is insufficient for modelling the diverse morphology of rat testicular tubules. By treating each tubule as a variable-length bag of patches, MIL provides a more effective representation that preserves spatial structure. By integrating the TBA-MIL architecture with the MIM-MIL self-supervised pre-training strategy, the proposed approach effectively exploits large-scale unlabeled data and significantly improves performance over existing MIL baselines. Attention visualizations indicate that the learned representations emphasize pathology-relevant regions while suppressing background noise, and evaluation on a toxicological study demonstrates the framework's ability to detect statistically significant injury differences between control and treated groups, in agreement with expert pathology assessments.

% This work shows that fixed-size tubule classification is insufficient for modelling the diverse morphology of rat testes tubules. By treating each tubule as a variable-length bag of patches, MIL provides a more effective representation that preserves spatial structure. Our proposed TBA-MIL a transformer-based aggregation that learns structure-aware representations from unlabelled data (MIM-MIL) 

% model further enhances this by incorporating transformer-based aggregation and positional embeddings tailored to tubule geometry. With MIM-MIL pretraining, the model learns structure-aware representations from unlabelled data, yielding substantial performance gains. Together, these contributions set a new benchmark for testicular injury classification and offer a framework for modelling complex, non-uniform tissue structures in digital histopathology.

% \subsection{Which MIL Model works best for Testes Modelling}
% %Compare different MIL architectures
% %Our model (with Univ2) : Postional encoding + cls token 
% Next, we investigate the performance of state-of-the-art multiple instance learning model, all using UNIV2 as the tile feature extractor. Transformer architecture outperforms all other models

% We also experiment with a feature extractor trained on Testes and Epididymis tissue data, using DinoV2 \cite{}. Using this feature extractor marginally improves the performance on the test data.



%\subsection{Does Training a new feature extractor help?}


% \subsection{Can Pre-training Aggregation layer help?}
% Compare different pre-trained models and SSL weights.
% Lastly,we evaluate if pre-training of the 



% varying size and a scatter plot of width and hieght.



% Importantly,  Figure \ref{} provides samples of testules of varying size and a scatter plot of width and hieght.





%  as it damage to the highly structured seminiferous epithelium and the resulting loss of germ cells may be irreversible.



% It involves a fine combed inspection of tissue samples from dosed animals and contrasting it against control tissue samples. Testicular toxicity in matured male rates 


% Tiles with an occupancy value of less than 0.1, determined by the Otsu algorithm, were discarded to focus on tissue-covered regions\cite{xu2024whole}. Here we choose
% 66 the reconstruction-based masked autoencoder approach to pretrain ES, which can work well with a small batch size \cite{xu2024whole}



% \section{Comparative Analysis with Foundation Models}
% \label{sec:comparative_analysis}

% Recent advances in computational pathology have introduced large-scale Foundation Models (FMs) that excel at generalized Whole Slide Image (WSI) tasks. However, the fine-grained analysis of \textbf{testicular structural abnormalities} requires a specialized focus. We contrast \textbf{iBOT-MIL} with five leading FMs, arguing that their generalist nature makes them suboptimal for local, context-dependent classification of pathologies like Maturation Arrest.

% \subsection{Architectural Paradigms}

% \begin{table}[h]
% \centering
% \caption{Comparison of Foundational Model Architectures and Objectives.}
% \label{tab:fm_comparison}
% \begin{tabular}{|p{2.5cm}|p{2.5cm}|p{3.5cm}|p{4.5cm}|}
% \hline
% \textbf{Model} & \textbf{Core Objective} & \textbf{Aggregation Mechanism} & \textbf{Primary Limitation for Testis Pathology} \\
% \hline
% Prov-GigaPath & Masked Autoencoder (MAE) & LongNet (Dilated Attention) & Overkill; Signal dilution for localized tubular defects. \\
% \hline
% CHIEF & Weakly Supervised Contrastive & Attention-MIL (Gated) & Relies on weak labels (often cancer); unsuitable for non-malignant defects. \\
% \hline
% PRISM & Vision-Language Contrastive & Perceiver Resampler & Requires massive, clean paired text reports (unavailable in research setting). \\
% \hline
% MADELEINE & Multi-Stain Alignment & Gated Attention & Requires expensive multi-stain data for pre-training. \\
% \hline
% COBRA & Multi-Model Feature Fusion & Mamba-2 (State Space) & Freezes encoders sensitive to cancer features, not structural integrity. \\
% \hline
% \textbf{iBOT-MIL (Ours)} & \textbf{Masked Instance Modelling} & \textbf{Standard Transformer} & \textbf{Optimized end-to-end for structural context and local semantics.} \\
% \hline
% \end{tabular}
% \end{table}

% \subsection{Detailed Comparison: The Need for Structural Specialization}

% \subsubsection{vs. Generalist WSI Models (Prov-GigaPath \& CHIEF)}
% Generalist models prioritize wide spatial context, which compromises local signal integrity.
% \begin{itemize}
%     \item \textbf{Prov-GigaPath:} Utilizes LongNet to handle massive sequence lengths, optimizing for global features. This approach effectively \textbf{dilutes the crucial micro-anatomical signals} defining specific abnormalities (e.g., the precise cellular layering disrupted in Maturation Arrest). Its complexity is unnecessary for moderate-sized tubule ROIs.
%     \item \textbf{CHIEF:} Relies on weak slide-level labels (e.g., Cancer/Normal) during pre-training. Testicular pathology requires fine-grained classification across a spectrum of non-malignant structural failures, making CHIEF's coarse objective ill-suited for unsupervised anomaly detection.
% \end{itemize}

% \subsubsection{vs. Multi-Modal/Multi-Stain Models (PRISM \& MADELEINE)}
% These models introduce dependencies on external data that limit applicability in many research settings.
% \begin{itemize}
%     \item \textbf{PRISM:} Requires paired clinical reports, which are often inconsistent or unavailable in research and toxicity studies. iBOT-MIL operates purely within the visual domain.
%     \item \textbf{MADELEINE:} Pre-training necessitates co-registered multi-stain data (H\&E plus IHC). iBOT-MIL efficiently learns structural consistency and cellular identity from \textbf{H\&E alone}, making it economically and logistically feasible.
% \end{itemize}

% \subsubsection{vs. Feature Fusion (COBRA)}
% \textbf{COBRA} focuses on fusing frozen feature embeddings from existing encoders. These encoders are typically trained on TCGA data and are therefore optimized to detect characteristics of malignant tissue (e.g., nuclear pleomorphism). This makes them inherently \textbf{insensitive to the primary structural defects} defining testicular pathologies, such as cell absence (Sertoli Cell Only) or cellular arrest (Maturation Arrest). iBOT-MIL provides an end-to-end approach, fine-tuning the patch encoder to specifically capture the unique cytological and organizational rules of testicular tissue.

% \subsection{Summary of Advantages}

% By leveraging \textbf{Masked Instance Modelling}, iBOT-MIL shifts the optimization target from general features (pixel/tumor detection) to \textbf{semantic structural validation}. This forces the model to learn the intrinsic rules of cellular co-occurrence within the tubule, providing a superior and specialized representation for abnormality classification.

% \begin{table}[h]
% \centering
% \begin{tabular}{|l|p{6cm}|p{6cm}|}
% \hline
% \textbf{Metric} & \textbf{Generalist FMs (GigaPath/CHIEF)} & \textbf{iBOT-MIL (Ours)} \\
% \hline
% \textbf{Primary Goal} & General Feature Extraction / Cancer Detection & \textbf{Semantic Structural Integrity Validation} \\
% \hline
% \textbf{Context Learning} & Long-range, sparse attention (WSI-level) & Dense, local attention (Bag-level, via MIM) \\
% \hline
% \textbf{Feature Sensitivity} & Optimized for nuclear atypia and malignancy & Optimized for cellular presence/absence and organization \\
% \hline
% \end{tabular}
% \end{table}
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version

\midlacknowledgments{We sincerely thank Dr. Milind Dalvi and Pranab Samanta for their valuable domain expertise in testicular tissue pathology and support in creation of the dataset.}

\bibliography{midl26_286}


\appendix

\section{Size Distribution of Testicular Tubules}


\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.75\textwidth]{images/tubule_size.png}
        \caption{Histogram for tubule sizes from train and test dataset. Median tubule size is 718 pixels, at 10x magnification.}
        \label{fig:hist_tubule_size}
    \end{figure}


\section{Tubule Segmentation Model}
\label{sec:appendix-segmentation}

% \subsection{Training and Evaluation Details}
Tubules form the fundamental structural and functional units of the testis and are the primary targets of testicular toxicity, with injury manifesting as structural or cellular disruptions. Each tubule can reflect varying degrees of toxicity, and therefore must be modelled individually. To this end, we develop a tubule segmentation algorithm using a U-Net architecture.

The dataset consists of approximately 12,000 overlapping patches ($1024 \times 1024$) at $10\times$ magnification extracted from 20 WSIs. These WSIs are split into training, validation and test sets (12:3:5). The WSIs used for developing the U-Net model are separate from those used to create supervised or unsupervised datasets for learning tubule representations.

The model is trained using Dice loss with a cosine decay learning rate scheduler with warm-up, a maximum learning rate of 1e-3, a batch size of 8, and a total of 100 epochs. The model achieves a Dice score of \textbf{96.4\%} on the test set. Figure \ref{fig:tubule_seg} provides sample predictions from the tubule segmentation model. It can be observed that segmentation mask prediction is sharp even in extreme injury cases, with minor missed boundary regions in a few injury tubules.


\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.95\textwidth]{images/seg_pred.drawio.drawio_compressed.png}
        \caption{Sample segmentation mask predictions (overlay) on unseen data on normal and injury affected tubules. Row 1 (top): Normal Tubules, Row 2-4: Injury affected tubules.}
        \label{fig:tubule_seg}
    \end{figure}


Qualitatively, we observe that the classification performance is robust to segmentation imperfections. This can be attributed to several reasons. First, tubules are represented as bags of patch instances rather than relying on precise pixel-level boundaries. Second, the transformer-based aggregator, trained on a large-scale unlabeled tubule dataset, learns to emphasize informative patch patterns, allowing it to down-weight noisy or less informative instances arising from minor segmentation errors, as seen in Figure~\ref{fig:attention_maps}. Finally, large-scale pre-training on approximately 400,000 tubules allows the model to generalize to a variety of tubule sizes, shapes and minor errors in the mask.



\section{Confusion Matrix on Test Set}
\label{sec:confusion}
Figure~\ref{fig:confusion} provides the confusion matrices for the baseline and best-performing models. A performance gain is observed across all classes using our proposed framework, i.e., TBA-MIL pre-trained with MIM-MIL. The highest gain is obtained for the classes Degeneration Germ Cell and Degeneration Tubular, reducing both false positive and false negative predictions.

\begin{figure}[htbp]
        \centering
        \includegraphics[width=0.85\textwidth]{images/confusion_matrix.drawio_compressed.png}
        \caption{Confusion matrices obtained on the test set for the image re-sizing based classifier, ABMIL, TransMIL and TBA-MIL with MIM-MIL. Class names: Degeneration Tubular (DT), Degeneration Germ Cell (DG), Dilation Tubular (DiT).}
        \label{fig:confusion}
\end{figure}
\end{document}
