\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{multirow}
\newcommand\ful[2][4cm]{\underline{\makebox[#1][c]{#2}}}
% \usepackage{caption}
\jmlrvolume{-- Under Review}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}

\title[NcIEMIL]{NcIEMIL: Rethinking Decoupled Multiple Instance Learning Framework for Histopathological Slide Classification}


% Authors with different addresses:
\midlauthor{\Name{Qiehe Sun\nametag{$^{1}$}} \Email{sunqh21@mails.tsinghua.edu.cn}\\
\Name{Doukou Jiang\nametag{$^{2}$}} \Email{woshijdk@126.com}\\
\Name{Jiawen Li\nametag{$^{1}$}} \Email{lijiawen21@mails.tsinghua.edu.cn}\\
\Name{Renao Yan\nametag{$^{1}$}} \Email{yra21@mails.tsinghua.edu.cn}\\
\Name{Yonghong He\nametag{\midljointauthortext{Corresponding author}$^{1}$}} \Email{heyh@sz.tsinghua.edu.cn}\\
\Name{Tian Guan\nametag{\midlotherjointauthor$^{1}$}} \Email{guantian@sz.tsinghua.edu.cn}\\
\Name{Zhiqiang Cheng\nametag{\midlotherjointauthor$^{3}$}} \Email{chengzhiqiang2004@aliyun.com}\\
\addr $^{1}$ Shenzhen International Graduate School, Tsinghua University, China \\
\addr $^{2}$ Department of Pathology, Shenzhen Center For Chronic Disease Control, China \\
\addr $^{3}$ Department of Pathology, The Third People’s Hospital of Shenzhen, China 
% \addr $^{*}$ Equal contribution \\
% \addr $^{\dag}$ Corresponding author
}
\begin{document}

\maketitle

\begin{abstract}
On account of superiority in annotation efficiency, multiple instance learning (MIL) has proved to be a promising framework for whole slide image (WSI) classification in pathological diagnosis. However, current methods employ fully- or semi-decoupled frameworks to address the trade-off between billions of pixels and limited computational resources. This exacerbates the information bottleneck, leading to instance representations in a high-rank space that contains semantic redundancy compared to the potential low-rank category space of instances. Additionally, most negative instances are also independent of the positive properties of the bag. To address this, we introduce a weakly annotation-supervised filtering network, aiming to restore the low-rank nature of the slide-level representations. We then design a parallel aggregation structure that utilizes spatial attention mechanisms to model inter-correlation between instances and simultaneously assigns corresponding weights to channel dimensions to alleviate the redundant information introduced by feature extraction. Extensive experiments on the private gastric intestinal metaplasia dataset and CAMELYON16 breast dataset show that our proposed framework is capable of handling both binary and multi-class classification problems and outperforms state-of-the-art MIL-based methods. The code is available at: {\url{https://github.com/polyethylene16/NcIEMIL}}.
\end{abstract}

\begin{keywords}
Multiple instance learning, Histopathological slide, Redundancy cleansing
\end{keywords}

\section{Introduction}

Histopathological slide examination, which commonly requires a lot of time and effort from pathologists, is seen as the ``gold standard'' in clinical diagnosis \cite{aeffner2017gold, cai2021greater}. Computational pathology seeks to reduce the burden on physicians by employing algorithms \cite{kather2019deep, skrede2020deep, greenwald2022whole}.

Tissue samples are stained, scanned, and stored as digital images with different magnifications in a pyramid structure known as the whole slide image (WSI). The lower layers of WSI are utilized to study the tumor heterogeneity in cell morphology, necessitating the feeding of billions of pixels into networks at once. However, GPUs struggle to handle so many parameters simultaneously \cite{tellez2019neural}. In this case, multiple instance learning (MIL) that decouples the computational process is more effective \cite{quellec2017multiple}.

% \begin{figure}[]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:1}
%   {\caption{Commen image classification network and decoupled MIL network.}}
%   {\includegraphics[width=\linewidth, trim=35 85 35 85, clip]{figs/fig_1.pdf}}
% \end{figure}

Slides are meshed into numerous patches at a specific magnification. ``Patch'' and ``slide'' correspond to ``instance'' and ``bag'' in MIL, respectively. A bag's characteristic is a collection of instance attributes. For slide classification, MIL frameworks are decoupled into two independent parts \cite{ilse2018attention}, with a deep extractor being employed to understand salient features of instances, and a shallow aggregator responsible for the integration and mapping of bags. Since the above networks are trained separately, the computational cost is greatly reduced. Specifically, on the one hand, patching enables instances to be fed into the extractor in batches, avoiding the necessity to perform large matrix operations in a single pass. On the other hand, the global pooling layer at the end of the extractor substantially reduces dimensionality, facilitating straightforward linear transformations for the assembled bag-level representations. However, upon revisiting the aforementioned process, we identified information redundancy originating from multiple sources:
\begin{itemize}
\item \textbf{Gradient operation and backpropagation.} Slide-level labels serve as the exclusive source of supervised signals, and their gradients are solely fed back into the shallow aggregator. Conversely, the extractor is not constrained, resulting in a lack of understanding regarding the task-relevant property of the instances.
\item \textbf{Instance-level feature concatenation.} In slides, the distribution of key instances tends to be sparse. However, when forming a bag embedding, all instances are involved, which leads to many irrelevant instances being of interest during aggregation.
\item \textbf{Redundancy in the channel dimension.} For a given task, instances can be perceived as manifolds in a low-dimensional space. However, extractors often embed instances in high-dimension to prevent information loss, regardless of their utility.
\end{itemize}

For this reason, we sought to optimize the interference terms mentioned above and proposed a Non-crucial Information Elimination-based MIL (NcIEMIL) architecture. In customizing the extractor, we followed the approach of \citet{campanella2019clinical} by associating the corresponding bag-level annotation with the single instance that the extractor deems most likely to be positive, thus making full use of limited supervised signals. Subsequently, we exclude irrelevant instances based on the ranking of positive probability and form a more refined bag embedding. The noise of channel dimension is specifically addressed during aggregation. We design a bi-parallel aggregator, introducing channel attention to weigh each embedding dimension, aiming to approximate the low-dimensional manifold. Simultaneously, we maintain the advantage of spatial attention, dynamically modeling the influence of different instances. To comprehensively validate the framework's effectiveness, we conducted experiments on the CAMELYON16 dataset and the retrospectively collected BgIM gastric mucosal biopsy dataset.

\section{Related Work}
Initially employed for drug activity prediction \cite{dietterich1997solving}, multiple instance learning is widely used in the whole slide image research \cite{campanella2019clinical, kanavati2020weakly, chen2021annotation, marini2022unleashing}. It aligns with the characteristics of pathology data, which is large but unlabeled, intending to reduce the daily workload of pathologists. MIL is categorized into instance- and embedding-level methods \cite{ilse2018attention}. Extensive research has been conducted on the latter as it is well-suited for more complex application scenarios, and our method also belongs to this category.
\subsection{Feature Extraction}
Due to the absence of a standard database, MIL extractors typically employ neural networks pre-trained on ImageNet \cite{deng2009imagenet} to map instances into a hidden space, retaining basic knowledge of color and texture \cite{lu2021data, shao2021transmil, zhang2022dtfd}. However, histopathological images significantly differ from natural images \cite{wang2022transformer, kang2023benchmarking}, a distinction increasingly emphasized with the advancement of self-supervised learning and extended to MIL \cite{li2021dual, chen2022scaling}. Additionally, the semi-decoupled framework selects instances deemed important by the aggregator \cite{qu2022bi, yu2023bayesian}, providing supervision to the extractor but being prone to introducing errors. In contrast, we fully utilize slide-level labels, referencing instance-level methods \cite{campanella2019clinical} to impart more determinism to the extraction network.
\subsection{Instance Filtering}
To identify the active region and eliminate extraneous interference, the instances were manually recalibrated by introducing extra supervision \cite{wang2019rmdl}. Class activation mapping (CAM) \cite{zhou2016learning} was used to indicate lesion regions on thumbnails of WSI, although with some imprecision \cite{chen2021diagnose}. In this paper, we calculate the positive scores and select discriminative instances accordingly.
\subsection{Attention-based Aggregation}
Max- and avg-pooling serve as the simplest aggregation functions, representing two extreme assumptions in the MIL formulation that are prone to opposite judgments due to biases in the extraction process. Recognizing the diverse contributions of each instance, attention mechanism is employed to efficiently aggregate representations of instances \cite{ilse2018attention, lu2021data}. Due to the great success of Transformer \cite{vaswani2017attention}, self-attention (SA) has emerged as an alternative \cite{shao2021transmil, zheng2022graph}. However, its quadratic complexity is called into question due to the significantly larger number of tokens in MIL compared to other domains. In this paper, by eliminating most non-discriminative instances, our aggregator is still constructed on top of SA, maintaining computational efficiency within an acceptable range. 

\section{Methodology}
\subsection{Multiple Instance Learning Formulation}
For clarity, we present the formulation of MIL. Considering a binary problem as an example, we are given a bag $\mathbf{X} = \left \{\mathbf{x}_1, \mathbf{x}_2, \cdots, \mathbf{x}_n \right \}$ and its corresponding label $Y \in \left \{0, 1\right \}$, where $\mathbf{x}_i$ denotes the $i$-th instance of $\mathbf{X}$ and $n$ is the number of instances. We assume that the true label of $\mathbf{x}_i$ is $y_i \in \left \{0, 1\right \}$, which is not known in practice. Then MIL can be described as:
\begin{align}
    Y = \begin{cases}
 0, & \text{ iff } \sum_i{y_i} = 0, \\
 1, & \text{ otherwise }
\end{cases}
\label{eq1}
\end{align}
Assuming the instances within $\mathbf{X}$ are independently and identically distributed, Eq. \ref{eq1} can be generalized \cite{zaheer2017deep} as:
\begin{align}
    Y = S(\mathbf{X}) = g(\textstyle \sum_i{f(\mathbf{x}_i)})
\label{eq2}
\end{align}
where $S(\cdot)$ is a scoring function, and $f(\cdot)$ and $g(\cdot)$ are two suitable transformations. MIL is decoupled into two steps. In the embedding-level approach, $f(\cdot)$ is responsible for feature embedding. The summation function $\sum$ transforms into concatenation, and $g(\cdot)$ is an aggregation function that maps the bag-level representation to the category space.
\begin{figure}[t!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:2}
  {\caption{Overview of NcIEMIL. (a) Instances are fed into the extractor and projection head on a bag basis, and the instance with the highest probability is involved as a representative of its corresponding bag. (b) Instances within a bag are re-ranked, and bi-directional sampling is employed to generate bag embeddings. (c) A hybrid attention-based aggregator, comprising parallel spatial and channel attention mechanisms, is employed to mitigate information redundancy.}}
  {\includegraphics[width=\linewidth, trim=5 0 5 0, clip]{figs/fig_2.pdf}}
\end{figure}

\subsection{ Non-crucial Information Elimination-based MIL}
In practice, two different networks, $f_{\theta}$ and $g_{\tau}$, perform the duties of $f(\cdot)$ and $g(\cdot)$, where $\theta$ and $\tau$ are learnable parameters. We observed that a substantial amount of irrelevant information was forced to be retained or introduced during the process. Hence, we introduce \textbf{N}on-\textbf{c}rucial \textbf{I}nformation \textbf{E}limination (NcIE) to alleviate this trend, as shown in Fig. \ref{fig:2}.
\\ \hspace*{\fill} \\
\noindent \textbf{Weak Supervision for Extraction.} The gradient backpropagation stops between the aggregator and the extractor due to physical patching and splicing. As a result, the extractor remains unconstrained. Taking inspiration from the work of \citet{campanella2019clinical}, we incorporate the ground truth $Y$ into the pre-training of the extractor. In particular, we append a mapping head $h_{\theta'}$ to $f_{\theta}$ to generate the positive probability $P(\mathbf{x}_i) = \mathbb{P}(h_{\theta'}(f_{\theta}(\mathbf{x}_i)) = y^* \mid \mathbf{x}_i)$ for the instance $\mathbf{x}_i$, where $y^*$ represents the positive class. The instance with the highest positive probability contributes to the parameter update, and the loss can be expressed as:
\begin{align}
    \mathcal{L} = - Y\log(\hat{p}) - (1-Y)\log(1-\hat{p}), \quad
    \hat{p} = \max \{P(\mathbf{x}_i) \mid i = 1, 2, \cdots, n \}
\label{eq3}
\end{align}
where $\hat{p}$ is the score of the instance that is most likely to be positive. Note that $n$ is not a constant as the foreground areas of slides vary, and $\theta$ and $\theta'$ are updated simultaneously.
\vspace{1em}

\noindent \textbf{Discriminative Instance Selection.} In histopathological sections, only a small portion is pertinent to the problem. Numerous irrelevant instances introduce noise for the aggregator, leading to a drastic shift in attention \cite{yan2023shapley}. We precisely filter potential discriminative instances by assigning each instance from the same bag a corresponding score that is produced by $h_{\theta'}$, and then re-ranking instances in descending order:
\begin{align}
    \mathbf{X} = \{\mathbf{x}'_j |P(\mathbf{x}'_1) \ge P(\mathbf{x}'_2) \ge \cdots \ge P(\mathbf{x}'_n)\}
\label{eq4}
\end{align}
The first $K$ and the last $K$ instances of $\mathbf{X}$ are chosen to create a new pseudo-bag $\mathbf{X}' = \{ \mathbf{x}'_1, \cdots, \mathbf{x}'_K, \mathbf{x}'_{n-K+1}, \cdots, \mathbf{x}'_n \}$. Including the latter $K$ instances aims to implicitly construct negative samples for discriminative instances while avoiding extreme data distribution.
\vspace{1em}

\noindent \textbf{Hybrid Attention-Based Aggregator.} According to Eq.~\ref{eq2}, for a given task, ideally, $f(\mathbf{x}_i)$ should be a low-dimensional manifold indicative of potential categories. However, $f_{\theta}(\mathbf{x}_i)$ is commonly a high-dimensional vector, implying redundancy in the channel dimension of bag embedding. Therefore, we use squeeze and excitation \cite{hu2018squeeze} to capture channel validity for it dynamically:
\begin{align}
\mathbf{u}_{channel} = \operatorname{sigmoid}(\mathbf{W}_2 \sigma(\mathbf{W}_1 \cdot \frac{1}{2K}\textstyle \sum_{k=1}^{2K}\mathbf{u}(k))) \cdot \mathbf{u}
\end{align}
where $\mathbf{u}=||_{\mathbf{x}_k \in \mathbf{X}'}f_{\theta}(\mathbf{x}_k) \in \mathbb{R}^{2K \times d}$ denotes the bag embedding, obtained by concatenating instance embeddings, with $d$ being the embedding dimension. $\sigma$ is an activation function, while $\mathbf{W}_1$ and $\mathbf{W}_2$ are linear transformations. We maintain self-attention to model the influence of instances and their spatial correlation:
\begin{align}
\begin{cases}
  \mathbf{u}_{spatial} = \operatorname{softmax}(\mathbf{Q}\mathbf{K}^T / \sqrt{d} )\mathbf{V} \\
  \mathbf{Q} = (\mathbf{v}||\mathbf{u})\mathbf{W_Q}, \mathbf{K}=(\mathbf{v}||\mathbf{u})\mathbf{W_K}, \mathbf{V}=(\mathbf{v}||\mathbf{u})\mathbf{W_V} 
\end{cases}
\end{align}
where $\mathbf{W_Q}$, $\mathbf{W_K}$ and $\mathbf{W_V}$ are learnable, and $\mathbf{v} \in \mathbb{R}^{1 \times d}$ is a learnable embedding that represents a virtual instance used for classification. The above process is illustrated in Fig.~\ref{fig:2}(c). We designed an aggregator based on hybrid attention in a ``parallel-merge'' structure. The intersection of attention is achieved through cross-attention:
\begin{align}
\mathbf{u}' = \operatorname{softmax}(\mathbf{u}_{spatial}\mathbf{W'_Q}(\mathbf{u}_{channel}\mathbf{W'_K})^T / \sqrt{d})\mathbf{u}_{channel}\mathbf{W'_V}
\end{align}
where $\mathbf{W'_Q}$, $\mathbf{W'_K}$ and $\mathbf{W'_V}$ are also learnable linear projections. Then the aggregated $\mathbf{v}'$ is extracted from the spatial dimension of  $\mathbf{u}' \in \mathbb{R}^{(1 + 2K) \times d}$ and converted to the result by a fully-connected layer.
\begin{figure}
    \makeatletter\def\@captype{table}\makeatother % 
    \begin{minipage}[m]{0.5\linewidth}
        \centering
        \caption{Details of BgIM dataset division.}
        \begin{tabular}{ccccc}
            \toprule[1.5pt]
                  & \multicolumn{4}{c}{Severity of Intestinalization} \\ \cmidrule{2-5} 
                  & -          & +         & ++         & +++         \\ \midrule
            Train & 56         & 74         & 32         & 16         \\
            Test  & 13         & 18         & 8          & 3          \\ \bottomrule[1.5pt]
        \end{tabular}
        \label{tab: 1}
    \end{minipage}%
    \hspace{0.3cm}
    \makeatletter\def\@captype{figure}\makeatother
    \begin{minipage}[m]{0.5\textwidth}
        \centering
        \includegraphics[width=0.6\linewidth]{figs/fig_3.pdf}
        \caption{Several samples from BgIM.}
        \label{fig3}
    \end{minipage}
\end{figure}

\section{Experiment}
\subsection{Datasets}
\noindent \textbf{CAMELYON16 Breast Dataset.} CAMELYON16 \cite{bejnordi2017diagnostic} is a publicly available, unbalanced dataset focused on differentiating between cancer and non-cancer cases for metastasis detection in breast cancer, which consists of 270 slides for training and 129 for testing. Following pre-processing, we acquired approximately 2.6 million patches at $20\times$ magnification, averaging 7,346 patches per slide.
\vspace{1em}

\noindent \textbf{BgIM Gastric Mucosa Dataset}. Intestinal epithelial metaplasia is a common lesion of the gastric mucosa, occurring in many chronic gastric diseases, and is typically considered a precancerous condition \cite{correa2010pathology}. We retrospectively gathered 220 gastric mucosal biopsies and assessed the $4$-grading results for intestinal epithelial metaplasia with two pathologists. On average, 1,320 patches were extracted per slide at $40\times$ magnification. More details are shown in Tab. \ref{tab: 1} and Fig. \ref{fig3}.

\subsection{Experiment Setup and Evaluation Metrics}
We used the OTSU algorithm to create foreground masks for all WSIs, producing non-overlapping $256 \times 256$ patches. For CAMELYON16, we performed a 5-fold cross-validation on the official training set and reported the average performance on the official test set. We artificially divided the BgIM dataset into an 8:2 ratio for training and testing, respectively. We report the average performance across five different sets of random seeds. Metrics used include accuracy (ACC), area under the curve (AUC), and F1-score.

\subsection{Implementation Details}
We used swin-tiny \cite{liu2021swin} as the backbone of the extractor. When pre-training, a projection head was added at its end to obtain a positive probability, and a batch size of 128 was employed. In the aggregation phase, the bag embedding was increased from 768 to 1024 dimensions using a fully connected layer. Because the number of focus instances was fixed at $2K$ ($K=512$ for CAMELYON16 and $K=128$ for BgIM), the mini-batch was set to $4$, differing from $1$ in other comparison methods. We chose SGD as the optimizer with a learning rate of 2e-4 and weight decay of 1e-5, and the loss function used was cross-entropy. All experiments were conducted on an RTX 3090.

\begin{table*}[htbp]
\setlength{\abovecaptionskip}{0cm} 
\setlength{\belowcaptionskip}{-0.8cm}
\small
\caption{\textbf{Comparative results on Camelyon16, and BgIM.} Subscripts indicate the standard deviation of the 5-fold cross-validation. The best performance is then marked in bold.}
\begin{center}
\begin{tabular}{c||ccc|ccc}
\toprule[1.5pt]
\multirow{2}{*}{Method}                 & \multicolumn{3}{c|}{\ful[4cm]{CAMELYON16}} & \multicolumn{3}{c}{\ful[4cm]{BgIM}} \\ 
                                        & ACC     & AUC     & F1-Score    & ACC     & AUC     & F1-Score    \\ \midrule
Naive          & $62.40_{1.42}$ & $62.16_{1.40}$ & $45.15_{5.28}$ & $52.98_{3.32}$ & $76.58_{1.28}$ & $44.27_{4.44}$  \\
Fully-supervised        & $\textcolor{red}{91.32_{1.99}}$ & $\textcolor{red}{94.93_{0.87}}$ & $\textcolor{red}{90.73_{2.12}}$ & $\textcolor{red}{91.91_{1.17}}$ & $\textcolor{red}{98.72_{0.33}}$ & $\textcolor{red}{91.57_{2.61}}$   \\ \midrule
MIL-RNN          & $82.64_{2.17}$ & $85.02_{2.73}$ & $80.64_{1.94}$ & $53.81_{3.56}$ & $80.19_{1.86}$ & $48.55_{2.81}$   \\
ABMIL          & $82.17_{1.77}$ & $84.05_{2.27}$ & $80.21_{1.66}$ & $60.00_{0.95}$ & $86.92_{0.46}$ & $51.13_{0.15}$  \\
CLAM-MB        & $82.48_{3.27}$ & $81.20_{2.76}$ & $80.81_{3.37}$ & $64.70_{4.88}$ & $86.32_{2.69}$ & $56.50_{2.12}$  \\
DSMIL         & $81.86_{1.44}$ & $79.71_{2.87}$ & $79.31_{1.31}$ & $60.00_{3.50}$ & $85.38_{0.77}$ & $52.08_{3.28}$  \\
TransMIL        & $84.81_{1.60}$ & $88.13_{1.46}$ & $83.32_{1.17}$ & $73.81_{3.37}$ & $90.65_{0.70}$ & $64.90_{3.26}$  \\
ILRA-MIL        & $84.65_{2.93}$ & $85.42_{2.24}$ & $82.90_{3.07}$ & $75.24_{2.43}$ & $91.28_{0.84}$ & $66.76_{2.95}$  \\ \midrule
NcIEMIL           & $\mathbf{86.05_{1.55}}$ & $\mathbf{89.68_{2.10}}$ & $\mathbf{85.26_{1.54}}$ & $\mathbf{85.23_{0.95}}$ & $\mathbf{95.87_{0.60}}$ & $\mathbf{81.20_{0.94}}$  \\
\bottomrule[1.5pt]
\end{tabular}
\label{table1}
\end{center}
\end{table*}

\subsection{Quantitative Results}
Our NcIEMIL was compared with MIL-RNN \cite{campanella2019clinical}, ABMIL \cite{ilse2018attention}, CLAM \cite{lu2021data}, DSMIL \cite{li2021dual}, TransMIL \cite{shao2021transmil}, and ILRA-MIL \cite{xiang2022exploring}. Tab.~\ref{table1} presents the results on the two datasets. For CAMELYON16, our model shows improvements of 1.24\%, 1.55\%, and 1.94\% in ACC, AUC, and F1-score, respectively, compared to the best-performing baseline (TransMIL). In the cumulative grading task with BgIM, our model exhibits greater effectiveness, surpassing the most effective baseline, ILRA-MIL, by 9.99\%, 4.59\%, and 14.44\% in ACC, AUC, and F1-score, respectively. Additionally, we reported significance verification to indicate that our method is significantly better than the baseline methods, as shown in Appendix~\ref{appendix A}.

Furthermore, we introduce two non-MIL methods: naive and fully-supervised approaches. The naive image-wise approach involved directly classifying downsampled WSI thumbnails using a swin-tiny network, akin to the approach in \cite{chen2021annotation}. The fully-supervised approach used a batch of patches with real labels to train a patch-level swin-tiny network. The results of the slides were obtained by the mean value of all instances. The results show that the proposed method captures slide image details and performs much better than the naive method, but is slightly inferior to the fully supervised method due to the extractor's lack of factual knowledge. To demonstrate the accuracy of the discriminative instance selection, we also performed the corresponding visualizations, as shown in Appendix~\ref{appendix B}.

\subsection{Ablation Studies}
Ablation results are presented in Tab. \ref{table2}. Initially, we validated the effectiveness of channel attention by removing the blue box in Fig. \ref{fig:2}(c). The results demonstrate that, although modest, channel attention does enhance performance. Subsequently, we modified the way of selecting focus instances, replacing bi-directional sampling with single-directional sampling and random sampling (single sampling and random sampling in Tab. \ref{table2}). The results indicated that bi-directional sampling significantly outperformed the other two approaches, validating our hypotheses regarding data distribution and implicit contrasts in bag embedding. We then tested different extractors, including replacing the original one with a network pre-trained on ImageNet and pre-trained by self-supervised learning \cite{wang2022transformer}. The selection of focal instances remains unchanged. Results show that task-specific relevant weakly supervised training of the extractor is effective.
\begin{table}[htbp]
\setlength{\abovecaptionskip}{0cm} 
\setlength{\belowcaptionskip}{1cm}
\small
\centering

\caption{\textbf{Ablation Studies on Camelyon16, and BgIM.} Subscripts indicate the standard deviation of the 5-fold cross-validation. The best performance is then marked in bold.}
\resizebox{1.0\linewidth}{!}{
\begin{tabular}{c||ccc|ccc}
\toprule[1.5pt]
\multirow{2}{*}{Ablation item}                 & \multicolumn{3}{c|}{\ful[4cm]{CAMELYON16}} & \multicolumn{3}{c}{\ful[4cm]{BgIM}} \\
                      & ACC     & AUC    & F1-score    & ACC   & AUC  & F1-score  \\ \midrule
w/o channel attention & $85.89_{1.50}$ & $88.86_{1.80}$ & $84.86_{1.47}$ & $84.28_{1.17}$ & $95.86_{0.65}$ & $79.62_{1.53}$ \\ \midrule
w/ random sampling  & $83.72_{1.90}$ &$85.80_{3.24}$ & $81.76_{2.31}$ & $80.95_{1.51}$ & $94.50_{0.62}$ & $78.07_{1.14}$ \\
w/ single sampling  &  $85.43_{1.98}$ & $87.17_{3.34}$ & $\mathbf{86.13_{1.69}}$ & $80.47_{0.95}$ & $94.00_{0.49}$ & $72.79_{2.45}$ \\ \midrule
w/ ImageNet weight  & $85.12_{1.58}$ & $87.02_{3.87}$ & $84.13_{1.22}$ & $80.00_{1.90}$ & $92.49_{0.97}$ & $70.41_{1.05}$ \\ 
w/ ctranspath weight  & $85.73_{1.69}$ & $88.45_{1.61}$ & $84.84_{1.64}$ & $77.62_{4.15}$ & $92.70_{0.66}$ & $70.16_{5.87}$ \\ \midrule
w/ small $K$  & $85.75_{1.84}$ &$\mathbf{90.06_{1.87}}$ & $84.87_{1.68}$ & $83.33_{1.51}$ & $95.21_{0.50}$ & $80.09_{2.14}$ \\
w/ medium $K$  &  $85.36_{1.49}$ & $89.46_{1.30}$ & $84.01_{1.30}$ & $84.28_{1.90}$ & $94.90_{0.83}$ & $80.00_{1.95}$ \\ \midrule
NcIEMIL           & $\mathbf{86.05_{1.55}}$ & $89.68_{2.10}$ & $85.26_{1.54}$ & $\mathbf{85.23_{0.95}}$ & $\mathbf{95.87_{0.60}}$ & $\mathbf{81.20_{0.94}}$ \\ \bottomrule[1.5pt]
\end{tabular}}
\label{table2}
\end{table}
We also performed ablation experiments on $K$. We introduced two values for $K$: a small $K$ ($K=128$ for CAMELYON16 and $K=32$ for BgIM) and a medium $K$ ($K=288$ for CAMELYON16 and $K=72$ for BgIM), and assessed their impacts on the results. The results show that a larger 
$K$ has more advantages but is limited by the slide area.

\section{Conclusion}
In this paper, we reconsider the decoupled MIL framework and assess the noise sources. To reduce information redundancy, we reshape the focal instance selection by employing weakly supervised training extractors and then create a hybrid attention-based aggregator. We collected a gastric mucosal biopsy dataset, BgIM, to validate the method's effectiveness. Extensive experiments on CAMELYON16 and BgIM demonstrate that our method's performance is comparable to the state-of-the-art. However, the approach still has shortcomings. For instance, the weakly supervised training of extractors tends to bring positive instances with different semantic information closer in the feature space. Therefore, optimizing this process is a consideration for future work.
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We express our gratitude to Shenzhen First People's Hospital for providing de-identified data. The work was supported in part by the Development and Reform Commission of Shenzhen Municipality (Number: XMHT20230115004, KCXFZ20201221173207022). The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.}


\bibliography{midl24_099}


\appendix
% \section{Flaws of Decoupled MIL}
% Common image classification networks have achieved great success, demonstrating strong performance in several downstream tasks. Although the MIL network is highly similar to common image classification networks, concessions made due to insufficient computational resources for histopathological slide images lead to redundancy and noise.
% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:1}
%   {\caption{Commen image classification network and decoupled MIL network.}}
%   {\includegraphics[width=\linewidth, trim=35 85 35 85, clip]{figs/fig_1.pdf}}
% \end{figure}
% Patch embedding, or "stem" is a key step in dividing small images into tokens and mapping the color channels into the abstract space, as shown in Fig. \ref{fig:1}. This operation is usually performed by convolution, ensuring that the image classification network maintains a high degree of consistency in gradient computation. Instead, histopathology slide images are divided into patches to conserve computational resources. This physical division treats the patch as the smallest unit of operation, completely cutting off the correlation between neighboring patches and destroying the wholeness of the slide. Although self-attention nominally attempts to reconstruct this relationship, it is not equivalent. Self-attention focuses on the semantic correlation between patches but ignores the spatial correlation between pixel points. Additionally, feature extraction handles the patch embedding role but is not constrained by supervised signals due to frame decoupling. This leads to a gap in task understanding between the extractor and aggregator.
\section{Significance Verification}
\label{appendix A}

We reported the p-values of two-sample t-tests on ACC, AUC, and F1-Score between our method and baselines respectively. $p < 0.05$ indicates that our method is significantly better than the baseline method. 
\begin{table}[htbp]
\caption{Significance analysis results on CAMELYON16 and BgIM datasets.}
\centering
\begin{tabular}{c||ccc|ccc}
\toprule[1.5pt]
\multirow{2}{*}{Method}                 & \multicolumn{3}{c|}{\ful[4cm]{CAMELYON16}} & \multicolumn{3}{c}{\ful[4cm]{BgIM}} \\
                      & ACC     & AUC    & F1-score    & ACC   & AUC  & F1-score  \\ \midrule
MIL-RNN        & $0.0086$ & $0.0194$ & $0.0006$ & $5.25\times10^{-5}$ & $0.0001$ & $1.91\times10^{-5}$  \\
ABMIL          & $0.0011$ & $0.0060$ & $0.0017$ & $8.51\times10^{-7}$ & $7.48\times10^{-5}$ & $1.95\times10^{-7}$  \\
CLAM-MB        & $0.0318$ & $0.0089$ & $0.0193$ & $0.0008$ & $0.0008$ & $3.09\times10^{-5}$  \\
DSMIL         & $0.0086$ & $0.0059$ & $0.0015$ & $4.92\times10^{-5}$ & $0.0001$ & $4.63\times10^{-5}$  \\
TransMIL        & $0.1091$ & $0.1874$ & $0.0451$ & $0.0010$ & $0.0003$ & $0.0005$  \\
ILRA-MIL        & $0.1893$ & $0.0581$ & $0.1191$ & $0.0002$ & $0.0012$ & $0.0002$  \\ \bottomrule
\end{tabular}
\label{tab A}
\end{table}
The results show that for the CAMELYON16 dataset, our method significantly outperforms all baseline methods except TransMIL and ILRA-MIL. For the BgIM dataset, our method significantly outperforms all baseline methods, as shown in Tab.~\ref{tab A}.
\section{Visualization}
\label{appendix B}
We present the three highest-scoring (top-3) and the three lowest-scoring (negative top-3) instances from the four-grade slides of BgIM and from the normal, micro-, and macro-metastasis slides of CAMELYON16, respectively.
\begin{figure}[ht]
  \centering
  \includegraphics[width=\linewidth,trim=0 110 0 90,clip]{figs/fig_4.pdf}
  \caption{Visualization on Camelyon16}
  \label{fig:B1}
\end{figure}
\begin{figure}[ht]
  \centering
  \includegraphics[width=\linewidth,trim=0 50 0 60,clip]{figs/fig_5.pdf}
  \caption{Visualization on BgIM}
  \label{fig:B2}
\end{figure}
For CAMELYON16, the projection head trained with weak supervision identifies the patch containing the cancerous region. This holds true for both micro- and macro-metastatic cancer slides, as shown in Fig.~\ref{fig:B1}. In the BgIM dataset (Fig.~\ref{fig:B2}), for slides with intestinal metaplasia grade 1, the projection head incorrectly assumed that negative mucosal images were positive. However, it still retrieved all patches where intestinal metaplasia actually occurred. In contrast, the projection head's judgment was more accurate for slides with intestinal metaplasia grades 2 and 3.
\end{document}

