\setcounter{footnote}{0}
\section{Method}
\subsection{Data Description}
We use 2,297 digital WSIs originating from patients in a real-world clinical setting of \texttt{Seegene Medical Foundation}\footnote{This study was performed in line with the principles of the Declaration of Helsinki. Approval was granted by the Ethics Review Board SMF-IRB-2024-007 and KH2024-059.}, which comprise a total of seven classes at the finest level $\mathcal{H}=2$: tubular adenoma (TA), tubulovillous adenoma (TVA), traditional serrated adenoma (TSA), hyperplastic polyp (HP), sessile serrated lesion (SSL), inflammatory polyp (IP), and lymphoid polyp (LP). These classes are organized into three coarser categories (\textit{i}.\textit{e}., $\mathcal{H}=1$), as illustrated in Fig.~\ref{fig1} (a). Among them, Adenoma is paramount due to its potential for malignant transformation. Serrated is of secondary importance, necessitating more detailed diagnosis into SSL and HP. Each WSI has a Subsite attribute indicating the specimen location: \texttt{Proximal} for near the oral cavity, \texttt{Distal} for near the anus, and \texttt{UNKNOWN} otherwise. We convert it into a three-dimensional one-hot vector $\mathbf{s}$. This clinical dataset, comprising WSIs each with a single symptom, was split into training, validation, and test sets at a 0.7:0.15:0.15 ratio. In addition, we incorporated 182 complex samples (see Table~\ref{tab:data}), which contain two or more symptoms, into the test set to assess the proposed method's performance in challenging real-world multi-symptom conditions.
% to examine the efficacy of the proposed method in a more challenging real-world scenario, where multiple symptoms are also found.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%% Chapter %%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Proposed Two-phase Framework}
Fig.~\ref{fig1} (b) shows the two-phase framework we propose in consideration of the \textit{intra}- and \textit{inter}-hierarchical relationships. A WSI $X_i$ is separated into $n(X_i)$ patches $\{x_{i,1},\cdots,x_{i,n(X_i)}\}$, and a pre-trained feature extractor outputs the corresponding instance bag $\mathcal{B}_i=\{z_{i,1},\cdots,z_{i,n(X_i)}\}$. The bag $\mathcal{B}_i$ is fed into the MIL model of each hierarchy level, $f_{\theta_1}(\cdot)$ and $f_{\theta_2}(\cdot)$. We denote the softmax outputs of $f_{\theta_1}(\cdot)$ and $f_{\theta_2}(\cdot)$ as $\hat{p}^{\mathcal{H}=1}\in\mathbb{R}^3$ and $\hat{p}^{\mathcal{H}=2}\in\mathbb{R}^7$, respectively. Observing that pathologists closely examine the acquisition site in the diagnosis of HP and SSL, we concatenate $\mathbf{s}$ with the input to feed into $f_{\theta_2}(\cdot)$ if ${\mathrm{argmax}}_c(\hat{p}^{\mathcal{H}=1})$ is Serrated. Consequently, each hierarchy MIL is trained in an end-to-end manner with the proposed framework using the following cross-entropy term:
\begin{equation}
    \mathcal{L}_{CE}=-\frac{1}{2}\sum_{h\in\{1,2\}}\sum_{c\in\mathcal{C}^{\mathcal{H}=h}}y^{\mathcal{H}=h}_c\log(\hat{p}^{\mathcal{H}=h}_c),
\end{equation}
where $\mathcal{C}^{\mathcal{H}=h}$ indicates the set of classes at hierarchy level $\mathcal{H}=h$ and $y^{\mathcal{H}=h}$ is the corresponding label vector.







%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%% Chapter %%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Inter-Hierarchy Alignment}
Although hierarchical MILs predict a different number of classes, they share the same input. That is, given that both MILs evaluate the same samples, the lower-level probability distribution, aggregated to match the higher-level classes, should ideally match the higher-level probability distribution. Motivated by this observation and inspired by~\cite{garg2022learning}, we enforce $\hat{p}^{\mathcal{H}=1}$ and $\hat{p}^{\mathcal{H}=2}$ to be aligned:
\begin{gather}
    \begin{aligned}
    \mathcal{L}_{IHA}=\text{JS}(\hat{p}^{\mathcal{H}=1}\,\|\,\dot{p}^{\mathcal{H}=1})=\frac{1}{2}\bigl(\text{KL}(\hat{p}^{\mathcal{H}=1}\,\|\,m)+\text{KL}(\dot{p}^{\mathcal{H}=1}\,\|\,m)\bigr),
    % \\
    % \text{, where }m=\frac{1}{2}\times(\hat{p}^{\mathcal{H}=1}+\dot{p}^{\mathcal{H}=1})
    % \text{, where }\dot{p}^{\mathcal{H}=1}=\text{softmax}
    \end{aligned}
\end{gather}
 where $m=\frac{1}{2}\times(\hat{p}^{\mathcal{H}=1}+\dot{p}^{\mathcal{H}=1})$. JS and KL denote Jensen-Shannon Divergence and Kullback-Leibler Divergence, respectively. We perform the following operation to obtain the average distribution $m\in\mathbb{R}^3$ and the aligned probability $\dot{p}^{\mathcal{H}=1}\in\mathbb{R}^3$ from $\mathcal{H}=2$ to $\mathcal{H}=1$:
% The Jensen-Shannon Divergence (JS) is a symmetric metric measuring the similarity between two probability distributions that leverages Kullback-Leibler Divergence (KL).
\begin{equation}
    \begin{aligned} 
        \dot{p}^{\mathcal{H}=1}_{c}=\sum_{c'\subset{c}}\hat{p}_{c'}^{\mathcal{H}=2}
        \text{, where }c\in{\mathcal{C}^{\mathcal{H}=1}}\text{ and }c'\in{\mathcal{C}^{\mathcal{H}=2}}.
        % \text{, where }o\in\{\hat{p}, L\},{\space}c'\in{\mathcal{C}^{\mathcal{H}=1}}\text{, and }c\in{\mathcal{C}^{\mathcal{H}=2}}
    \end{aligned}
\end{equation}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%% Chapter %%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Upper-Hierarchy-Dependent Probability}
Classifying the three classes of $\mathcal{H}=1$ is a simpler task than classifying the seven classes of $\mathcal{H}=2$. Hence, if $f_{\theta_1}(\cdot)$ and $f_{\theta_2}(\cdot)$ refer to each other, it is reasonable to do so from the coarse to the fine level. Therefore, we adjust the probabilities $\hat{p}^{\mathcal{H}=2}$ of $f_{\theta_2}(\cdot)$ so that they align with the predictions of $f_{\theta_1}(\cdot)$ while also allowing for some dependence:
% \begin{equation}
% \tilde{p}^{\mathcal{H}=2}_c = \begin{cases*}
%   \hat{p}_c^{\mathcal{H}=2}\times\hat{p}_{c'}^{\mathcal{H}=1} & , if $c\subset{c'}$\\
%   \hspace{22pt}\hat{p}^{\mathcal{H}=2}_c & , otherwise.
% \end{cases*}
% \end{equation}
\begin{equation}
    \begin{aligned}
    \mathcal{L}_{UHD}=\text{KL}(||\tilde{p}^{\mathcal{H}=2}||_1{\space\mid\mid\space}y^{\mathcal{H}=2})\mspace{75mu}
    \\
    \text{, where } 
    \tilde{p}^{\mathcal{H}=2}_c = 
    \begin{cases*}
    \hat{p}_c^{\mathcal{H}=2}\times\hat{p}_{c'}^{\mathcal{H}=1} & , if $c\subset{c'}$\\
    \hspace{22pt}\hat{p}^{\mathcal{H}=2}_c & , otherwise.
    \end{cases*}
    \end{aligned}
\end{equation}
% \begin{equation}
%      \mathcal{L}_{UHD}=\text{KL}(||\tilde{p}^{\mathcal{H}=2}||_1{\space\mid\mid\space}y^{\mathcal{H}=2})\text{, where } 
%     \tilde{p}^{\mathcal{H}=2}_c = 
%     \begin{cases*}
%     \hat{p}_c^{\mathcal{H}=2}\times\hat{p}_{c'}^{\mathcal{H}=1} & , if $c\subset{c'}$\\
%     \hspace{22pt}\hat{p}^{\mathcal{H}=2}_c & , otherwise.
%     \end{cases*}
% \end{equation}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%% Chapter %%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Implicit Feature Remix for Intra-Hierarchy}
\begin{figure*}[t!]
    \centering
    \begin{subfigure}{0.24\textwidth}
        \includegraphics[width=\linewidth]{figs/crop_prob_150}
        \label{fig:sub1}
    \end{subfigure}%
    \hfill% adjust spacing between images
    \begin{subfigure}{0.24\textwidth}
        \includegraphics[width=\linewidth]{figs/crop_prob_393}
        \label{fig:sub2}
    \end{subfigure}%
    \hfill% adjust spacing between images
    \begin{subfigure}{0.24\textwidth}
        \includegraphics[width=\linewidth]{figs/crop_prob_819}
        \label{fig:sub3}
    \end{subfigure}%
    \hfill% adjust spacing between images
    \begin{subfigure}{0.24\textwidth}
        \includegraphics[width=\linewidth]{figs/crop_prob_7545}
        \label{fig:sub4}
    \end{subfigure}
    \caption{
    Visualization of the probability of event $E$ not occurring (Equ.~\ref{equation:P(E)}). $P(E^c)$ is visualized for the median, average, and maximum counts of $n=|\mathcal{B}_i|$. In each plot, the blue percentage indicates the proportion of cases with a probability of 99\% or less out of feasible events.}
    \label{fig:combined}
\end{figure*}

We now turn to the second component of the class hierarchy: the intra-hierarchy. Given that a bag $\mathcal{B}$ is a collection of multiple instances, a random proportion $\beta\sim\text{Uniform}(0.4,0.8)$ of instances is sampled from $\mathcal{B}_i$ and mixed into the $1-\beta$ proportion of $\mathcal{B}_j$ to synthesize a bag $\mathcal{B}_{i+j}$, where $\mathcal{B}_i$ has higher priority than $\mathcal{B}_j$ within the same $\mathcal{H}$ (\textit{e}.\textit{g}., TA and LP). We perform feature remixing only when $\mathcal{B}_i$ has at least $150$ instances to create a distinguishable synthesized sample. Here, a valid concern is the possibility that the following event $E$ occurs: $E=\{$\textit{no crucial instance for diagnosis from} $\mathcal{B}_i$ \textit{is mixed into} $\mathcal{B}_{i+j}\}$. However, if we assume that $\mathcal{B}_i$ contains a proportion $\alpha\geq0.05$ of instances exhibiting symptoms, then event $E$ rarely occurs (\textit{i}.\textit{e}., the complementary event $E^c$ occurs with high probability), as shown in the following Equ.~\ref{equation:P(E)} and its visualization in Fig.~\ref{fig:combined}:
\begin{equation}\label{equation:P(E)}
    \begin{aligned}
        P(E^c)=
        \begin{cases*}
            1-\frac{{}_{(n-n\alpha)}C_{n\beta}}{{}_nC_{n\beta}} & , if $\alpha+\beta\leq1$
            \\
            \hspace{32pt} 1 & , if ${\alpha+\beta}>1$.
        \end{cases*}
    \end{aligned}
\end{equation}
% For clearer comprehension, we plot the form of Equ.~\ref{equation:P(E)} in Fig.~\ref{fig:combined}. This shows that the synthesized $\mathcal{B}_{i+j}$ contains instances that exhibit symptoms of a higher intra-hierarchy class. 

% The label of $\mathcal{B}_{i+j}$ can be $y_i^{\mathcal{H}}$ of the higher intra-hierarchy bag $\mathcal{B}_i$, however, to utilize the benefits of label softening~\cite{chen2020investigation}, $k$-th dimension of the smoothed label vector ${y}_{i+j}^{\mathcal{H}}$ is defined as:
Furthermore, to exploit the benefits of label softening~\cite{chen2020investigation}, we soften the label of $\mathcal{B}_{i+j}$, where the $k$-th dimension of the smoothed label vector ${y}_{i+j}^{\mathcal{H}}$ is defined as:
\begin{equation}
    \begin{aligned}
    % \tilde{y}_{k\in\{i,j\}}=\frac{\tilde{r}_k}{\tilde{r}_i+\tilde{r}_j}\mspace{150}
    {y}_{i+j,\space k}^{\mathcal{H}}=
    \begin{cases*}
        {\tilde{r}_k}/({\tilde{r}_i+\tilde{r}_j})\text{, if }k\in\{i,j\} \\
        {\mspace{41mu}0\mspace{42mu}\text{, otherwise.}}
    \end{cases*}
    \mspace{100mu}
    \\
    \text{ , where }
    \begin{cases*}
        \tilde{r}_i=r^{1/\tau}
        \\
        \tilde{r}_j=(1-r)^\tau
    \end{cases*}
    \text{ and }
    r=\frac{\beta\times|\mathcal{B}_i|}{\beta\times|\mathcal{B}_i|+(1-\beta)\times|\mathcal{B}_j|}
    \end{aligned}
\end{equation}
where $\tau$ is the smoothing factor. The condition for $\tilde{r}_i$ and $\tilde{r}_j$ is designed to make the class of $\mathcal{B}_i$ dominant in $y_{i+j}^{\mathcal{H}}$.

The proposed hierarchical MIL framework is trained with the total loss $\mathcal{L}=\mathcal{L}_{CE}+\mathcal{L}_{IHA}+\mathcal{L}_{UHD}$.
% $\mathcal{L}=\lambda_1\mathcal{L}_{CE}+\lambda_2\mathcal{L}_{IHA}+\lambda_3\mathcal{L}_{UHD}$, where each $\lambda_{1:3}$ are hyper-parameters.
% \begin{equation}
% \mathcal{L}=\lambda_1\mathcal{L}_{CE}+\lambda_2\mathcal{L}_{IHA}+\lambda_3\mathcal{L}_{UHD}
% \end{equation}