% Please add the following required packages to your document preamble:
% \usepackage{multirow}
% \usepackage{graphicx}
\begin{table}[t!]
\centering
\caption{
Performance comparison of alternative hierarchy-aware methods. Values in parentheses indicate the standard deviation. Best results are highlighted in bold.}
\label{tab:main}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{ccccccc}
\hline
\multicolumn{7}{c}{TransMIL~\cite{shao2021transmil}} \\ \hline
\multicolumn{1}{c|}{\multirow{2}{*}{Method}} &
  \multicolumn{3}{c|}{$\mathcal{H} = 1$} &
  \multicolumn{3}{c}{$\mathcal{H} = 2$} \\ \cline{2-7} 
\multicolumn{1}{c|}{} &
  Accuracy &
  AUROC &
  \multicolumn{1}{c|}{Recall} &
  Accuracy &
  AUROC &
  Recall \\ \hline
\multicolumn{1}{c|}{CE} &
  - &
  - &
  \multicolumn{1}{c|}{-} &
  0.866(0.008) &
  0.987(0.001) &
  0.933(0.020) \\
\multicolumn{1}{c|}{Weighted CE (5:3:2)} &
  - &
  - &
  \multicolumn{1}{c|}{-} &
  0.870(0.012) &
  0.987(0.001) &
  0.933(0.020) \\
\multicolumn{1}{c|}{Weighted CE (7:2:1)} &
  - &
  - &
  \multicolumn{1}{c|}{-} &
  0.851(0.011) &
  0.986(0.002) &
  0.916(0.027) \\
\multicolumn{1}{c|}{HXE ($\alpha=0.1$)~\cite{bertinetto2020making}} &
  0.916(0.009) &
  0.985(0.002) &
  \multicolumn{1}{c|}{0.908(0.013)} &
  0.876(0.017) &
  0.987(0.002) &
  0.948(0.008) \\
\multicolumn{1}{c|}{HXE ($\alpha=0.3$)~\cite{bertinetto2020making}} &
  0.912(0.010) &
  0.986(0.002) &
  \multicolumn{1}{c|}{\textbf{0.937(0.027)}} &
  0.875(0.007) &
  0.988(0.001) &
  0.941(0.017) \\
\multicolumn{1}{c|}{Soft Labels ($\beta=5$)~\cite{bertinetto2020making}} &
  0.908(0.009) &
  0.982(0.004) &
  \multicolumn{1}{c|}{0.914(0.012)} &
  0.882(0.012) &
  0.985(0.002) &
  0.953(0.012) \\
\multicolumn{1}{c|}{Soft Labels ($\beta=10$)~\cite{bertinetto2020making}} &
  0.918(0.014) &
  0.981(0.011) &
  \multicolumn{1}{c|}{0.929(0.022)} &
  0.868(0.009) &
  0.982(0.002) &
  0.933(0.017) \\
\multicolumn{1}{c|}{Chang et al.~\cite{chang2021your}} &
  0.920(0.009) &
  0.981(0.003) &
  \multicolumn{1}{c|}{0.924(0.009)} &
  0.872(0.007) &
  0.985(0.003) &
  0.941(0.013) \\
\multicolumn{1}{c|}{HAF~\cite{garg2022learning}} &
  0.865(0.045) &
  0.960(0.003) &
  \multicolumn{1}{c|}{0.910(0.042)} &
  0.869(0.015) &
  0.986(0.002) &
  0.940(0.022) \\
\multicolumn{1}{c|}{Ours} &
  \textbf{0.922(0.009)} &
  \textbf{0.989(0.001)} &
  \multicolumn{1}{c|}{0.927(0.029)} &
  \textbf{0.898(0.006)} &
  \textbf{0.990(0.002)} &
  \textbf{0.972(0.008)} \\ \hline
\multicolumn{7}{c}{DTFD-MIL~\cite{zhang2022dtfd}} \\ \hline
\multicolumn{1}{c|}{CE} &
  - &
  - &
  \multicolumn{1}{c|}{-} &
  0.860(0.014) &
  0.986(0.002) &
  0.918(0.016) \\
\multicolumn{1}{c|}{Weighted CE (5:3:2)} &
  - &
  - &
  \multicolumn{1}{c|}{-} &
  0.871(0.012) &
  0.987(0.001) &
  0.933(0.020) \\
\multicolumn{1}{c|}{Weighted CE (7:2:1)} &
  - &
  - &
  \multicolumn{1}{c|}{-} &
  0.850(0.019) &
  0.984(0.002) &
  0.896(0.014) \\
\multicolumn{1}{c|}{HXE ($\alpha=0.1$)~\cite{bertinetto2020making}} &
  0.922(0.023) &
  0.985(0.002) &
  \multicolumn{1}{c|}{0.911(0.053)} &
  0.875(0.003) &
  0.987(0.001) &
  0.934(0.012) \\
\multicolumn{1}{c|}{HXE ($\alpha=0.3$)~\cite{bertinetto2020making}} &
  0.926(0.009) &
  0.987(0.001) &
  \multicolumn{1}{c|}{0.924(0.020)} &
  0.863(0.003) &
  0.987(0.001) &
  0.924(0.006) \\
\multicolumn{1}{c|}{Soft Labels ($\beta=5$)~\cite{bertinetto2020making}} &
  0.923(0.011) &
  0.986(0.003) &
  \multicolumn{1}{c|}{0.925(0.015)} &
  0.874(0.007) &
  0.980(0.002) &
  0.927(0.010) \\
\multicolumn{1}{c|}{Soft Labels ($\beta=10$)~\cite{bertinetto2020making}} &
  0.915(0.010) &
  0.983(0.004) &
  \multicolumn{1}{c|}{0.916(0.022)} &
  0.866(0.008) &
  0.984(0.002) &
  0.930(0.018) \\
\multicolumn{1}{c|}{Chang et al.~\cite{chang2021your}} &
  0.941(0.008) &
  0.987(0.002) &
  \multicolumn{1}{c|}{0.944(0.027)} &
  0.879(0.016) &
  0.987(0.004) &
  0.947(0.016) \\
\multicolumn{1}{c|}{HAF~\cite{garg2022learning}} &
  0.894(0.023) &
  0.976(0.006) &
  \multicolumn{1}{c|}{0.865(0.042)} &
  0.862(0.012) &
  0.986(0.003) &
  0.916(0.020) \\
\multicolumn{1}{c|}{Ours} &
  \textbf{0.948(0.007)} &
  \textbf{0.991(0.001)} &
  \multicolumn{1}{c|}{\textbf{0.955(0.013)}} &
  \textbf{0.892(0.014)} &
  \textbf{0.991(0.001)} &
  \textbf{0.970(0.010)} \\ \hline
\end{tabular}%
}
\end{table}
\section{Experiment}
\subsection{Implementation Details}
\textbf{MIL Architectures} We utilize two state-of-the-art MIL architectures: TransMIL~\cite{shao2021transmil} and DTFD-MIL~\cite{zhang2022dtfd}. TransMIL optimizes computation while capturing more advanced inter-instance relationships. DTFD-MIL conducts double-tier distillation by resampling the input into pseudo-bags. For DTFD-MIL, we adopt Aggregated Feature Selection, which typically yields superior performance.
\\
\textbf{Training Settings} We set $\tau$ to 15. We selected patches of size $256\times256$ from the 1MPP of WSIs using the Otsu algorithm~\cite{otsu1975threshold}, then transformed them into individual instances with a pre-trained feature extractor~\cite{kang2023benchmarking}. We trained the model using the Adam optimizer~\cite{kingma2014adam} with betas of $(0.9, 0.999)$ and a learning rate of $1\times10^{-4}$. All experiments were carried out with fixed seeds on a single NVIDIA$^\circledR$ A6000 with 48GB of memory.
\\
\textbf{Comparison Methods} We set cross-entropy (CE) and weighted CE, which explicitly weights the importance of classes during training, as the baselines. Hierarchical CE (HXE) and soft labels~\cite{bertinetto2020making}, Chang et al.~\cite{chang2021your}, and hierarchy-aware feature (HAF)~\cite{garg2022learning} were selected as comparison methods that can handle a coarse-to-fine hierarchy. For fair comparisons, we repeated all experiments with the optimized hyper-parameters for each method, reporting the mean and standard deviation.
\\
\textbf{Evaluation Metrics} We evaluate the performance at each $\mathcal{H}$ with Accuracy, AUROC, and Recall scores. 
\begin{figure}[t!]
\centering
\includegraphics[width=\linewidth]{figs/ablation}
% \includegraphics[height=0.724528in, width=4.8in]{figs/ablation.pdf}
\caption{Ablation results on the core components of the proposed method using DTFD-MIL. Dashed lines indicate the fully equipped model's performance.} \label{fig:ablation}
\end{figure}
In particular, the recall measure used here is based on a binary metric, where the positive class is defined as Adenoma or any of its subclasses. 






\subsection{Quantitative Results}
% CE
Table~\ref{tab:main} presents the results of running various hierarchy-aware methods against the test data. Applying weighted CE with 5:3:2 weights improves the baseline in both MIL structures. However, it also shows that excessively high weights for certain classes can reverse this gain, making performance worse than the baseline, highlighting that methods requiring explicit parameterization necessitate domain expertise and considerable empirical search.

% HXE
Moreover, HXE~\cite{bertinetto2020making} had difficulty leveraging its advantages in the minimal-depth hierarchy because its conditional term operates with limited information, which hinders differentiating the importance of the classes.
% Soft Label
Consistent performance gains are observed across all comparison groups with the weak soft-labels~\cite{bertinetto2020making} (\textit{i.e.}, $\beta=5$).
% HAF
The HAF~\cite{garg2022learning} results reveal that hierarchical feature alignment is not critical for MIL. This phenomenon can be attributed to the representational disparity: linear networks exhibit limited interaction while attention-based MIL captures nuanced feature correlations, which are not adaptable across the hierarchies.
% Chang et al.
The method of Chang et al.~\cite{chang2021your} shows remarkable performance at $\mathcal{H}=1$ compared to other methods, owing to training that emphasizes coarser information during the initial epochs.
% Ours
% Finally, our proposed approach yielded superior performance compared to the comparison methods, without exception. Not only did it ensure high accuracy, but also demonstrated the lowest type II error rates, which is critical in the medical domain. The findings indicate that our approach provides a suitable solution for real-world clinical WSIs, considering their vertical inter-class hierarchy and diagnostic priority at the same level.
% 솔
Finally, our proposed approach yielded superior performance compared to other methods in every setting except recall at $\mathcal{H}=1$ with TransMIL. Not only did it ensure high accuracy, but it also showed the lowest type II error rates, which is critical in the medical domain. The findings indicate that our approach provides a suitable solution for real-world clinical WSIs, considering their vertical inter-class hierarchy and diagnostic priority at the same level.



\begin{figure}[t]
\centering
\includegraphics[width=0.95\linewidth]{figs/quality} 
\caption{Qualitative investigation into cases with mixed symptoms. We plot the $\hat{p}^{\mathcal{H}=2}$ of models trained with feature remixed samples against those of models trained without, shown with the corresponding WSIs.} \label{fig:quality}
\end{figure}



\subsection{Further Analysis}
% \noindent\textbf{Ablation Study}
% We have conducted an ablation study to understand the effect of each component of the proposed method on performance. The results of removing $\mathcal{L}_{IHA}$, $\mathcal{L}_{UHD}$, subsite $\mathbf{s}$, feature remix, and all of the aforementioned elements are plotted in Fig.~\ref{fig:ablation}. Removing each component caused a performance deduction and excluding all the aforementioned components showed the worst. The removal of the subsite at $\mathcal{H}=2$ also affected the precision at $\mathcal{H}=1$, implying that the probability alignment has an impact on the performance of $\mathcal{H}=1$. Among individual components, feature-reverse ablation resulted in the most substantial degradation, highlighting its importance in WSIs that contain multiple symptoms. Moreover, regarding the feature remix component, the concurrent increase in both recall and accuracy suggests that the improved recall is derived from precise diagnoses, not simply over-predicting positive cases.
% 솔 버전
\noindent\textbf{Ablation Study}
We conducted an ablation study to understand each component's effect on performance. The results of removing $\mathcal{L}_{IHA}$, $\mathcal{L}_{UHD}$, subsite $\mathbf{s}$, feature remix, and all components are shown in Fig.~\ref{fig:ablation}. Each removal caused performance degradation, and excluding all components gave the worst results. Removing the subsite at $\mathcal{H}=2$ also affected precision at $\mathcal{H}=1$, indicating that probability alignment impacts $\mathcal{H}=1$ performance. Feature remix ablation resulted in the most substantial degradation, highlighting its importance for WSIs with multiple symptoms. Moreover, regarding the feature remix component, the concurrent increase in both recall and accuracy suggests that the improved recall is derived from precise diagnoses, not simply from over-predicting positive cases.


\vspace{0.1cm}
% \noindent\textbf{Analysis on Intra-Hierarchy}
% To understand how the model performs against challenging cases, we have examined whether the model prioritizes the most urgent class when two or more cases are mixed within a WSI. The left tissue in Fig.~\ref{fig:quality} presents an HP with a substantial mixture of IP. Without training the intra-hierarchy, the MIL model predicts IP with greater confidence than HP, simply due to the symptom area. In contrast, a model that implicitly learns the diagnostic precedence of HP over IP predicts the case with the more serious diagnosis. The tissue on the right is a sample that pathologists diagnose as TA, but previous MIL approaches classified it as HP. Although a small area of TA is observed in the magnified view, it is expected to have a higher probability because it is more urgent than HP. Implicit feature remix prioritizes the class with higher precedence when multiple classes are present in an instance bag.

% 솔 버전
\noindent\textbf{Analysis on Intra-Hierarchy}
To understand how the model performs against challenging cases, we have examined whether the model prioritizes the most urgent class when two or more cases are mixed within a WSI. The left tissue in Fig.~\ref{fig:quality} presents an HP with substantial IP mixture. Without intra-hierarchy training, the MIL model predicts IP with greater confidence than HP, simply due to the symptom area. In contrast, a model that implicitly learns the diagnostic precedence of HP over IP predicts the case with the more serious diagnosis. The tissue on the right is a sample that pathologists diagnose as TA, but previous MIL approaches classified it as HP. Although a small area of TA is observed in the magnified view, it is expected to have a higher probability because it is more urgent than HP. Implicit feature remix prioritizes the class with higher precedence when multiple classes are present in an instance bag.