\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more

\usepackage{booktabs}
\usepackage{multirow}
\usepackage{float}
\usepackage[figuresleft]{rotating}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[PAGET]{PAGET: Hierarchical Multi-Teacher Knowledge Distillation for Comprehensive Tumor Microenvironment Segmentation}

% Authors
\midlauthor{\Name{Daisuke Komura\nametag{$^{1}$}} \Email{komura@m.u-tokyo.ac.jp}\\
\addr $^{1}$ Department of Preventive Medicine, Graduate School of Medicine, The University of Tokyo, Tokyo, Japan
\AND
\Name{Maki Takao\nametag{$^{2}$}} \Email{maki.t.espoir@gmail.com}\\
\addr $^{2}$ Department of Obstetrics and Gynecology, Graduate School, Tokyo Medical and Dental University, Tokyo, Japan
\AND
\Name{Mieko Ochi\nametag{$^{1}$}} \Email{noonecanpass.and@gmail.com}\\
\Name{Takumi Onoyama\nametag{$^{3}$}} \Email{t-onoyama@tottori-u.ac.jp}\\
\addr $^{3}$ Division of Gastroenterology and Nephrology, Department of Multidisciplinary Internal Medicine, School of
Medicine, Faculty of Medicine, Tottori University, Tottori, Japan
\AND
\Name{Hiroto Katoh\nametag{$^{1}$}} \Email{hkat-prm@m.u-tokyo.ac.jp}\\
\Name{Hiroyuki Abe\nametag{$^{4}$}} \Email{ABEH-PAT@h.u-tokyo.ac.jp}\\
\addr $^{4}$ Department of Pathology, Graduate School of Medicine, The University of Tokyo, Tokyo, Japan
\AND
\Name{Hiroyuki Sano\nametag{$^{5}$}} \Email{h.sano@biomy-tech.com}\\
\addr $^{5}$ Biomy Inc., Tokyo, Japan
\AND
\Name{Teppei Konishi\nametag{$^{5}$}} \Email{t.konishi@biomy-tech.com}\\
\Name{Toshio Kumasaka\nametag{$^{6}$}} \Email{tskumasaka@gmail.com}\\
\addr $^{6}$ Department of Pathology, Japanese Red Cross Medical Center, Tokyo, Japan
\AND
\Name{Tomoyuki Yokose\nametag{$^{7}$}} \Email{kamaboko8ty@gmail.com}\\
\addr $^{7}$ Department of Pathology, Kanagawa Cancer Center, Kanagawa, Japan
\AND
\Name{Yohei Miyagi\nametag{$^{8}$}} \Email{miyagi.0e82r@kanagawa-pho.jp}\\
\addr $^{8}$ Kanagawa Cancer Center Research Institute, Kanagawa, Japan
\AND
\Name{Tetsuo Ushiku\nametag{$^{4}$}} \Email{usikut@gmail.com}\\
\AND
\Name{Shumpei Ishikawa\nametag{$^{1,9}$}} \Email{ishum-prm@m.u-tokyo.ac.jp}\\
\addr $^{9}$ Division of Pathology, National Cancer Center Exploratory Oncology Research \& Clinical Trial Center, Chiba, Japan
}

\begin{document}

\maketitle

\begin{abstract}
Comprehensive characterization of the tumor microenvironment (TME) from H\&E-stained histopathology images remains challenging due to the diversity of cellular components and limitations of current segmentation methods. We present PAGET (Pathological image segmentation via AGgrEgated Teachers), a multi-teacher knowledge distillation framework that enables simultaneous segmentation of 13 TME components from a single efficient model. Our key insight is that teacher predictions should be aggregated following the biological taxonomy of cell types—from tissue-level context through major cell categories to specific subtypes—rather than simple voting. By training specialized teachers on immunohistochemical restaining data and distilling their aggregated knowledge, the resulting student model not only matches but consistently outperforms the teacher ensemble on external datasets. We provide two complementary variants: PAGET-S for rapid semantic segmentation and PAGET-H for detailed panoptic segmentation. Extensive evaluation across three external datasets demonstrates robust generalization. Our implementation is available at \href{https://github.com/dakomura/PAGET}{https://github.com/dakomura/PAGET}.
\end{abstract}

\begin{keywords}
Knowledge distillation, multi-teacher learning, tumor microenvironment, histopathology, semantic segmentation, panoptic segmentation
\end{keywords}

\section{Introduction}

The tumor microenvironment (TME) orchestrates cancer progression through complex interactions among epithelial cells, immune infiltrates, and stromal components~\cite{de2023evolving,binnewies2018understanding}. Quantifying these diverse cellular populations from routine H\&E slides could transform large-scale biomarker discovery and clinical decision support. However, achieving comprehensive TME characterization with both biological fidelity and computational efficiency remains an open challenge.

Current deep learning approaches for histopathology segmentation face three interconnected limitations. First, existing methods typically identify only 3–6 cell types, insufficient for comprehensive TME analysis. Second, these methods rely on morphology-based annotations by pathologists, which can be inaccurate for cells with atypical morphology where even experts cannot make definitive identifications. Immunohistochemical (IHC) restaining techniques address this annotation challenge by enabling protein-based ground truth~\cite{komura2023restaining}, but introduce a third limitation: separate models must be trained for each antibody-cell type pair, making comprehensive TME analysis computationally prohibitive for large-scale whole slide image (WSI) analysis.

Addressing these limitations requires annotation accuracy, comprehensive cellular coverage, and processing speed. We achieve all three through multi-teacher knowledge distillation—aggregating IHC-trained teachers into unified supervision and distilling their knowledge into a single efficient student. A key insight is that cellular classification follows an inherent biological hierarchy: leukocytes subdivide into lymphoid and myeloid lineages, then into specific subtypes~\cite{murphy2016janeway,diehl2016cell}. Rather than flat classification that ignores these relationships, our framework aggregates teacher predictions following this taxonomy.

Our contributions include: (1) a hierarchical aggregation strategy that combines teacher predictions following biological taxonomy; (2) the first unified framework for 13-class TME segmentation from H\&E slides, with semantic (PAGET-S) and panoptic (PAGET-H) variants; (3) demonstration that the student consistently outperforms the teacher ensemble on external datasets; and (4) extensive validation across three diverse datasets showing robust generalization.

\section{Related Work}

\subsection{Histopathology Image Segmentation}

Deep learning has revolutionized automated analysis of histopathology images. HoverNet~\cite{graham2019hover} achieves simultaneous nuclear segmentation and classification through multi-task learning, while HD-YOLO~\cite{rong2023deep} applies object detection paradigms to cell identification. Cerberus~\cite{graham2023one} demonstrates that a single model can perform multiple segmentation tasks. However, these methods typically focus on limited cell type repertoires (3--7 classes) and single-tissue contexts, resulting in incomplete TME characterization.

A fundamental challenge is annotation quality. Morphology-based annotations by pathologists can be unreliable for cells with atypical appearance~\cite{komura2023restaining}. IHC restaining addresses this by enabling protein-based ground truth, but requires training separate models per antibody, making comprehensive analysis computationally prohibitive. Our work bridges this gap by distilling multiple IHC-trained specialists into a single unified model.

\subsection{Knowledge Distillation}

Knowledge distillation~\cite{hinton2015distilling} transfers knowledge from complex teacher models to efficient student models and has been widely explored in computer vision. Beyond single-teacher settings, ensemble and multi-teacher distillation methods compress the predictions or features of several teachers into a single student~\citep{shen2019meal,yang2025multi,ye2024knowledge}. However, these approaches generally assume that all teachers solve the same task and share an identical label space, focusing on fusing complementary views of a single prediction problem.

Our setting differs fundamentally: teachers are specialized for distinct biological entities (e.g., epithelium vs. specific immune subtypes) with heterogeneous output spaces. We propose taxonomy-aware aggregation that respects the hierarchical relationships among cell types, a formulation unexplored in medical imaging to our knowledge.


\section{Method}

\subsection{Problem Formulation}

Given an H\&E-stained histopathology image, our goal is to produce a segmentation map covering 13 TME components plus background. We achieve this through multi-teacher knowledge distillation, aggregating specialized teacher predictions into unified supervision for a single efficient student model.

\subsection{Dataset Construction for Distillation}

Our distillation dataset comprises 59,443 H\&E images from tissue microarrays spanning 22 cancer types, originally collected for IHC restaining studies~\cite{komura2023restaining}. We apply the teacher ensemble (Section~\ref{teacher_model_ensemble}) to these images to generate pseudo-labels for 13 TME components.
The 13 classes span two annotation levels: tissue-level labels (epithelium, stroma, smooth muscle) and nucleus-level labels (epithelial cells, fibroblasts, endothelial cells, red blood cells, lymphocytes, plasma cells, myeloid cells, eosinophils, neutrophils, and mitotic cells). The dataset contains 8.7 billion labeled tissue pixels and 15.4 million predicted nuclei, with detailed statistics in Table~\ref{tab:training_stats} (Appendix~\ref{app:training_dataset}).


\subsection{Teacher Model Ensemble}
\label{teacher_model_ensemble}
Our teacher ensemble comprises specialized models operating at two scales (Figure~\ref{fig:overview}). At the tissue level, SegPath models~\cite{komura2023restaining} trained on IHC-restaining data provide pixel-wise segmentation of epithelium, smooth muscle, endothelium, and red blood cells. At the nucleus level, additional SegPath models identify leukocyte nuclei (CD45+), while dedicated granulocyte models trained on MPO and ECP staining distinguish neutrophil and eosinophil nuclei. MIDOG++~\cite{aubreville2023comprehensive} contributes mitotic figure detection.
HoverNet~\cite{graham2019hover} serves a dual role: it provides nucleus instance masks that define spatial boundaries for aggregation, and contributes baseline 6-class nucleus classification (trained on PanNuke~\cite{gamper2020pannuke}) that is refined by more specific teachers in our hierarchical aggregation. Architectural details for all models are provided in Appendix~\ref{app:teacher_details}.

\subsection{Teacher Prediction Aggregation}
\label{sec:agg}
Our aggregation strategy is motivated by the assumption that higher-level biological categories (e.g., tissue context or major cell lineages) are more robustly predicted from H\&E images than fine-grained subtypes. This assumption naturally arises in settings where teacher models are trained on heterogeneous markers and operate at different semantic resolutions, as in our IHC-trained ensemble with partially overlapping label spaces. Hierarchical aggregation exploits this asymmetry by constraining subtype predictions using more reliable coarse-level outputs, which is not possible in flat aggregation.

Specifically, PAGET aggregates teacher predictions following a biological taxonomy of cell types (Figure~\ref{fig:overview}). This hierarchy-aware approach allows fine-grained teachers to override coarse classifications when confident, while maintaining biological consistency.

\begin{figure}[H]
\floatconts
  {fig:overview}
  {\caption{PAGET framework overview. }}
  {\includegraphics[width=0.9\linewidth]{MIDL Fig.1.png}}
\end{figure}

\subsubsection{Three-Level Biological Hierarchy}

We develop a three-level hierarchical classification scheme that reflects the natural taxonomy of cellular components:

\textbf{Level 1 - Tissue Context:} Distinguishes major tissue types (smooth muscle, epithelial tissue), providing essential structural context via SegPath tissue model.

\textbf{Level 2 - Major Cell Categories:} Within tissue regions, identifies broad cell types—leukocytes, endothelial cells, red blood cells, and epithelial cells—using SegPath nucleus models.

\textbf{Level 3 - Leukocyte Subtypes:} Subdivides leukocytes into lymphocytes, plasma cells, eosinophils, neutrophils and other myeloid cells.

This hierarchy progresses from morphologically stable features to specific phenotypes that require specialized IHC-trained teachers.

\subsubsection{Aggregation Algorithm}

The aggregation proceeds in five steps. Let $\Omega \subset \mathbb{Z}^2$ denote the image domain and $S(p)$ the final label at pixel $p \in \Omega$.

\textbf{Step 1 (Spatial initialization):} HoverNet extracts nucleus instances $\mathcal{N} = \{\mathcal{N}_i \subset \Omega \mid i=1,\ldots,I\}$, and Otsu thresholding on Gaussian-smoothed images identifies background pixels $\mathcal{B} \subset \Omega$. The remaining pixels form the tissue region $\Omega' = \Omega \setminus \bigl(\mathcal{B} \cup \bigcup_{i=1}^{I} \mathcal{N}_i\bigr)$.

\textbf{Step 2 (Tissue classification):} For each pixel $p \in \Omega'$, we assign a tissue label based on SegPath tissue model logits $\ell^{\text{tis}}_c(p)$ for $c \in \{\text{smooth muscle}, \text{epithelium}\}$:
\begin{equation}
t(p) =
\begin{cases}
\arg\max_{c} \ell^{\text{tis}}_c(p), & \max_{c} \ell^{\text{tis}}_c(p) > 0, \\
\text{stroma}, & \text{otherwise}.
\end{cases}
\end{equation}

\textbf{Step 3 (Hierarchical nucleus classification):} Classification proceeds in two stages. First, at the pixel level, teacher predictions are aggregated following the three-level hierarchy (Section~3.4.1): predictions from deeper levels override coarser classifications when teachers at those levels produce positive outputs. For example, a pixel predicted as leukocyte (Level 2) is overridden by lymphocyte (Level 3) if the lymphocyte teacher fires.

Second, at the nucleus level, we aggregate pixel predictions within each nucleus $\mathcal{N}_i$. Let $s(p)$ denote the pixel-level label obtained in the first stage (with class $0$ denoting background), and let $\text{count}_c(i) = |\{p \in \mathcal{N}_i : s(p) = c\}|$ denote the pixel count for class $c$. We compute the most frequent non-background class:
\[
c^*(i) = \arg\max_{c \neq 0} \text{count}_c(i).
\]
When $c^*(i) = \text{LEU}$ (leukocyte, Level 2), we further refine to Level 3 subtypes:
\[
y(i) = 
\begin{cases}
\displaystyle\arg\max_{s \in \mathcal{S}_{\text{LEU}}} \text{count}_s(i), & \text{if } \exists\, s \in \mathcal{S}_{\text{LEU}}: \text{count}_s(i) > 0, \\
\text{LEU}, & \text{otherwise},
\end{cases}
\]
where $\mathcal{S}_{\text{LEU}} = \{\text{lymphocyte, plasma cell, myeloid cell, eosinophil, neutrophil}\}$. For all other classes, $y(i) = c^*(i)$.

\textbf{Step 4 (Label completion):} This step completes the label space by addressing cell types that no specialized teacher can directly predict, using two rules: (a) unclassified nuclei within epithelial tissue regions are labeled as epithelial cells, as they predominantly represent epithelial nuclei; (b) nuclei classified only as ``connective'' by HoverNet are assigned as fibroblasts, since endothelial cells have already been identified by SegPath and fibroblasts constitute the remaining stromal cell population.

\textbf{Step 5 (Mitosis integration):} MIDOG++ detections are converted to circular regions of interest (radius 30 pixels), and overlapping nuclei are reclassified as mitotic figures.

This hierarchy-aware aggregation yields the final segmentation $S$ by combining tissue- and nucleus-level decisions in a biologically consistent manner.

\subsection{Student Model Architecture}

We employ SegFormer~\citep{xie2021segformer} with MiT-B5 encoder (pretrained on ImageNet) for the student model. Input images are processed at 20$\times$ magnification (384$\times$384 pixels); many clinical sites operate at 20$\times$ due to storage and scanning time constraints, making this resolution practically relevant.

The student directly predicts pixel-wise labels for all 14 classes (13 TME components plus background). We provide two inference variants. \textbf{PAGET-S} (Semantic) outputs these pixel-wise predictions directly, optimizing for speed. \textbf{PAGET-H} (Panoptic) combines PAGET-S predictions with HoverNet nucleus instance masks, assigning each nucleus the majority class among its constituent pixels. Both variants share the same trained SegFormer weights; PAGET-H adds HoverNet inference time to provide instance-level output.

\subsection{Training Details}

We employ the AdamW optimizer with a learning rate of $6\times10^{-5}$, betas $(0.9, 0.999)$, and weight decay 0.01. The learning rate schedule combines linear warmup from 0 to 1,500 iterations followed by polynomial decay from 1,500 to 48,000 iterations. Training uses the standard cross-entropy loss against aggregated teacher labels with batch size 4. Data augmentation includes random resizing (0.85--1.15), cropping (384$\times$384), horizontal/vertical flipping ($p=0.5$), random blur, gamma adjustment, and photometric distortions. Training was conducted on 8$\times$ NVIDIA H100 80GB GPUs.

\section{Experimental Setup}

\subsection{Datasets}

For internal testing, we held out 3,133 images from the training set, covering all 22 cancer types. Here, aggregated teacher predictions serve as ground truth, enabling evaluation of how well the student reproduces teacher supervision.

For external validation, we employed three datasets with human annotations as ground truth. PanopTILs~\citep{liu2024panoptic} provides breast cancer samples with expert annotations. Lizard~\citep{graham2021lizard} contains colorectal cancer images from four subsets (DigestPath, GlaS, CoNSeP, CRAG); we exclude PanNuke to avoid data leakage, as HoverNet in our teacher ensemble was trained on this dataset. KCCRC is a multi-institutional cohort from Japanese Red Cross Medical Center and Kanagawa Cancer Center, containing colon and gastric samples with pathologist annotations for immune cell subtypes and endothelial cells.

This study was conducted in accordance with the Declaration of Helsinki and approved by the Institutional Review Boards of The University of Tokyo (approval numbers 2381 and 2019158NI), Japanese Red Cross Medical Center (approval number 1414), and Kanagawa Cancer Center (approval number 2020-118).


\subsection{Baselines}

We compare against publicly available representative methods. HD-YOLO~\citep{rong2023deep} applies object detection for cell identification (lung and breast variants). HoverNet~\citep{graham2019hover} provides nuclear instance segmentation and classification (PanNuke and MoNuSAC~\citep{verma2020monusac} variants). Cerberus~\citep{graham2023one} performs multi-task segmentation; it is excluded from Lizard evaluation due to training set overlap. We also compare against our teacher ensemble to evaluate whether the distilled student can match or exceed teacher performance.

\subsection{Evaluation Metrics}

Due to varying class definitions across datasets and models, we designed hierarchical class mapping in consultation with pathologists (Appendix~\ref{app:class_mapping}). For tissue-level segmentation, we report Dice score. For nucleus-level classification, we report Matthews Correlation Coefficient (MCC)~\citep{chicco2021matthews} computed per nucleus instance for each class separately. MCC ranges from $-1$ (complete disagreement) to $+1$ (perfect agreement), with 0 indicating random prediction; it provides balanced evaluation for imbalanced classes common in histopathology.


\section{Results and Discussion}

\subsection{Internal Validation}

On internal test data, aggregated teacher predictions serve as ground truth, enabling evaluation of how faithfully the student reproduces teacher supervision. Table~\ref{tab:internal_performance} (Appendix~\ref{app:internal}) summarizes the results.

Both PAGET-S and PAGET-H achieve high fidelity to teacher labels. For tissue-level segmentation, both variants perform comparably, with IoU scores exceeding 0.70 for stroma and 0.80 for epithelium and smooth muscle. For nucleus-level segmentation, PAGET-H consistently outperforms PAGET-S across all classes except for endothelial cells. For example, epithelial cell nucleus IoU improves from 0.760 to 0.853, and lymphocyte from 0.646 to 0.753. This gain likely stems from majority voting within each nucleus instance, which reduces pixel-level noise in semantic predictions and yields more stable class assignments.



\subsection{Ablation Study}

To validate our hierarchical aggregation design, we compared two strategies using PanopTILs and KCCRC, which provide ground truth annotations compatible with the 40$\times$ resolution at which our IHC-restaining teacher models operate. We evaluated: (1) \textbf{flat aggregation}, where the class with maximum logit across all 9 directly-predicted cell types is selected, and (2) \textbf{hierarchical aggregation}, using our proposed biological hierarchy, including both partial hierarchies that exclude deeper levels and the full hierarchy. In flat aggregation, all teacher predictions compete directly in a single label space, whereas hierarchical aggregation applies predictions sequentially following biological hierarchy, such that coarse-level decisions constrain downstream subtype classification (Figure~\ref{fig:hier_vs_flat}). The 9 cell types exclude stroma, epithelial cell nuclei, fibroblasts, and mitotic cells, which are assigned through refinement rules rather than direct SegPath prediction (Section \ref{sec:agg}). Both strategies use identical teacher models; only the aggregation method differs.

Table~\ref{tab:ablation} summarizes results. Hierarchical aggregation and its intermediate variants with reduced hierarchy depth consistently outperform flat aggregation for cell-level classification, with substantial relative improvements for lymphocytes (+24.5\% in PanopTILs, +25.7\% in KCCRC) and eosinophils (+81.9\% in KCCRC). Performance generally improves as deeper hierarchy levels are incorporated, with gains most evident for cell types at deeper hierarchy levels, where coarse-level context helps disambiguate fine-grained subtypes. Tissue-level segmentation shows comparable performance between strategies, as expected since tissue classification occurs at the first hierarchy level without subsequent refinement.

\begin{table}[H]
\centering
\caption{Ablation study: flat vs.\ hierarchical aggregation with different hierarchy depths (Dice score). Best in \textbf{bold}.}
\label{tab:ablation}
\small
\begin{tabular}{llccccc}
\toprule
\textbf{Dataset} & \textbf{Strategy} & \textbf{Epi} & \textbf{Blood} & \textbf{Lym} & \textbf{Pls} & \textbf{Leu} \\
\midrule
\multirow{3}{*}{PanopTILs} & Flat & \textbf{0.736} & \textbf{0.372} & 0.094 & 0.073 & \textbf{0.372} \\
 & 2 layers (merge level 1 and 2) & \textbf{0.736} & \textbf{0.372} & \textbf{0.117} & \textbf{0.118} & 0.369 \\
 & Full hierarchy & 0.735 & 0.367 & \textbf{0.117} & \textbf{0.118} & 0.370 \\
\midrule
 & & \textbf{Lym} & \textbf{Pls} & \textbf{Mye} & \textbf{Neu} & \textbf{Eos} \\
\midrule
\multirow{3}{*}{KCCRC} & Flat & 0.358 & 0.017 & 0.223 & 0.205 & 0.149 \\
 & 2 layers (merge level 1 and 2) & \textbf{0.450} & \textbf{0.020} & \textbf{0.235} & 0.161 & 0.233 \\
 & Full hierarchy & \textbf{0.450} & \textbf{0.020} & \textbf{0.235} & \textbf{0.237} & \textbf{0.271} \\
\bottomrule
\end{tabular}
\end{table}

\subsection{External Validation}

Figures~\ref{fig:qualitative} and~\ref{fig:qualitative_enlarged} (Appendix~\ref{app:external}) show representative qualitative results on PanopTILs, demonstrating that our proposed variants achieve high-quality results, with PAGET-H providing particularly accurate nuclear boundaries.

Figures~\ref{fig:dataset_comparison} and \ref{fig:celltype_comparison} summarize quantitative performance across datasets and cell types. Complete numerical results are provided in Table~\ref{tab:unified_results} (Appendix~\ref{app:external}). Across most cohorts, PAGET-S and PAGET-H consistently outperform both the full teacher pipeline (including HoverNet and refinement rules) and conventional nucleus segmentation models. On KCCRC, collected from Japanese institutions as was our training data, student and teacher performance are comparable. On datasets from different countries, however, the distilled student frequently exceeds teacher performance, suggesting that the combination of hierarchical aggregation and data augmentation provides effective regularization against distribution shift.

Cell-type-wise comparisons show that PAGET achieves competitive or superior performance across evaluated classes. While baseline models typically support only a subset of cell types, PAGET provides predictions for all 13 TME components from a single model. Notably, higher-level categories such as leukocytes consistently achieve higher accuracy than fine-grained subtypes (Table~\ref{tab:unified_results}), empirically supporting the assumption underlying our hierarchical aggregation design (Section~\ref{sec:agg}).

\begin{figure}[htbp]
\floatconts
  {fig:qualitative}
  {\caption{Representative segmentation results on PanopTILs dataset comparing PAGET variants with baseline methods. }}
  {\includegraphics[width=1.0\linewidth]{figure2_small.pdf}}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=.80\linewidth]{datasetwise_cleaned_NO_ABBR.png}
    \caption{Dataset-wise comparison of PAGET versus baseline models. Bars show mean performance differences (PAGET - baseline), measured by Dice for tissue, MCC for nuclei, computed separately for each dataset. Baselines as indicated on x-axis. Positive values indicate better performance for PAGET. The number above each bar denotes the number of tissue or nucleus categories available for that comparison.}
    \label{fig:dataset_comparison}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=.80\linewidth]{celltype_all_model_comparison_2x3_noSD.png}
    \caption{Cell-type-wise comparison of PAGET versus baseline models. Bars show mean performance differences (PAGET - baseline), measured by Dice for tissue, MCC for nuclei, averaged over all datasets in which the corresponding cell type is available. Baselines as indicated on x-axis. Positive values indicate better performance for PAGET. The number above each bar denotes the number of datasets available for that comparison.}
    \label{fig:celltype_comparison}
\end{figure}

To evaluate zero-shot generalization capability, we applied PAGET to adenoid cystic carcinoma, a cancer type completely absent from the 22 cancer types in our training dataset (Figure~\ref{fig:unseen_cancer}). We analyzed one case and had a board-certified pathologist review the segmentation results for obvious errors and missed detections. Notably, tumor cells—which we hypothesized would be the most challenging to generalize given their type-specific morphological characteristics—showed no obvious misclassifications. While some missed detections were observed, particularly in regions where even the reviewing pathologist found cell type determination ambiguous (marked separately), no cell type showed systematic failure compared to the performance on cancer types included in training. Although this evaluation is limited to a single unseen cancer type and thus definitive conclusions cannot be drawn, these preliminary results suggest that PAGET may have learned generalizable morphological features for cell type recognition that extend beyond the specific cancer types in the training data.

\subsection{Conflict Cases}
To understand failure cases and how hierarchical aggregation handles conflicting predictions, we analyzed pixels where multiple teachers predicted $>$90\% probability simultaneously on the KCCRC dataset (Figure~\ref{fig:conflict}). At the tissue level, smooth muscle and epithelium showed moderate overlap (6–11\% of high-probability pixels). At the leukocyte subtype level, neutrophil-eosinophil conflicts were frequent (34–42\% of high-probability pixels), reflecting the morphological similarity between these granulocytes in H\&E. Lymphocyte-plasma cell conflicts were less common (3–12\%). When conflicts occur, they typically involve cells that are morphologically ambiguous (Figure~\ref{fig:conflict_example}). Notably, 67–68\% of granulocyte high-probability pixels also showed high myeloid probability. Existing models also show limited accuracy for these cell subtypes (Table~\ref{tab:unified_results}), suggesting that this is an inherently challenging task from H\&E images. Despite these conflicts, PAGET achieves competitive or superior performance compared to existing models across evaluated cell types.



\begin{figure}
    \centering
    \includegraphics[width=.95\linewidth]{conflict_heatmap_3levels_Greens.png}
    \caption{Within-level conflict rates between teacher models. Each cell shows the percentage of high prediction probability pixels ($>$90\%) for the row cell type that also have high probability for the column cell type. }
    \label{fig:conflict}
\end{figure}


\subsection{Computational Efficiency}

PAGET-S processes a 384$\times$384 tile in 4 ms on a single NVIDIA V100 GPU, achieving approximately 207$\times$ speedup compared to the teacher ensemble alone and 301$\times$ speedup compared to the full teacher pipeline with HoverNet (Table~\ref{tab:timing}, Appendix~\ref{app:time}). For a typical WSI (100k$\times$100k pixels at 40$\times$), PAGET-S completes processing in approximately 1 minute by directly accessing the 20$\times$ layer from the pyramidal image structure, versus over 6 hours for the full teacher pipeline. PAGET-H, which combines PAGET-S with HoverNet for panoptic segmentation, processes the same WSI in approximately 2 hours, achieving 3.2$\times$ speedup.

In terms of accuracy, PAGET-H generally achieves slightly higher performance than PAGET-S, with consistent improvements observed for connective tissue and leukocyte classification (Figure~\ref{fig:celltype_comparison}). PAGET-S is thus suited for large-scale WSI screening where speed is critical, while PAGET-H is recommended when precise nucleus boundaries are required or when accurate identification of stromal and immune cells is prioritized.

\section{Conclusion}

We presented PAGET, a hierarchical multi-teacher knowledge distillation framework enabling simultaneous segmentation of 13 TME components from H\&E slides. Our ablation study validates that aggregating predictions following biological taxonomy improves classification over flat aggregation. The distilled student frequently outperforms the teacher ensemble on external datasets, suggesting effective regularization against distribution shift. PAGET-S and PAGET-H provide a unified solution bridging annotation accuracy and computational efficiency for comprehensive TME characterization.


PAGET relies on the assumption that higher-level tissue and lineage predictions are more reliable than fine-grained subtype predictions. 
In scenarios where tissue segmentation is unreliable (e.g., under severe domain shift) or when cell subtypes lack clear morphological or immunohistochemical separability in H\&E, errors or biases at higher hierarchy levels may propagate to downstream subtype predictions, potentially suppressing correct predictions or amplifying noise introduced during label aggregation.

Beyond the specific teacher models used in this study, the proposed framework is architecture-agnostic and provides a flexible foundation for incorporating future advances, such as foundation-model-based components, as teachers or student backbones~\citep{pachitariu2025cellpose_sam,guo2025evaluating_cell_foundation,hörst2025cellvitenergyefficientadaptivecell}.
\clearpage

\midlacknowledgments{%
This work was supported by AMED Practical Research for Innovative Cancer Control grants JP 24ck0106873 and JP 24ck0106904, and JSPS KAKENHI Grant-in-Aid for Scientific Research (B) grant number 21H03836. We thank the pathology teams at collaborating institutions for dataset curation and validation support. We thank Biomy Inc. for developing analysis tools.
}

\bibliography{midl26_24}

\clearpage
\appendix
\section{Training Dataset Statistics}
\label{app:training_dataset}

Table~\ref{tab:training_stats} summarizes the training dataset statistics. Table~\ref{tab:cancer_types_full} lists all 22 cancer types with image counts.


\begin{table}[htbp]
\floatconts
  {tab:training_stats}%
  {\caption{Comprehensive training dataset statistics}}%
  {\small
  \begin{tabular}{lrlr}
  \toprule
  \textbf{Nucleus Type} & \textbf{Count} & \textbf{Tissue Type} & \textbf{Pixels} \\
  \midrule
  Epithelial cell & 8.4M & Epithelium & 4.0B \\
  Fibroblast & 2.9M & Stroma & 2.5B \\
  Lymphocyte & 1.2M & Smooth muscle & 2.1B \\
  Plasma cell & 357K & Red blood cell & 130M \\
  Myeloid cell & 461K & & \\
  Eosinophil & 37K & & \\
  Neutrophil & 291K & & \\
  Endothelial cell & 484K & & \\
  Mitotic cell & 2.7K & & \\
  \midrule
  \textbf{Total nuclei} & \textbf{15.4M} & \textbf{Total pixels} & \textbf{8.7B} \\
  \bottomrule
  \end{tabular}}
\end{table}


\begin{table}[htbp]
\floatconts
  {tab:cancer_types_full}%
  {\caption{Training dataset composition by cancer type}}%
  {\begin{tabular}{lc}
  \toprule
  \textbf{Cancer Type} & \textbf{Images} \\
  \midrule
  Endometrial cancer & 3,347 \\
  Breast cancer & 3,264 \\
  Bladder cancer & 2,884 \\
  Urothelial tumor & 2,873 \\
  Prostate cancer & 2,790 \\
  Kidney tumor & 2,783 \\
  Gastric cancer & 2,679 \\
  Extrahepatic bile duct cancer & 2,517 \\
  Colorectal cancer & 2,290 \\
  Triple-negative breast cancer & 2,046 \\
  Esophagogastric junction cancer & 2,035 \\
  Gastric cancer lymph node metastasis & 1,911 \\
  Lung squamous cell carcinoma & 1,852 \\
  Benign breast lesion & 1,831 \\
  Pancreatic cancer & 1,785 \\
  Hypopharyngeal and laryngeal cancer & 1,747 \\
  Hepatocellular carcinoma & 1,723 \\
  Cervical squamous cell carcinoma & 1,709 \\
  Pancreatic neuroendocrine tumor & 1,675 \\
  Liver cancer & 1,220 \\
  Thymoma & 667 \\
  Ovarian mucinous cystic neoplasm & 575 \\
  \midrule
  \textbf{Total} & \textbf{59,443} \\
  \bottomrule
  \end{tabular}}
\end{table}


\section{Teacher Model Architectural Details}
\label{app:teacher_details}

\paragraph{SegPath Models.}
Models for epithelium, smooth muscle, endothelium, red blood cells, and leukocytes follow the architectures and training procedures described in \citet{komura2023restaining}. All models take inputs at 40$\times$ magnification.

\paragraph{Neutrophil Model.}
U-Net with EfficientNet-B1 encoder, noisy student pretraining. Input: 492$\times$492 at 40$\times$. Trained on MPO antibody staining. We utilized the Dice loss function, achieving a validation Dice score of 0.411.

\paragraph{Eosinophil Model.}
DeepLabV3+ with ResNet34 encoder, pretrained on ImageNet. Input: 492$\times$492 at 40$\times$. Trained on ECP antibody staining. This model also used Dice loss, resulting in a validation Dice score of 0.299.

\paragraph{MIDOG++.}
RetinaNet-based detector trained on the MIDOG++ challenge dataset~\citep{aubreville2023comprehensive}. Multi-scanner training for robust mitosis detection.

\paragraph{HoverNet.}
Standard architecture from \citet{graham2019hover}, trained on the PanNuke dataset~\citep{gamper2020pannuke}. Provides 6-class nucleus classification and instance segmentation.


\section{Internal Validation}
\label{app:internal}

Table~\ref{tab:internal_performance} summarizes the internal test performance.


\begin{table}[htbp]
\floatconts
  {tab:internal_performance}
  {\caption{Internal test performance (IoU). Best results in \textbf{bold}.}}
  {\small
  \begin{tabular}{lcc | lcc}
  \toprule
  \textbf{Tissue Class} & \textbf{PAGET-S} & \textbf{PAGET-H} &
  \textbf{Nucleus Class} & \textbf{PAGET-S} & \textbf{PAGET-H} \\ 
  \midrule
  Background         & 0.847 & \textbf{0.848} & Epithelial cell & 0.760 & \textbf{0.853} \\
  Stroma             & 0.709 & \textbf{0.715} & Fibroblast               & 0.613 & \textbf{0.649} \\
  Smooth muscle      & \textbf{0.822} & 0.814 & Mitotic cell             & 0.302 & \textbf{0.382} \\
  Epithelium         & 0.772 & \textbf{0.809} & Lymphocyte               & 0.646 & \textbf{0.753} \\
  Red blood cell     & \textbf{0.805} & 0.783 & Plasma cell              & 0.556 & \textbf{0.612} \\
                     &       &               & Myeloid cell             & 0.399 & \textbf{0.450} \\
                     &       &               & Eosinophil               & 0.440 & \textbf{0.525} \\
                     &       &               & Neutrophil               & 0.538 & \textbf{0.605} \\
                     &       &               & Endothelial cell               & \textbf{0.585} & 0.532 \\
  \bottomrule
  \end{tabular}
  }
\end{table}
\section{Ablation Study}
\label{app:ablation}
Figure~\ref{fig:hier_vs_flat} illustrates the flat and hierarchical aggregation used in the ablation study.

\begin{figure}
    \centering
    \includegraphics[width=.95\linewidth]{Comparison of Flat and Hierarchical Aggregation Methods.png}
    \caption{Flat and hierarchical aggregation used in the ablation study.}
    \label{fig:hier_vs_flat}
\end{figure}



\section{Class Mapping for Evaluation}
\label{app:class_mapping}

Different datasets and models use varying class definitions. To enable fair comparison, we designed hierarchical class mapping in consultation with pathologists. Table~\ref{tab:class_mapping_full} shows the correspondence used for evaluation. For example, PAGET's lymphocyte, plasma cell, myeloid cell, eosinophil, and neutrophil predictions are combined when comparing against ground truth labeled simply as ``leukocyte.''

\begin{sidewaystable}
\centering
\caption{Complete class correspondence across evaluation datasets}
\label{tab:class_mapping_full}
{\scriptsize
\begin{tabular}{|l|l|p{2.5cm}|p{1.2cm}|p{1.4cm}|p{1.4cm}|p{1.3cm}|p{2.2cm}|}
\hline
\textbf{Dataset} & \textbf{Eval Class} & \textbf{Class in Dataset} & \textbf{PAGET} & \textbf{HoverNet (PanNuke)} & \textbf{HoverNet (MoNuSAC)} & \textbf{HD-YOLO} & \textbf{Cerberus} \\
\hline
\hline
\multirow{6}{*}{PanopTILs} & Epithelial tissue & cancerous epithelium, normal epithelium, cancer nucleus, normal epithelial nucleus & epi, epi\_n & -- & -- & -- & -- \\
\cline{2-8}
 & Epithelial cell & cancer nucleus, normal epithelial nucleus & epi\_n & neopla, no-neo & epi & tumor & epithelial \\
\cline{2-8}
 & Connective tissue cell & stromal nucleus, large stromal nucleus & endo, fib & connec & -- & stromal & connective tissue cell \\
\cline{2-8}
 & Leukocyte & lymphocyte nucleus, plasma cell / large TIL nucleus & lym, pls, mye, eos, neu & inflam & lym, macro, neut & sTILs, macrophage & neutrophil, lymphocyte, plasma cell, eosinophil \\
\cline{2-8}
 & Lymphocyte & lymphocyte nucleus & lym & -- & lym & sTILs & lymphocyte \\
\cline{2-8}
 & Plasma cell & plasma cell / large TIL nucleus & pls & -- & -- & -- & plasma cell \\
\hline
\hline
\multirow{7}{*}{Lizard} & Epithelial cell & epithelial & epi\_n & neopla, no-neo & epi & tumor & -- \\
\cline{2-8}
 & Connective tissue cell & connective & endo, fib & connec & -- & stromal & -- \\
\cline{2-8}
 & Leukocyte & lymphocyte, plasma, neutrophil, eosinophil & lym, pls, mye, eos, neu & inflam & lym, macro, neut & sTILs, macrophage & -- \\
\cline{2-8}
 & Lymphocyte & lymphocyte & lym & -- & lym & sTILs & -- \\
\cline{2-8}
 & Plasma cell & plasma & pls & -- & -- & -- & -- \\
\cline{2-8}
 & Eosinophil & eosinophil & eos & -- & -- & -- & -- \\
\cline{2-8}
 & Neutrophil & neutrophil & neu & -- & neutrophil & -- & -- \\
\hline
\hline
\multirow{8}{*}{KCCRC} & Endothelial cell & endothelial cell & endo & -- & -- & -- & -- \\
\cline{2-8}
 & Leukocyte & lymphocyte, plasma cell, myeloid cell, eosinophil, neutrophil & lym, pls, mye, eos, neu & inflam & lym, macro, neut & sTILs, macrophage & neutrophil, lymphocyte, plasma cell, eosinophil \\
\cline{2-8}
 & Lymphocyte & lymphocyte & lym & -- & lym & sTILs & lymphocyte \\
\cline{2-8}
 & Plasma cell & plasma cell & pls & -- & -- & -- & plasma cell \\
\cline{2-8}
 & Myeloid cell & myeloid cell, eosinophil, neutrophil & mye & -- & -- & -- & -- \\
\cline{2-8}
 & Eosinophil & eosinophil & eos & -- & -- & -- & eosinophil \\
\cline{2-8}
 & Neutrophil & neutrophil & neu & -- & neutrophil & -- & neutrophil \\
\cline{2-8}
 & Mitotic cell & mitotic cell & mit & -- & -- & -- & -- \\
\hline
\end{tabular}}
\end{sidewaystable}


\section{External Validation Results}
\label{app:external}

\begin{figure}[htbp]
\floatconts
  {fig:qualitative_enlarged}
  {\caption{Zoomed-in views of the central regions shown in Figure~\ref{fig:qualitative}.}}
  {\includegraphics[width=1.0\linewidth]{figure2_enlarged.png}}
\end{figure}

Table~\ref{tab:unified_results} reports comprehensive external validation results across all datasets. 

\begin{sidewaystable}[p]
\centering
\caption{Comprehensive external validation results across all datasets. Dice scores for tissue-level segmentation, MCC for nucleus-level classification. Best performance in \textbf{bold}, second-best \underline{underlined}. -- indicates unsupported class, n/a indicates ground truth data unavailable.}
\label{tab:unified_results}
\tiny
\begin{tabular}{llcccccccccccc}
\toprule
\multirow{2}{*}{\textbf{Dataset}} & \multirow{2}{*}{\textbf{Model}} & \multicolumn{3}{c}{\textbf{Tissue (Dice)}} & \multicolumn{9}{c}{\textbf{Nucleus (MCC)}} \\
\cmidrule(lr){3-5} \cmidrule(lr){6-14}
& & Epi & Str & Bld & Epi & Con & Leu & Lym & Pls & Mye & Eos & Neu & Endo \\
\midrule
\multirow{7}{*}{\rotatebox[origin=c]{90}{PanopTILs}} 
& \textbf{PAGET-S} & \underline{0.868} &  \underline{0.694} & 0.443 & \underline{0.657}& 0.378& \underline{0.562} & \underline{0.406} & \textbf{0.228} & n/a & n/a & n/a & n/a \\
& \textbf{PAGET-H} & \textbf{0.868} & \textbf{0.696} & \underline{0.450} & \textbf{0.663} & \textbf{0.415} & \textbf{0.579} & \textbf{0.424} & \underline{0.226} & n/a & n/a & n/a & n/a \\
& Teacher\textsubscript{full}            & 0.797& -- & \textbf{0.465} & 0.435 & 0.295 & 0.345 & 0.209 & 0.176 & n/a & n/a & n/a & n/a \\
& HD-YOLO (Breast)   & -- & -- & -- & 0.512& \underline{0.389}& 0.449 & 0.356 & -- & n/a & n/a & n/a & n/a \\
& HoverNet (PanNuke) & -- & -- & -- & 0.461 & 0.316 & 0.400 & -- & -- & n/a & n/a & n/a & n/a \\
& HoverNet (MoNuSAC) & -- & -- & -- & 0.438 & -- & 0.424 & 0.368 & -- & n/a & n/a & n/a & n/a \\
& Cerberus           & 0.800& -- & -- & 0.437 & 0.210 & 0.367 & 0.311 & 0.121 & n/a & n/a & n/a & n/a \\
\midrule
\multirow{7}{*}{\rotatebox[origin=c]{90}{DigestPath}}
& \textbf{PAGET-S}         & n/a & n/a & n/a & \textbf{0.784} & \underline{0.402} & 0.481 & 0.279 & \textbf{0.313} & n/a & \underline{0.242}& \textbf{0.123} & n/a \\
& \textbf{PAGET-H}         & n/a & n/a & n/a & \underline{0.777}   & \textbf{0.423}   & \textbf{0.550} & \underline{0.320}& \underline{0.302} & n/a & \textbf{0.255}& \underline{0.099}& n/a \\
& Teacher\textsubscript{full}                & n/a & n/a & n/a & 0.539 & 0.376 & 0.480 & 0.272 & 0.187 & n/a & 0 & 0.016 & n/a \\
& HoverNet (PanNuke)       & n/a & n/a & n/a & 0.476            & 0.402& \underline{0.482} & -- & -- & n/a & -- & -- & n/a \\
& HoverNet (MoNuSAC)       & n/a & n/a & n/a & 0.236            & --               & 0.446            & \textbf{0.449}& -- & n/a & -- & 0.048& n/a \\
& HD-YOLO (Lung)           & n/a & n/a & n/a & 0.295            & 0.098            & 0.190            & 0.184            & -- & n/a & -- & -- & n/a \\
& HD-YOLO (Breast)         & n/a & n/a & n/a & 0.424            & 0.281            & 0.354            & 0.290            & -- & n/a & -- & -- & n/a \\
\midrule
\multirow{7}{*}{\rotatebox[origin=c]{90}{GlaS}}
& \textbf{PAGET-S}         & n/a & n/a & n/a & \underline{0.767} & 0.454& 0.541 & 0.371 & \textbf{0.206}& n/a & \underline{0.158}& 0.009 & n/a \\
& \textbf{PAGET-H}         & n/a & n/a & n/a & \textbf{0.779}   & \textbf{0.556}   & \underline{0.601}& 0.426& \underline{0.192}& n/a & \textbf{0.168} & 0.020& n/a \\
& Teacher\textsubscript{full}                  & n/a & n/a & n/a & 0.719 & 0.498 & 0.407 & 0.234 & 0.033 & n/a & 0 & \underline{0.045}& n/a \\
& HoverNet (PanNuke)       & n/a & n/a & n/a & 0.687            & \underline{0.505} & 0.597& -- & -- & n/a & -- & -- & n/a \\
& HoverNet (MoNuSAC)       & n/a & n/a & n/a & 0.525            & --               & 0.543 & \underline{0.513}& -- & n/a & -- & \textbf{0.084}& n/a \\
& HD-YOLO (Lung)           & n/a & n/a & n/a & 0.372            & 0.143            & 0.208 & 0.204 & -- & n/a & -- & -- & n/a \\
& HD-YOLO (Breast)         & n/a & n/a & n/a & 0.638            & 0.409            & \textbf{0.613} & \textbf{0.526}& -- & n/a & -- & -- & n/a \\
\midrule
\multirow{7}{*}{\rotatebox[origin=c]{90}{CoNSeP}}
& \textbf{PAGET-S}         & n/a & n/a & n/a & \underline{0.894} & 0.609& 0.734 & \underline{0.662}& \underline{0.424}& n/a & \textbf{0.470}& \underline{0.378} & n/a \\
& \textbf{PAGET-H}         & n/a & n/a & n/a & \textbf{0.904}   & 0.630& 0.746& 0.657 & \textbf{0.429}& n/a & \underline{0.450}&  0.367 & n/a \\
& Teacher\textsubscript{full}                  & n/a & n/a & n/a & 0.710 & \underline{0.677}& 0.621 & 0.513 & 0.000 & n/a & 0.000 & 0.249 & n/a \\
& HoverNet (PanNuke)       & n/a & n/a & n/a & 0.860            & \textbf{0.736}   & \textbf{0.831} & -- & -- & n/a & -- & -- & n/a \\
& HoverNet (MoNuSAC)       & n/a & n/a & n/a & 0.714            & --               & \underline{0.771}& \textbf{0.674}& -- & n/a & -- & \textbf{0.386} & n/a \\
& HD-YOLO (Lung)           & n/a & n/a & n/a & 0.129            & 0.157            & 0.081 & 0.094 & -- & n/a & -- & -- & n/a \\
& HD-YOLO (Breast)         & n/a & n/a & n/a & 0.619            & 0.382            & 0.688 & 0.589 & -- & n/a & -- & -- & n/a \\
\midrule
\multirow{7}{*}{\rotatebox[origin=c]{90}{CRAG}}
& \textbf{PAGET-S}         & n/a & n/a & n/a & \textbf{0.877}   & 0.611& 0.712 & 0.518 & \textbf{0.399}& n/a & \textbf{0.374}& \textbf{0.343} & n/a \\
& \textbf{PAGET-H}         & n/a & n/a & n/a & \underline{0.864} & \textbf{0.695}   & \underline{0.737} & \underline{0.546} & \underline{0.391}& n/a & \underline{0.366}& \underline{0.340} & n/a \\
& Teacher\textsubscript{full}                  & n/a & n/a & n/a & 0.772 & \underline{0.688}& 0.563 & 0.335 & 0.108 & n/a & 0.000 & 0.150 & n/a \\
& HoverNet (PanNuke)       & n/a & n/a & n/a & 0.794            & 0.663            & \textbf{0.808} & -- & -- & n/a & -- & -- & n/a \\
& HoverNet (MoNuSAC)       & n/a & n/a & n/a & 0.637            & --               & 0.728 & \textbf{0.619} & -- & n/a & -- & 0.065 & n/a \\
& HD-YOLO (Lung)           & n/a & n/a & n/a & 0.121            & 0.025            & 0.095 & 0.115 & -- & n/a & -- & -- & n/a \\
& HD-YOLO (Breast)         & n/a & n/a & n/a & 0.375            & 0.257            & 0.383 & 0.315 & -- & n/a & -- & -- & n/a \\
\midrule
\multirow{8}{*}{\rotatebox[origin=c]{90}{KCCRC}}
& \textbf{PAGET-S}         & n/a & n/a & n/a & n/a & n/a & \textbf{0.507} & \underline{0.539}& \textbf{0.498} & \textbf{0.343} & 0.331 & \underline{0.385} & \textbf{0.251}\\
& \textbf{PAGET-H}         & n/a & n/a & n/a & n/a & n/a & \underline{0.497} & 0.533& \underline{0.484} & \underline{0.320} & \underline{0.331}& \textbf{0.388} & 0.226 \\
& Teacher\textsubscript{full}                  & n/a & n/a & n/a & n/a & n/a & 0.487 & \textbf{0.600}& 0.477 & 0.314 & 0.267 & 0.296 & \underline{0.237}\\
& HoverNet (PanNuke)       & n/a & n/a & n/a & n/a & n/a & 0.377 & -- & -- & -- & -- & -- & -- \\
& HoverNet (MoNuSAC)       & n/a & n/a & n/a & n/a & n/a & 0.404 & 0.307 & -- & -- & -- & 0.357 & -- \\
& HD-YOLO (Lung)           & n/a & n/a & n/a & n/a & n/a & 0.378 & 0.364 & -- & -- & -- & -- & -- \\
& HD-YOLO (Breast)         & n/a & n/a & n/a & n/a & n/a & 0.463 & 0.291 & -- & -- & -- & -- & -- \\
& Cerberus                 & n/a & n/a & n/a & n/a & n/a & 0.453 & 0.261 & 0.387 & -- & \textbf{0.343}& -- & -- \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\raggedright
\textit{Abbreviations:} Epi=Epithelium/Epithelial, Str=Stroma, Bld=Blood, Con=Connective tissue, Leu=Leukocyte, Lym=Lymphocyte, Pls=Plasma cell, Mye=Myeloid cell, Eos=Eosinophil, Neu=Neutrophil, Endo=Endothelial cell, Fib=Fibroblast, Mit=Mitotic cell
\end{sidewaystable}

\begin{figure}[htbp]
    \centering
    \includegraphics[width=\textwidth]{ACC.png}
    \caption{Zero-shot generalization to an unseen cancer type. Representative regions from an adenoid cystic carcinoma case, a cancer type not included in the training data, showing H\&E-stained images (top) and corresponding PAGET-S segmentation results (bottom). A board-certified pathologist reviewed the segmentation outputs and annotated errors: red circles indicate obvious misclassifications, blue circles indicate obvious missed detections, and yellow circles indicate missed cells where the cell type was ambiguous even upon expert review.}
    \label{fig:unseen_cancer}
\end{figure}

\section{Conflict Cases}
\label{app:conf}
Figure~\ref{fig:conflict_example} shows representative examples of prediction conflicts between teacher models.


\begin{figure}[htbp]
    \centering
    \includegraphics[width=.90\linewidth]{conflict_lym_pls.png}
    \includegraphics[width=.90\linewidth]{conflict_neu_eos.png}
    \caption{Representative examples of prediction conflicts between teachers. Top: lymphocyte--plasma cell conflicts. Bottom: neutrophil--eosinophil conflicts. Each panel shows the H\&E image, probability maps for each class, pixels with over 90\% prediction probability for each class, and conflict regions where both classes exceed 90\% probability (red).}
    \label{fig:conflict_example}
\end{figure}

\section{Benchmark Configuration and Processing Time Breakdown}
\label{app:time}
All timing measurements were performed on a single NVIDIA Tesla V100-SXM2-32GB GPU. Each measurement was averaged over 3 runs after 1 warmup iteration using 1,000 patches to minimize model loading overhead. PAGET-S and teacher models were evaluated on equivalent tissue areas (384$\times$384 at 20$\times$ for PAGET-S, 768$\times$768 at 40$\times$ for teacher models/HoverNet).

Table~\ref{tab:timing} shows processing time per tile for each pipeline component.

\begin{table}[htbp]
\centering
\caption{Processing time per tile (ms) on Tesla V100-SXM2-32GB.}
\label{tab:timing}
\begin{tabular}{lccc}
\toprule
\textbf{Component} & \textbf{Input Size} & \textbf{Time (ms)} & \textbf{Speedup} \\
\midrule
\multicolumn{4}{l}{\textit{Individual components}} \\
PAGET-S (SegFormer) & 384$\times$384 @20$\times$ & 4.3 $\pm$ 0.1 & -- \\
HoverNet & 768$\times$768 @40$\times$ & 403 $\pm$ 1 & -- \\
Teacher  & 768$\times$768 @40$\times$ & 890 $\pm$ 23 & -- \\
\midrule
\multicolumn{4}{l}{\textit{Combined pipelines}} \\
\textbf{PAGET-S} (semantic only) & -- & \textbf{4.3} & 301$\times$ vs Teacher\textsubscript{full} \\
\textbf{PAGET-H} (PAGET-S + HoverNet) & -- & \textbf{407} & 3.2$\times$ vs Teacher\textsubscript{full} \\
Teacher & -- & 890 & -- \\
Teacher\textsubscript{full} (Teacher + HoverNet) & -- & 1,293 & (reference) \\
\bottomrule
\end{tabular}
\end{table}
\end{document}
