\documentclass{midl}

\usepackage{svg,multicol}
\usepackage{graphicx}
\usepackage[english]{babel}
\usepackage{caption}
\usepackage{multirow}

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 255}
\editors{Accepted for publication at MIDL 2025}

\title[]{Foundation Model Ensemble for Out-of-Distribution Generalization: Predicting Lymph Node Metastasis in Early Gastric Cancer Using Whole-Slide Imaging}

\begin{document}

\midlauthor{\Name{Woojin Chung\nametag{$^{1}$}} \Email{goglxych97@hufs.ac.kr} \AND
\addr $^{1}$ Department of Biomedical Engineering, Hankuk University of Foreign Studies, Yongin-si 17035, Gyeonggi-do, Korea. \AND
\Name{Yujun Park\nametag{$^{2}$}} \Email{isutar\_star@naver.com} \\
\addr $^{2}$ Department of Pathology, CHA Bundang Medical Center, CHA University, Seongnam-si 13496, Gyeonggi-do, Korea. \AND
\Name{Yoonho Nam\nametag{$^{1}$}} \Email{yoonhonam@hufs.ac.kr}\\
}

\maketitle

\begin{abstract}
Recent advances in deep learning have improved the practicality of automated analysis for whole-slide imaging. However, challenges remain in image analysis due to variations in imaging equipment, tissue preparation, staining protocols, and other variables. These variations hinder the generalizability of trained models to external datasets. Recently, foundation models trained on large-scale pathology datasets have been introduced by various research groups, demonstrating the potential to address this issue. Since each foundation model was trained on datasets collected from different sources under varying settings, the learned representations reflect different characteristics to some extent. These differences suggest that leveraging the information of multiple models could improve generalization and robustness compared to using a single model. In this study, we investigate foundation model ensembles for predicting lymph node metastasis in early gastric cancer across three different datasets. By comparing ensemble models with individual ones, we demonstrate that ensembling multiple foundation models improves performance in whole-slide imaging for both in-distribution and out-of-distribution data.

\end{abstract}

\begin{keywords}
Whole-Slide Imaging (WSI), Foundation Model, Foundation Model Ensemble, Lymph Node Metastasis Prediction, Early Gastric Cancer
\end{keywords}

\section{Introduction}
The development of computational pathology has made whole-slide imaging (WSI) an essential tool in pathology diagnosis and research \cite{Aeffner2019, Kumar2020}. WSI enables high-resolution scanning of pathology slides, converting them into digital images for efficient computational processing and analysis. Although these advancements have greatly enhanced computational pathology, variability in pathological images—arising from differences in imaging equipment, tissue processing, staining protocols, and other variables—limits model generalizability across datasets \cite{Aeffner2019}.

Foundation models, trained on diverse datasets through self-supervised learning, have the potential to address such limitations by learning generalized representations. These representations capture complex patterns in training data, enabling robust performance on external datasets. Recently, foundation models trained on large-scale pathology datasets have been introduced by various research groups, demonstrating their adaptability to downstream tasks \cite{Ciga2022, Wang2022, Chen2022, Filiot2023, Vorontsov2023, Hua2023, Alfasly2024, Chen2024, Lu2024, Nechaev2024, Xu2024, Yang2024}. Each foundation model, trained on datasets collected from different sources and under varying settings, develops its own representations that reflect these differences, leading to varied benchmark performances \cite{Wolflein2023}. This diversity suggests that each foundation model may contribute complementary information to downstream tasks.

\begin{figure}
\centerline{\includegraphics[width=0.9\textwidth]{figures/figure1.png}}
\caption{Although all these images are WSIs from EGC patients, they show differences in imaging characteristics, including color tone and intensity, depending on the acquisition methods or resection methods.}\label{fig1}
\end{figure}

The complementary information captured by different foundation models could enhance generalization and robustness when combined, particularly on external datasets. Previous studies showed that transforming the features of a foundation model using information from another produced more generalized results compared to relying on a single model \cite{Chung2024}. This insight highlights the potential of foundation model ensembles for overcoming limitations in model generalization due to dataset variations.

Ensemble methods, which combine models to improve overall performance, have been widely utilized in machine learning to address the limitations of single models by reducing variance \cite{Mohammed2023}. Beyond variance reduction, ensembling foundation models can integrate their unique information derived from pre-trained data, further enhancing generalization. However, the efficacy of foundation model ensembles in the context of WSI remains underexplored, with limited research conducted in this area.

In this study, we investigate the effectiveness of foundation model ensembles for predicting lymph node metastasis (LNM) in early gastric cancer (EGC) WSIs, including performance on out-of-distribution (OOD) datasets. We evaluated model generalizability using two test datasets with different distributions: one varying in data acquisition and another in resection methods. A detailed description is provided in Section~\ref{sec:datasets}. In our case, these OOD datasets are contextually relevant but exhibit distinct characteristics in color tone and intensity (Figure~\ref{fig1}). Following the definition by \citet{Farquhar2022}, they can be categorized as related distributions, a subset of OOD. By ensembling multiple foundation models, we aim to explore their contribution to model performance and generalization in WSI analysis.

% In this study, we investigate the effectiveness of foundation model ensembles for predicting lymph node metastasis (LNM) in early gastric cancer (EGC) WSIs including out-of-distribution (OOD) datasets. We evaluated the model generalizability on two test datasets with different distributions: one varying in data acquisition and another in resection methods. A detailed description is provided in Section~\ref{sec:datasets}. In our case, these OOD datasets are contextually relevant but exhibit distinct characteristics in color tone and intensity (Figure~\ref{fig1}), and can be categorized as a related distribution—a subset of OOD, following the definition by \citeauthor{Farquhar2022}. By ensembling multiple foundation models, we aim to explore their contribution to model performance and generalization in WSI analysis.

\section{Method}
LNM in gastric cancer is closely associated with characteristics of tumor areas \cite{Maruyama1989}. Therefore, to predict LNM in EGC, we first trained patch-level classification networks to extract tumor regions from WSIs. Within these regions, we trained patch-level classification networks to predict the LNM using slide-level labels. These predictions were then aggregated into a slide-level representation, and ensemble methods were applied using the top-performing models on the validation dataset and were compared across datasets.

\subsection{Datasets} \label{sec:datasets}
This study used three datasets—internal, external, and endoscopic submucosal dissection (ESD)—categorized based on the acquiring institution and treatment type. The internal dataset was selected from patients at our institution who underwent curative surgical resection with lymph node dissection for EGC. The external dataset comprised surgical cases for EGC collected from different institutions and scanned with a different scanner. The ESD dataset included endoscopic resection cases with subsequent lymph node dissection from multiple institutions. We split the internal dataset into training and validation sets, using the training set for model training and the validation set for ensemble validation. The external and ESD datasets were used as test sets. Table~\ref{tab1} provides a summary of the datasets. (Note: LN+ and LN- indicate counts of WSIs with and without LNM, respectively.)
\begin{table}[h]
\caption{Summary of the datasets.}
\centering
\label{tab1}
\resizebox{\textwidth}{!}{%
\renewcommand{\arraystretch}{0.9} 
\begin{tabular}{|l|l|ll|l|l|}
\hline
\textbf{\footnotesize Dataset}
& \textbf{\footnotesize Split}
& \textbf{\footnotesize LN+}
& \textbf{\footnotesize LN-}
& \textbf{\footnotesize Institution}
& \textbf{\footnotesize Treatment} \\
\hline
\multirow{2}{*}{\footnotesize Internal Dataset}
& \footnotesize Train
& \footnotesize 100
& \footnotesize 100
& \multirow{2}{*}{\footnotesize Internal Institution}
& \multirow{3}{*}{\begin{tabular}[c]{@{}l@{}}
\footnotesize Curative surgical resection with \\ \footnotesize lymph node dissection
\end{tabular}} \\
\cline{2-4}
& \footnotesize Valid
& \footnotesize 30
& \footnotesize 73
&
& \\
\cline{1-5}
\footnotesize External Dataset
& \footnotesize Test
& \footnotesize 30
& \footnotesize 71
& \footnotesize External Institution
& \\
\hline
\footnotesize ESD Dataset
& \footnotesize Test
& \footnotesize 23
& \footnotesize 96
& \footnotesize Internal + External
& \begin{tabular}[c]{@{}l@{}}
\footnotesize Endoscopic resection cases with \\ \footnotesize subsequent lymph node dissection
\end{tabular} \\
\hline
\end{tabular}}
\end{table}

\subsection{Pre-trained Models} \label{sec:models}
In this study, we selected 13 pre-trained models for ensemble learning—one ImageNet pre-trained model and 12 foundation models in computational pathology. The computational pathology foundation models included Ciga et al. \cite{Ciga2022}, CTransPath \cite{Wang2022}, HIPT \cite{Chen2022}, Phikon \cite{Filiot2023}, Virchow \cite{Vorontsov2023}, PathoDuet \cite{Hua2023}, PathDINO \cite{Alfasly2024}, UNI \cite{Chen2024}, CONCH \cite{Lu2024}, Hibou \cite{Nechaev2024}, Prov-GigaPath \cite{Xu2024}, and BEPH \cite{Yang2024}, with detailed descriptions provided in Table~\ref{tab2}.

\begin{table}[ht]
\centering
\caption{Details of the Pre-trained Models Used}
\label{tab2}
\resizebox{\textwidth}{!}{%
\renewcommand{\arraystretch}{0.9} % adjust row spacing
\begin{tabular}{|p{2.2cm}|p{3.4cm}|p{3cm}|p{5.3cm}|p{1.6cm}|}
\hline
\textbf{\footnotesize Name}
& \textbf{\footnotesize Model Architecture}
& \textbf{\footnotesize Training Method}
& \textbf{\footnotesize Training Dataset}
& \textbf{\footnotesize Feature Dim} \\
\hline
\footnotesize ImageNet
& \footnotesize ResNet34
& \footnotesize Supervised Learning
& \footnotesize 1.2M natural images                & \footnotesize 768 \\
\footnotesize Ciga et al.
& \footnotesize ResNet18               
& \footnotesize SimCLR                 
& \footnotesize 206K patches + 25K WSIs from multiple sources
& \footnotesize 512 \\
\footnotesize CTransPath 
& \footnotesize CNN + Swin ViT   
& \footnotesize SRCL  
& \footnotesize 15M patches from 30K WSIs (TCGA and PAIP)                    
& \footnotesize 768 \\
\footnotesize HIPT  
& \footnotesize Three hierarchical ViT
& \footnotesize DINO     
& \footnotesize 10,678 WSIs, 104M 256×256 images, 408K 4096×4096 images
& \footnotesize 384  \\
\footnotesize Phikon    
& \footnotesize ViT-B 
& \footnotesize iBOT                
& \footnotesize 43M patches and 6K WSIs from PanCancer40M (including TCGA-COAD, PanCancer4M) 
& \footnotesize 768  \\
\footnotesize Virchow   
& \footnotesize ViT-H  
& \footnotesize DINOv2         
& \footnotesize 1.5M WSIs (MSKCC)        
& \footnotesize 1280 \\
\footnotesize PathoDuet  
& \footnotesize ViT-B      
& \footnotesize MoCov3 + SimSiam + InfoNCE Loss
& \footnotesize 11K WSIs (TCGA), 2771 paired WSIs (HyReCo) and 3896 paired WSIs (BCI)
& \footnotesize 768  \\
\footnotesize PathDINO   
& \footnotesize Lightweight ViT  
& \footnotesize DINO + HistoRotate 
& \footnotesize 6M patches from 11K WSIs (TCGA) 
& \footnotesize 384 \\
\footnotesize UNI      
& \footnotesize ViT-L 
& \footnotesize DINOv2            
& \footnotesize 100M patches from 100K WSIs (Mass-100K) 
& \footnotesize 1024 \\
\footnotesize CONCH   
& \footnotesize Transformer-based  
& \footnotesize CoCa      
& \footnotesize 1.17M image-caption pairs       
& \footnotesize 512  \\
\footnotesize Hibou     
& \footnotesize ViT-B and ViT-L
& \footnotesize DINOv2     
& \footnotesize 1.2B patches (L), 512M patches (B)
& \footnotesize 768  \\
\footnotesize Prov-GigaPath  
& \footnotesize LongNet 
& \footnotesize DINOv2 + MAE           
& \footnotesize 1.38B tiles from 171K WSIs (Prov-Path)                    
& \footnotesize 1536 \\
\footnotesize BEPH     
& \footnotesize ViT-B                  
& \footnotesize BEiT               
& \footnotesize ImageNet-1k and 11M patches from 11,760 pathology images (TCGA)                      
& \footnotesize 768  \\
\hline
\end{tabular}%
}
\end{table}

\subsection{Tumor Region Extraction} \label{sec:cancerseg}
A pathologist annotated the tumor areas in 80 WSIs, 40 cases with and 40 cases without LNM. Using these annotations, we fine-tuned three separate foundation models by Ciga et al. for patch-level classification of cancer regions at three magnifications (20×, 10×, and 5×). We averaged the probability maps generated by the three models and applied a threshold ($>$ 0.5) to extract the tumor regions from the WSIs. A pathologist reviewed the extracted regions across the entire dataset and confirmed their appropriateness. These validated regions were then used for LNM prediction.

\subsection{Data Processing for LNM Prediction}
Since each foundation model requires a different input image size based on its trained settings, we first tiled the extracted tumor regions into 512×512 pixels with an overlap ratio of 0.5 between adjacent tiles. Then, the tiles were randomly cropped into patches matching the input size required by each foundation model  (224×224 pixels for scratch-trained networks).  Each foundation model also required a distinct normalization method; we therefore applied the corresponding normalization parameters. Aside from these differences, the same data augmentation strategies were consistently applied across all training processes.

\subsection{Training Single Models for LNM Prediction} \label{sec:prediction}
For each model described in Section~\ref{sec:models}, we trained a patch-level classifier using the tumor regions extracted in Section~\ref{sec:cancerseg}. Each classifier consisted of three non-linear layers followed by a sigmoid activation and was trained under identical settings. For baseline comparisons, ResNet34 \cite{He2015D} and ViT-Base \cite{Dosovitskiy2020} were trained from scratch. All LNM prediction networks were trained and evaluated at 10× magnification. Each trained model generated an LNM risk probability map for each WSI. Slide-level LNM predictions were obtained by averaging the risk scores of the 100 highest-scoring patches in the probability map; the number of patches (100) was selected experimentally.

\subsection{Foundation Model Ensembles for LNM Prediction}
Based on the single models trained in Section~\ref{sec:prediction}, we selected the top-performing models using their performance on the internal validation set. Specifically, we constructed ensembles using the top 3 and top 5 models, respectively. For each of these subsets, we applied three different ensemble strategies for LNM prediction: (1) Soft voting for slide-level classification, (2) Averaging probability maps to aggregate patch-level predictions, and (3) Feature concatenation, where extracted features from multiple models were combined and fed into a classification network.

In addition, we compared single-model ensembles with multi-model ensembles. For the highest-performing foundation model based on internal validation performance, we implemented soft voting and averaging probability maps.

\subsection{Evaluating Model Calibration, Uncertainty and Consistency}
To further understand the advantages of foundation model ensembles, we quantified model calibration, uncertainty and consistency. Calibration was measured by the Brier score (BS) \cite{Brier1950}, and uncertainty was measured using the widely adopted negative log-likelihood (NLL) \cite{Lakshminarayanan2017}, defined as:
\[\text{BS} = \frac{1}{N}\sum_{i=1}^{N} (\hat{y}_i - y_i)^2, \quad
\text{NLL} = -\frac{1}{N}\sum_{i=1}^{N} \left[ y_i \log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i) \right]. \]
BS and NLL values were compared using predictive scores at both the patch-level within cancer regions and the slide-level of WSIs.

Model consistency was evaluated by analyzing the stability of predictions across patches with similar histopathological features in 28 true positive LNM cases from the internal dataset. For these cases, a pathologist carefully annotated the top 20 patches per case based on three primary criteria: (1) Tumor Differentiation and Main Types, (2) Inflammatory Response, and (3) Stromal and Tissue Features, resulting in a total of 560 annotated patches. To evaluate the consistency of predictions for each patch category, we calculated the standard deviation of the prediction scores.

\section{Results}
\subsection{Evaluation of Single Models for LNM Prediction}
The area under the curve (AUC) was used as the evaluation metric. Table~\ref{tab3} presents the AUC scores for LNM prediction for individual models on the Internal, External, and ESD datasets. Even when trained on the same data under the same conditions (e.g., learning parameters, loss functions, and data augmentations), the results varied across the models. Figure~\ref{fig2} shows representative examples of LNM predictions for different models.

\begin{table}[h]
\centering
\caption{AUC Scores for Different Models Across Datasets}
\label{tab3}
\resizebox{0.85\textwidth}{!}{%
\renewcommand{\arraystretch}{0.7} 
\begin{tabular}{|l|l|l|l|}
\hline
\scriptsize \textbf{Model Name}
& \scriptsize \textbf{Internal Dataset}
& \scriptsize \textbf{External Dataset}
& \scriptsize \textbf{ESD Dataset} \\
\hline
\scriptsize Scratch ResNet34
& \scriptsize 0.839
& \scriptsize 0.748
& \scriptsize 0.538 \\
\scriptsize Scratch ViT-base
& \scriptsize 0.827
& \scriptsize 0.786
& \scriptsize 0.471 \\ 
\hline
\scriptsize ImageNet (ResNet34)
& \scriptsize 0.794
& \scriptsize 0.760
& \scriptsize 0.641 \\
\scriptsize Ciga et al.
& \scriptsize \textbf{0.867}
& \scriptsize 0.742
& \scriptsize 0.663 \\
\scriptsize CTransPath
& \scriptsize 0.859
& \scriptsize 0.778
& \scriptsize \textbf{0.702} \\
\scriptsize HIPT
& \scriptsize 0.802
& \scriptsize 0.701
& \scriptsize 0.600 \\
\scriptsize Phikon
& \scriptsize 0.841
& \scriptsize 0.763
& \scriptsize 0.628 \\
\scriptsize Virchow
& \scriptsize 0.858
& \scriptsize 0.754
& \scriptsize 0.655 \\
\scriptsize PathoDuet
& \scriptsize 0.740
& \scriptsize 0.734
& \scriptsize 0.510 \\
\scriptsize PathDINO
& \scriptsize 0.847
& \scriptsize 0.700
& \scriptsize 0.534 \\
\scriptsize UNI
& \scriptsize 0.853
& \scriptsize 0.718
& \scriptsize 0.631 \\
\scriptsize CONCH
& \scriptsize 0.816
& \scriptsize 0.700
& \scriptsize 0.688 \\
\scriptsize Hibou
& \scriptsize 0.843
& \scriptsize 0.779
& \scriptsize 0.597 \\
\scriptsize Prov-GigaPath
& \scriptsize 0.835
& \scriptsize 0.646
& \scriptsize 0.621 \\
\scriptsize BEPH
& \scriptsize 0.827
& \scriptsize \textbf{0.796}
& \scriptsize 0.604 \\
\hline
\end{tabular}
}
\caption*{\hspace{13mm} \raggedright
\scriptsize * The best model in each dataset is highlighted in bold.}
\end{table}

On the Internal Dataset, most foundation models demonstrated comparable or superior performance to the scratch-trained models. The highest-performing model in our downstream task was Ciga et al., achieving an AUC score of 0.867.

On the external dataset, however, only BEPH (AUC 0.796), used as a feature extractor, outperformed the scratch-trained ViT-base (AUC 0.786). This result may suggest that freezing the parameters of foundation models as feature extractors limits their ability to adapt their internal representations.

On the ESD dataset, scratch-trained models performed poorly (AUC 0.538 and 0.471), while the foundation models exhibited a range of AUC scores (0.510–0.702). The highest-performing models were CTransPath (0.702), CONCH (0.688), and Ciga et al. (0.663).

\subsection{Evaluation of Foundation Model Ensembles for LNM Prediction}
We selected the top-performing individual foundation models on the internal validation dataset for ensembling. Specifically, the top-3 models were Ciga et al., CTransPath, and Virchow, while the top-5 included UNI and PathDINO in addition. For each group, ensemble methods, including soft voting, averaging probability maps, and feature concatenation, were used, and the AUC results for each method are presented in Table~\ref{tab4}.

\begin{table}[ht]
\renewcommand{\arraystretch}{1}
\centering
\caption{AUC Results of Foundation Model Ensembles}
\label{tab4}
\small
\renewcommand{\arraystretch}{0.8} 
\begin{tabular}{|p{2.6cm}|p{7.2cm}|p{1.2cm}p{1.2cm}p{1.1cm}|}
\hline
& \footnotesize Method
& \footnotesize Internal
& \footnotesize External
& \footnotesize ESD \\
\hline
\multirow{8}{*}{\footnotesize Single Model}
& \footnotesize Scratch ResNet34
& \footnotesize 0.839
& \footnotesize 0.748
& \footnotesize 0.538 \\
& \footnotesize Scratch ResNet34 (Soft Voting)
& \footnotesize 0.809 
& \footnotesize 0.747
& \footnotesize 0.559 \\
& \footnotesize Scratch ResNet34 (Averaging Probability Maps)
& \footnotesize 0.811
& \footnotesize 0.747
& \footnotesize 0.556 \\
& \footnotesize Ciga et al.
& \footnotesize 0.867
& \footnotesize 0.742
& \footnotesize 0.663 \\
& \footnotesize Ciga et al. (Soft Voting)
& \footnotesize 0.863
& \footnotesize 0.741
& \footnotesize 0.639 \\
& \footnotesize Ciga et al. (Averaging Probability Maps)
& \footnotesize 0.864
& \footnotesize 0.743
& \footnotesize 0.638 \\
& \footnotesize BEPH
& \footnotesize 0.827
& \footnotesize 0.796
& \footnotesize 0.604 \\
& \footnotesize CTransPath
& \footnotesize 0.859
& \footnotesize 0.778
& \footnotesize 0.702 \\
\hline
\multirow{3}{*}{\footnotesize Top-3 Ensemble}
& \footnotesize Soft Voting
& \footnotesize \textbf{0.886}
& \footnotesize 0.761
& \footnotesize 0.702 \\
& \footnotesize Averaging Probability Maps
& \footnotesize 0.879
& \textbf{\footnotesize 0.806}
& \textbf{\footnotesize 0.714} \\
& \footnotesize Feature Concatenation
& \footnotesize 0.848
& \footnotesize 0.718
& \footnotesize 0.678 \\
\hline
\multirow{3}{*}{\footnotesize Top-5 Ensemble}
& \footnotesize Soft Voting
& \footnotesize 0.883
& \footnotesize 0.746
& \footnotesize 0.682 \\
& \footnotesize Averaging Probability Maps
& \footnotesize 0.876
& \footnotesize 0.763
& \footnotesize 0.680 \\
& \footnotesize Feature Concatenation
& \footnotesize 0.827
& \footnotesize 0.673
& \footnotesize 0.628 \\
\hline
\end{tabular}
\caption*{\hspace{0mm} \raggedright
\scriptsize * The best model in each dataset is highlighted in bold.}
\end{table}

\begin{figure}
\centerline{\includegraphics[width=\textwidth]{figures/figure2.png}}
\caption{This figure shows examples of prediction heatmaps generated by individual foundation models and the top-3 ensemble using averaging probability maps.}
\label{fig2}
\end{figure}
As shown in Table~\ref{tab4}, both the top-3 and top-5 ensembles demonstrated improved performance on the internal dataset when soft voting and averaging probability maps were applied. Notably, averaging the probability maps of the top-3 models consistently outperformed both the individual foundation models and the scratch-trained models across all datasets. The top-5 ensemble showed lower performance compared to the top-3 ensemble across datasets in our setting. Moreover, single-model ensembles did not show significant improvements in performance.

\subsection{Top-3 Averaging Probability Maps Model Calibration}
We evaluated the BS at both the patch and slide levels for the top-3 ensemble using averaging probability maps, which demonstrated improved performance on the two OOD datasets. We compared the ensemble model's performance to that of its individual constituent models—Ciga et al., CTransPath, and Virchow. While the individual models exhibited variations in BS across different datasets, the ensemble method overall maintained lower values, indicating well-calibrated results.

At the patch-level, the ensemble achieved a BS of 0.234 in the internal dataset, compared to 0.235, 0.298, and 0.281 from individual models. In the external dataset, it achieved 0.224, while individual models achieved 0.229, 0.374, and 0.244. In the ESD dataset, the ensemble achieved 0.210, compared to 0.244, 0.206, and 0.285.

At the slide-level, the ensemble achieved a BS of 0.244 in the internal dataset, compared to 0.330, 0.208, and 0.456 from individual models. In the external dataset, it achieved 0.184, while individual models recorded 0.219, 0.221, and 0.224. In the ESD dataset, the ensemble achieved 0.448, compared to 0.465, 0.510, and 0.720.

\subsection{Top-3 Averaging Probability Maps Model Uncertainty}
To further evaluate the effectiveness of the foundation model ensemble, we evaluated uncertainty using NLL at both the patch and slide levels. The top-3 ensemble using averaging probability maps consistently exhibited lower uncertainty across all datasets compared to individual models (Ciga et al., CTransPath, and Virchow).

At the patch-level, the ensemble achieved an NLL of 0.692 in the internal dataset, compared to 1.075, 0.692, and 1.003 from individual models. In the external dataset, the ensemble had 0.641, compared to 1.402, 0.657, and 0.686. Similarly, in the ESD dataset, the ensemble had 0.633, while individual models showed 0.727, 0.703, and 1.117.

At the slide-level, the ensemble method continued to demonstrate reduced uncertainty. In the internal dataset, the ensemble achieved 0.680, while individual models recorded 1.537, 0.867, and 0.602. In the external dataset, the ensemble had 0.552, compared to 0.638, 0.630, and 0.692. In the ESD dataset, where overall uncertainty was higher, the ensemble still showed the lowest NLL (1.166) compared to 3.169, 1.191, and 1.369.

\subsection{Top-3 Averaging Probability Maps Model Consistency}
To assess the consistency of the probability map ensemble, we analyzed the standard deviation of prediction scores across commonly observed patch categories. The two most frequent categories were “Moderately differentiated” (19.64\%) and “Poorly differentiated” (11.43\%). Examples of these patch categories can be found in Figure~\ref{fig4}.

For the top-3 foundation models that achieved the best performance on the internal dataset (CTransPath, Virchow, and Ciga et al.), the standard deviation of risk probability scores for the “Moderately differentiated” category was 0.069, 0.045, and 0.082, respectively. When using the ensemble method, which combined the probability maps of these top-3 models, the standard deviation decreased to 0.042, indicating improved consistency.

For the “Poorly differentiated” category, the standard deviation for the top-3 models was 0.066, 0.053, and 0.027, respectively. Using the ensemble method, the standard deviation was 0.033, which is lower than that of two of the three individual models, further supporting the effectiveness of ensembling in stabilizing predictions.

\begin{figure}
\centerline{\includegraphics[width=\textwidth]{figures/figure3.png}}
\caption{This figure shows the two most frequently observed categories among the top patches for LNM prediction: (left) Moderately differentiated with no significant inflammation, and (right) Poorly differentiated with no significant inflammation. The predicted scores from different models (scratch-trained ResNet34, Ciga et al., and averaging probability maps) are displayed for each patch.
} \label{fig4}
\end{figure}

\section{Discussion and Conclusion}
In this paper, we evaluated the performance of individual foundation models and the effectiveness of foundation model ensembles for LNM prediction across three different EGC datasets. Among the ensemble methods, averaging the probability maps of the top-3 high-performing models in our task consistently improved performance across all datasets. This suggests that using ensemble methods to integrate information from multiple models in the context of WSI analysis is beneficial for improving overall performance. However, in averaging probability maps, using five models instead of three resulted in reduced performance, underscoring the importance of selecting foundation models suited to the ensemble approach and the downstream task.

The method of concatenating features and retraining has shown inferior performance compared to averaging probability maps. This may be attributed to the use of a simple three-layer non-linear classifier, which likely struggled to effectively capture relevant information within the increased dimensionality of the feature space. The higher dimensionality from concatenation may have led to noise or redundancy. These findings highlight the need for more efficient methods to integrate features across foundation models.

The effectiveness of the foundation model ensemble was indirectly evaluated through model calibration, uncertainty and consistency. The ensemble method improved calibration, reduced uncertainty, and stabilized predictions within each patch category. These results suggest that combining the information of individual foundation models could enhance generalization and robustness, including performance on OOD datasets.

There are several limitations in our study. First, foundation models were used solely as feature extractors, which may have constrained their ability to generalize, as seen in the test results on the external dataset. Due to computational constraints in our experimental setting, fine-tuning large models in their entirety was challenging. Future studies should explore fine-tuned foundation model ensembles to improve adaptability to out-of-distribution datasets. Additionally, as the task was performed as a simple classification at the patch-level, incorporating advanced methods such as CLAM \cite{Lu2021} or multiple instance learning-based approaches could potentially improve performance. Another limitation lies in the use of simple probability map averaging as the ensemble method, which may not be optimal. For instance, weighting could be applied to the ensemble probability maps, or a Teacher-Student network could be trained to learn from the ensembled results. Moreover, integrating feature selection from foundation models with feature concatenation may enhance representational capacity. Further exploration of alternative ensemble strategies for foundation models is needed to identify more effective approaches.

\newpage
\bibliography{midl25_255}

\newpage
\appendix

\section{Computational Cost Comparison of Various Models} \label{app1}
\begin{table}[ht]
\centering
\resizebox{0.8\columnwidth}{!}{
\renewcommand{\arraystretch}{0.9} 
\begin{tabular}{|c|cccc|}
\hline
& \footnotesize
\textbf{Num params}
& \footnotesize \textbf{FLOPs} 
& \footnotesize \textbf{Size(MB)}
& \footnotesize \textbf{Depth} \\
\hline
\footnotesize ResNet34
& \footnotesize 21.80M
& \footnotesize 3.68B
& \footnotesize 83.15
& \footnotesize 54 \\
\footnotesize ViT-Base
& \footnotesize 86.57M
& \footnotesize 16.87B
& \footnotesize 330.22
& \footnotesize 38 \\
\hline
\footnotesize Ciga et al.
& \footnotesize 11.18M
& \footnotesize 1.83B
& \footnotesize 42.63
& \footnotesize 29 \\
\footnotesize CTransPath
& \footnotesize 27.52M
& \footnotesize 4.51B
& \footnotesize 104.98
& \footnotesize 56 \\
\footnotesize HIPT
& \footnotesize 21.67M    
& \footnotesize 6.15B
& \footnotesize 82.64
& \footnotesize 49 \\
\footnotesize Phikon
& \footnotesize 85.80M
& \footnotesize 17.58B
& \footnotesize 327.29
& \footnotesize 49 \\
\footnotesize Virchow
& \footnotesize 631.23M
& \footnotesize 162.07B
& \footnotesize 2.41K
& \footnotesize 129 \\
\footnotesize PathoDuet
& \footnotesize 85.80M
& \footnotesize 16.95B
& \footnotesize 327.3
& \footnotesize 49 \\
\footnotesize PathDINO
& \footnotesize 9.56M
& \footnotesize 13.43B
& \footnotesize 36.47
& \footnotesize 21 \\
\footnotesize UNI
& \footnotesize 303.35M
& \footnotesize 59.70B
& \footnotesize 1.16K
& \footnotesize 97 \\
\footnotesize CONCH
& \footnotesize 90.39M
& \footnotesize 69.07B
& \footnotesize 344.82
& \footnotesize 51 \\
\footnotesize Hibou
& \footnotesize 85.74M
& \footnotesize 23.56B
& \footnotesize 327.07
& \footnotesize 49 \\
\footnotesize Prov-GigaPath
& \footnotesize 1.13B    
& \footnotesize 223.45B
& \footnotesize 4.33K  
& \footnotesize 161 \\
\footnotesize BEPH     
& \footnotesize 85.76M   
& \footnotesize 17.58B
& \footnotesize 327.15  
& \footnotesize 49 \\
\hline
\end{tabular}%
}
\end{table}

\end{document}