\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{array}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{comment}
\usepackage{booktabs}
\usepackage{xcolor}
\jmlrvolume{-- Under Review}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Under Review for MIDL 2026}

\title[]{Quantifying and Mitigating Hospital Domain Bias in Pathology Foundation Models using Adversarial Feature Disentanglement}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Mengliang Zhang\nametag{$^{1}$}}  \Email{mxz3935@mavs.uta.edu}\\
\addr $^{1}$ CSE Department, The University of Texas at Arlington \\
}


\begin{document}

\maketitle

\begin{abstract}
Pathology foundation models (PFMs) have demonstrated remarkable potential in whole-slide image (WSI) diagnosis. However, pathology images from different hospitals exhibit domain shifts due to variations in scanning hardware and preprocessing. These differences cause PFMs to learn spurious hospital-specific features, severely compromising their robustness and generalizability in clinical settings. We present the first systematic study of this hospital-source domain bias in PFMs. To address the critical trade-off between diagnostic utility and domain predictability, we establish a quantification pipeline and introduce the Robustness Index (RI). Furthermore, we propose a lightweight adversarial framework for feature disentanglement. This framework employs a trainable adapter and a domain classifier connected via a Gradient Reversal Layer (GRL) to remove latent hospital-specific information from frozen PFM representations without modifying the encoder itself. Experiments on multi-center histopathology datasets demonstrate that our approach substantially suppresses domain predictability and achieves significant gains in feature robustness. Crucially, the method maintains or improves disease classification performance, proving its efficacy particularly in out-of-domain scenarios. Our code is provided at: \href{https://github.com/MengRes/pfm_domain_bias}{https://github.com/MengRes/pfm\_domain\_bias}
\end{abstract}

\begin{keywords}
Pathology Image, Domain Bias, Foundation model.
\end{keywords}

\section{Introduction}

Whole-slide imaging (WSI) has become a key tool in digital pathology, enabling scalable analysis of gigapixel histopathology slides. In most downstream applications, WSIs are partitioned into small patches, and deep models—such as ResNet~\cite{heDeepResidualLearning2016}, Vision Transformer (ViT)~\cite{dosovitskiyImageWorth16x162021}, or foundation encoders like UNI~\cite{chenGeneralPurposeFoundationModel2024}, PLIP~\cite{huangVisualLanguageFoundation2023} are employed to extract visual features for tasks including disease classification, tumor grading, and subtype analysis. These patch-level features form the backbone of many computational pathology pipelines.

However, a critical yet often underexplored challenge in this setting is the presence of domain-specific bias in the data. Patches collected from different hospitals or scanners frequently differ in staining protocols, image resolution, scanner artifacts, and tissue preparation. Such variations introduce spurious correlations into the learned representations, causing models to inadvertently rely on hospital-specific cues rather than true disease-related signals, severely compromising their robustness and generalizability in clinical settings. Even state-of-the-art pathology foundation models (PFMs), such as Phikon \citep{filiotScalingSelfSupervisedLearning2023} and UNI \citep{chenGeneralPurposeFoundationModel2024}, exhibit a significant degree of domain bias. Visual confirmation of this phenomenon is provided in Figure \ref{fig:tsne_result}, where t-SNE \citep{vandermaatenVisualizingDataUsing2008} clustering of patch embeddings from the TCGA-BRCA \cite{cancergenomeatlasnetworkComprehensiveMolecularPortraits2012} dataset clearly shows that patches from the same hospital source cluster tightly together. This indicates the learned features inadvertently retain substantial hospital-specific information.

Several prior studies have explored domain bias in PFMs. Vaidya et al. \cite{vaidyaDemographicBiasMisdiagnosis2024} examined performance disparities across patient populations, while Lin et al. investigated inter-hospital variations. de Jong et al. \cite{jongCurrentPathologyFoundation2025} proposed a robustness metric using a KNN-based approach. Other studies attempted to mitigate domain shifts through non-learnable stain normalization techniques, such as Macenko \cite{m.macenkoMethodNormalizingHistology2009} normalization, which only address low-level color differences. Another line of work leverages parameter-efficient fine-tuning methods, such as LoRA (Low-Rank Adaptation) \cite{huLoRALowRankAdaptation2021}, to adapt PFMs to out-of-distribution (OOD) data. Yet, these approaches have notable limitations: the KNN-based metric lacks a comprehensive dataset-level assessment; stain normalization fails to resolve complex, non-stain-related domain shifts; and LoRA fine-tuning approaches do not explicitly ensure the reduction of domain bias while preserving diagnostic performance. Thus, there is a clear need for a systematic, parameter-efficient framework that can rigorously quantify and explicitly mitigate domain bias while guaranteeing the preservation of diagnostic performance.

%Several prior studies have explored domain bias in pathology foundation models (PFMs). Vaidya et al. \cite{vaidyaDemographicBiasMisdiagnosis2024} examined performance disparities of PFMs across different patient populations, while Lin et al. investigated inter-hospital variations and analyzed the limitations of DINO \cite{caronEmergingPropertiesSelfSupervised2021}-based feature learning. Edwin et al. \cite{jongCurrentPathologyFoundation2025} proposed a robustness metric for PFMs on patches from different hospital sources, using a KNN-based approach to measure the number of neighboring patches with the same hospital origin and disease label, thereby computing a robustness index. Other studies attempted to mitigate domain shifts through stain normalization techniques, such as Macenko \cite{m.macenkoMethodNormalizingHistology2009} normalization, which standardizes patch color distributions across hospitals before feature extraction. However, these methods are non-learnable, sensitive to parameter tuning, and only address low-level color differences, leaving other sources of domain variability—such as scanner artifacts or labeling inconsistencies—unresolved.


In this work, a systematic pipeline is established to evaluate PFM domain bias, using separate Multi-Layer Perceptron (MLP) training to quantify severity via hospital classification AUC. We introduce the Robustness Index to quantify the utility-predictability trade-off. To mitigate bias, we propose a lightweight adversarial framework utilizing a trainable projection head, disease classifier, and a domain classifier connected by a Gradient Reversal Layer (GRL). During backpropagation, the GRL reverses the gradient, suppressing hospital-related cues while preserving disease information. Experimental results show the framework effectively suppresses hospital-specific signals, alleviating domain bias, and achieving substantial robustness gains on multi-center datasets

%%%In this work, we first establish a pipeline to evaluate the domain bias of pathology foundation models. We extract features and train separate multi-layer perceptron (MLP) for disease and hospital-source classification. By measuring the hospital classification AUC, we quantify the severity of domain bias. Crucially, we introduce the Robustness Index (RI) to systematically quantify the trade-off between diagnostic utility and domain predictability. We then propose a lightweight adversarial framework to mitigate hospital-specific information in the extracted features. Our method introduces a trainable projection head, a domain classifier, and a disease classifier, where the domain classifier is connected to the projection head via a Gradient Reversal Layer (GRL). During backpropagation, the GRL reverses the gradient from the domain classifier, serving as adversarial feedback to suppress hospital-related cues while preserving disease-discriminative information throughout training. We validate our approach on multiple multi-center histopathology datasets. Experimental results demonstrate that the proposed adversarial training framework effectively suppresses hospital-specific signals and alleviates domain bias, achieving substantial robustness gains.

%Another line of work leverages Canonical Correlation Analysis (CCA)  to map patches from different hospitals into a shared feature space. Yet, these approaches have notable limitations. Edwin’s \cite{jongCurrentPathologyFoundation2025} metric cannot provide a dataset-level assessment of domain bias. CCA-based methods focus on feature alignment but overlook whether the aligned representations retain disease-discriminative power. Vaidya’s \cite{vaidyaDemographicBiasMisdiagnosis2024} approach requires additional LoRA \cite{huLoRALowRankAdaptation2021} fine-tuning of PFMs and does not explicitly ensure the reduction of domain bias while preserving diagnostic performance.

Our main contributions are as follows: 1. We establish a systematic pipeline to identify and quantify hospital-source domain bias in PFM features, introducing the Robustness Index (RI) to assess the net utility of the learned representations. 2. We propose a lightweight adversarial training framework that incorporates a Gradient Reversal Layer (GRL) to suppress hospital-related discriminative cues in image features without modifying the base PFM encoder, ensuring parameter efficiency and core feature preservation. 3. Experiments on multi-center histopathology datasets demonstrate that our method significantly suppresses latent hospital information while maintaining or enhancing disease classification performance, achieving substantial gains in robustness (quantified by $\Delta RI$).


%%%We then propose a lightweight adversarial framework to mitigate hospital-specific information in the extracted features. Our method introduces a trainable projection head, a domain classifier, and a disease classifier, where the domain classifier is connected to the projection head via a Gradient Reversal Layer (GRL). During backpropagation, the GRL reverses the gradient from the domain classifier, serving as adversarial feedback to suppress hospital-related cues while preserving disease-discriminative information throughout training. We validate our approach on multiple histopathology datasets. Experimental results demonstrate that the proposed adversarial training framework effectively suppresses hospital-specific signals and alleviates domain bias, while maintaining or even enhancing disease classification performance. Our main contributions are as follows:



%%The remainder of this paper is organized as follows. Section 2 reviews related work in pathology foundation models and domain bias mitigation. Section 3 details our methodology, including the domain bias quantification pipeline and the proposed adversarial disentanglement framework. Section 4 presents the experimental setup, comprehensive results on TCGA and Camelyon17 datasets, comparison with other mitigation methods, and parameter analysis. Finally, Section 5 concludes the paper and discusses future work.




\section{Related Works}
\label{sec:related_works}

\subsection{Pathology Foundation Models (PFMs)}
\label{sec:related_pfms}

The initial success of general vision models like CLIP \cite{radfordLearningTransferableVisual2021} and DINO \cite{caronEmergingPropertiesSelfSupervised2021} in pathology paved the way for models specifically optimized for the domain. More recently, several dedicated Pathology Foundation Models (PFMs) have been proposed, including UNI \cite{chenGeneralPurposeFoundationModel2024}, CONCH \cite{luVisualLanguageFoundationModel2024}, GIGA\_PATH \cite{xuWholeSlideFoundationModel2024}, and VIRCHOW \cite{vorontsovFoundationModelClinicalGrade2024}. These models, typically pretrained on large-scale natural image corpora or multimodal datasets, have demonstrated impressive zero-shot and few-shot capabilities in various pathology tasks, such as tumor classification, subtyping, and grading. They are commonly employed as frozen encoders to extract highly transferable, high-dimensional features from image patches. Despite their strong semantic capabilities, studies consistently observe that the features extracted by these models still encode significant dataset- or site-specific biases, including scanner artifacts, staining variations, and patient demographics.

\begin{figure*}[htb]
    \centering
    \includegraphics[width=0.8\textwidth]{images/midl2026_pfm_evaluation_pipeline.pdf}
    \caption{Pipeline for evaluating hospital-domain bias in pathology foundation models. Only patches consistent with their WSI labels are used. A simple multi-layer perceptron (MLP) is trained to classify hospital sources, where higher accuracy indicates stronger domain bias.}
    \label{fig:evaluation pipeline}
\end{figure*}

\vspace{-0.5cm}
\subsection{Domain Bias Mitigation and Disentanglement}
\label{sec:related_disentanglement}

%%Prior work has approached hospital-domain bias from various angles \cite{m.macenkoMethodNormalizingHistology2009, a.vahadaneStructurePreservingColorNormalization2016}. Some methods focus on pixel-space preprocessing, such as stain normalization, to alleviate color variations \cite{m.macenkoMethodNormalizingHistology2009}. In the feature space, techniques like Edwin et al.'s \cite{jongCurrentPathologyFoundation2025} proposed KNN-based metric offered a measure of bias correlation, but lacked a systematic mitigation mechanism. Vaidya et al. \cite{vaidyaDemographicBiasMisdiagnosis2024} introduced LoRA-based fine-tuning to adapt Pathology Foundation Models (PFMs) to out-of-distribution (OOD) data. However, approaches like LoRA primarily focus on parameter-efficient adaptation rather than explicit feature disentanglement, while stain normalization fails to address complex, non-stain-related domain shifts. 

%%Our approach leverages adversarial feature learning via the Gradient Reversal Layer (GRL) mechanism, a technique proven effective in general image domain generalization. The GRL was originally introduced by Ganin et al. \cite{ganinDomainAdversarialTrainingNeural2017} in the context of Domain-Adversarial Neural Networks (DANN) for general domain adaptation, and we adapt this proven technique to feature disentanglement in pathology. Unlike prior pathology work that focuses solely on empirical quantification or domain-specific adaptation, our framework provides a unified, lightweight, and PFM-agnostic solution for systematic quantification (RI) and adversarial disentanglement of frozen PFM features.

{Prior work has approached hospital-domain bias from various angles. 
Lin et al. \cite{linUnveilingInstitutionSpecificBias2025} systematically unveiled institution-specific biases in pathology foundation models, analyzing causes ranging from scanner hardware to staining protocols. 
Similarly, Kheiri et al. \cite{kheiriInvestigationPotentialBias2025a} conducted a comprehensive investigation into potential bias factors within histopathology datasets, auditing demographic and site-specific confounders. 
While these works provide crucial diagnostic insights and survey potential solutions, they primarily focus on investigating the existence of bias rather than proposing a specialized algorithmic mitigation pipeline for frozen foundation models.}

To address these shifts, early methods focused on pixel-space preprocessing, such as stain normalization \cite{m.macenkoMethodNormalizingHistology2009, a.vahadaneStructurePreservingColorNormalization2016}, to alleviate color variations. However, these fail to address complex, non-stain-related domain shifts (e.g., scanner artifacts). 
In the feature space, Bidgoli et al. \cite{asilianbidgoliBiasReductionRepresentation2022} proposed a deep feature selection method to discard features highly correlated with bias. However, such subtractive approaches risk losing diagnostic information entangled with domain artifacts.
Other works focus on adaptation: Vaidya et al. \cite{vaidyaDemographicBiasMisdiagnosis2024} introduced LoRA-based fine-tuning to adapt PFMs to out-of-distribution (OOD) data, and Edwin et al. \cite{jongCurrentPathologyFoundation2025} proposed a KNN-based metric to measure bias correlation.

{Distinct from these approaches, our framework leverages adversarial feature learning via the Gradient Reversal Layer (GRL) \cite{ganinDomainAdversarialTrainingNeural2017}. 
Unlike \cite{asilianbidgoliBiasReductionRepresentation2022} which selects features, our method actively disentangles them by learning a non-linear projection that suppresses hospital-specific cues while preserving disease patterns. Furthermore, unlike adaptation methods that require fine-tuning \cite{vaidyaDemographicBiasMisdiagnosis2024}, our approach provides a unified, lightweight, and PFM-agnostic solution specifically designed for the frozen feature space of modern PFMs.}


\section{Methodology}

\subsection{WSI Cohort Construction and Patch Extraction}
\label{sec:patch_extraction}

For a given WSI collection, we first construct a clinical cohort by filtering WSIs based on available demographic information (e.g., sex and age) to minimize the potential confounding effects of demographic factors on WSI disease features. To ensure fair assessment of domain shift, we select a fixed number of WSIs per disease category and hospital source, striving to maintain balanced domain and task distributions.

For each WSI, we follow a standard procedure for feature extraction. We perform tissue segmentation using Otsu thresholding to automatically generate a tissue mask, consistent with the strategy used in CLAM \cite{luDataEfficientWeaklySupervised2021}. Within the identified tissue regions, fixed-size patches are extracted via a sliding window with a predefined grid pattern. Each patch subsequently undergoes quality control, including checks for minimum effective tissue area, ensuring the retained patches possess sufficient diagnostic value. Process details are provided in Appendix \ref{app:datasets}.

\vspace{-0.3cm}
\subsection{Feature Filtering for High-Fidelity Patches}
\label{sec:patch_filtering}

The extracted patch set is further refined to ensure consistency between patch-level content and WSI-level labels. This step is critical because WSI labels are case-level, and randomly sampled patches may not always reflect the WSI-level diagnosis (e.g., containing normal tissue). We leverage the CONCH \cite{luVisualLanguageFoundationModel2024} model, primarily motivated by its strong zero-shot classification capabilities to accurately verify patch content and ensure high-fidelity labeling. We perform zero-shot patch classification using CONCH to obtain the top-1 predicted label and its probability. We retain only those patches where (1) the predicted label probability exceeds a predefined threshold (e.g., 0.8) and (2) the patch-level label matches the WSI-level ground truth. The high confidence threshold is chosen to maximize the reliability of the patch-level content. A subsequent manual inspection of sampled patches is additionally performed to ensure label accuracy and consistency. The process pipeline can be seen in Figure \ref{fig:evaluation pipeline}.


%%%Next, we leverage the CONCH \cite{luVisualLanguageFoundationModel2024} model to further refine the patch set by ensuring consistency between patch-level and WSI-level labels, as shown in Figure \ref{fig:evaluation pipeline}. This choice is motivated by our preliminary observations that CONCH embeddings produce t-SNE distributions with less pronounced hospital clustering for patches of the same disease across different sites. We perform zero-shot patch classification using CONCH, obtain the top-1 predicted label and its probability, and select only those patches where (1) the predicted label probability exceeds a predefined threshold and (2) the label matches the WSI-level ground truth. Manual inspection of sampled patches is additionally performed to ensure label accuracy and consistency.

\vspace{-0.3cm}
\subsection{Domain Bias Quantification Baseline}
\label{sec:bias_quant}

%%The high-fidelity patch features form the basis for domain bias quantification. We measure latent hospital information by training separate Multi-Layer Perceptrons (MLPs) for hospital-source classification ($\mathcal{C}_{d}$) and disease classification ($\mathcal{C}_{y}$) on features $f_{i}=E(x_{i})$ extracted from the frozen PFM encoder $E(\cdot)$. Both MLPs have two hidden layers and ReLU activations, taking the PFM feature dimension $D$ as input. Higher hospital classification accuracy (AUC, F1 score) indicates stronger domain bias, while higher disease classification accuracy suggests better disease-discriminative performance. 


{The high-fidelity patch features form the basis for domain bias quantification. 
We extract features $f_{i}=E(x_{i})$ from the frozen PFM encoder $E(\cdot)$. 
We adhere to this frozen setting to reflect the standard ``off-the-shelf'' usage of PFMs in resource-constrained clinical environments, ensuring our evaluation captures the intrinsic bias of the pre-trained models without the prohibitive cost of full-parameter fine-tuning.}

{To measure latent information, we train separate Multi-Layer Perceptrons (MLPs) for hospital-source classification ($\mathcal{C}_{d}$) and disease classification ($\mathcal{C}_{y}$). 
MLPs are employed rather than simple linear probes to capture complex, non-linear correlations between the embeddings and domain artifacts. 
Both MLPs consist of two hidden layers with ReLU activations, taking the PFM feature dimension $D$ as input. 
Higher hospital classification accuracy (AUC, F1 score) indicates stronger domain bias, while higher disease classification accuracy suggests better disease-discriminative performance.} To systematically quantify the trade-off between bias mitigation and diagnostic utility, we introduce the \textbf{Robustness Index (RI)}. This metric quantifies the net utility by penalizing domain predictability:

\begin{equation}
RI=\mathcal{A}_{Disease}-(\mathcal{A}_{Hosp}-\mathcal{A}_{Hosp,Random})
\label{eq:ri}
\end{equation}


%%The resulting high-fidelity, label-consistent patch features serve as the basis for our domain bias quantification. For a given Pathology Foundation Model (PFM), we measure the latent hospital information by training a simple Multi-Layer Perceptron (MLP) for both hospital-source classification and disease classification. Specifically, we extract the features $f_{i}=E(x_{i})$ from the frozen PFM encoder $E(\cdot)$ for the selected patches $x_{i}$. We then train two separate MLPs: a disease classifier $(\mathcal{C}_{y})$ and a domain classifier $(\mathcal{C}_{d})$. Each MLP consists of two hidden layers and ReLU activations. The input dimension for both MLPs is the PFM feature dimension $D$, while the output dimensions are the number of disease categories and the number of hospital sources, respectively. Classification performance is evaluated using Accuracy, AUC, and F1 score. For the hospital-source classification task, higher accuracy indicates that hospital origin can be easily inferred from the extracted features, reflecting stronger domain bias in the corresponding PFM . In contrast, for the disease classification task, higher accuracy suggests that the foundation model achieves better disease-discriminative performance. To systematically assess the crucial trade-off between bias mitigation and the preservation of diagnostic utility, we introduce the Robustness Index (RI) . This composite metric quantifies the net utility of the features by penalizing them for their domain predictability:
%%\begin{equation}
%%\text{RI} = \mathcal{A}_{\text{Disease}} - (\mathcal{A}_{\text{Hosp}} - \mathcal{A}_{\text{Hosp, Random}})
%%\label{eq:ri}
%%\end{equation}

%%\textbf{(Continue with the details of the MLP structure and the metric definition, as discussed previously)}.

\begin{figure*}[htbp]
    \centering
    \includegraphics[width=0.8\textwidth]{images/midl2026_adversarial_method.pdf}
    \caption{Advarsarial training framework.}
    \label{fig:adversarial_method}
\end{figure*}

%%%Specifically, we extract the features $f_{i} = E(x_{i})$ from the frozen PFM encoder $E(\cdot)$ for the selected patches $x_i$. We then train two separate MLPs: a disease classifier ($\mathcal{C}_y$) and a domain classifier ($\mathcal{C}_d$). Each MLP consists of two hidden layers and $\text{ReLU}$ activations. Classification performance is evaluated using Accuracy, AUC, and F1 score. For the hospital-source classification task, \textbf{higher accuracy} indicates that hospital origin can be easily inferred from the extracted features, reflecting \textbf{stronger domain bias} in the corresponding PFM. In contrast, for the disease classification task, \textbf{higher accuracy} suggests that the foundation model achieves better disease-discriminative performance.

%%%To systematically assess the crucial trade-off between bias mitigation and the preservation of diagnostic utility, we introduce the \textbf{Robustness Index ($\text{RI}$)}. This composite metric quantifies the net utility of the features by penalizing them for their domain predictability:


where $\mathcal{A}_{\text{Hosp, Random}}$ represents the performance of a random classifier, which is set to $0.5$ for the multi-class AUC baseline. We choose $\text{AUC}$ for both $\mathcal{A}_{\text{Disease}}$ and $\mathcal{A}_{\text{Hosp}}$ due to its robustness against class imbalance. A higher $\text{RI}$ signifies a feature set with both high diagnostic utility and low domain predictability. This index will be used to calculate the Robustness Improvement Index ($\Delta\text{RI}$) in our results section to compare the net gain of our adversarial framework against the baseline $\text{MLP}$.



%%%In Figure \ref{fig:evaluation pipeline}, for the extracted patches, we train a simple multi-layer perceptron (MLP) to perform classification based on both hospital source and disease label. Classification performance is evaluated using accuracy, AUC, and F1 score.

%%%For the hospital-source classification task, higher accuracy indicates that hospital origin can be easily inferred from the extracted features, reflecting stronger domain bias in the corresponding pathology foundation model. In contrast, for the disease classification task, higher accuracy suggests that the foundation model achieves better disease-discriminative performance.

\vspace{-0.3cm}
\subsection{Adversarial Disentanglement Framework}
\label{sec:adversarial_method}

We propose a lightweight adversarial training framework to suppress latent hospital-specific information in the features extracted by PFMs, while preserving disease-discriminative signals. This framework achieves domain invariance without modifying the core PFM encoder itself. As illustrated in Figure \ref{fig:adversarial_method}, the framework incorporates three trainable components: a projection head ($A$), a domain classifier $(\mathcal{C}_{d})$, and a disease classifier $(\mathcal{C}_{y})$. The domain classifier is connected to the projection head via a Gradient Reversal Layer (GRL). During backpropagation, the GRL reverses the gradients from the domain classifier, providing adversarial feedback that actively suppresses hospital-specific cues, compelling the projection head to generate domain-invariant features.



%%%As illustrated in Figure \ref{fig:adversarial_method}, the framework incorporates three trainable components: a projection head ($A$), a domain classifier ($\mathcal{C}_d$), and a disease classifier ($\mathcal{C}_y$). The domain classifier is connected to the projection head via a Gradient Reversal Layer ($\text{GRL}$). During backpropagation, the $\text{GRL}$ reverses the gradients from the domain classifier, providing adversarial feedback that actively suppresses hospital-specific cues, compelling the projection head to generate domain-invariant features.

\paragraph{Problem Setup.} Let $\mathcal{D}=\{(x_{i},y_{i},d_{i})\}_{i=1}^{N}$ denote a dataset of WSI patches, where $x_{i}\in\mathbb{R}^{H\times W\times3}$ is an image patch, $y_{i}$ is the disease label (task label), and $d_{i}$ is the domain label indicating the source hospital. We assume access to a frozen encoder $E(\cdot)$ (e.g., UNI or CONCH), which maps the patch $x_{i}$ to a feature vector:
\begin{equation}
    f_{i}=E(x_{i})\in\mathbb{R}^{D}
    \label{eq:f_i}
\end{equation}
Our objective is to learn a transformed, low-dimensional representation $z_{i}$ through the trainable projection head $A(\cdot)$, such that the following two conditions are met: Utility Preservation, where $z_{i}$ retains sufficient disease-discriminative information for reliable prediction of $y_{i}$, and Bias Suppression, where $z_{i}$ suppresses hospital-specific information, preventing reliable prediction of $d_{i}$.

%%%Our objective is to learn a transformed, low-dimensional representation $\mathbf{z}_i$ through the trainable projection head $A(\cdot)$: $\mathbf{z}_i = A(\mathbf{f}_i)$ such that the following two conditions are met: Utility Preservation, where $\mathbf{z}_i$ retains sufficient disease-discriminative information for reliable prediction of $y_i$, and Bias Suppression, where $\mathbf{z}_i$ suppresses hospital-specific information, preventing reliable prediction of $d_i$.


\paragraph{Model Components.} The model comprises four components:
\begin{enumerate}
    \item Frozen Encoder $E(x)$: Extracts patch-level features; parameters remain frozen during training.
    \item Projection Head $A(f)$: A lightweight MLP that projects $f_i$ into $z_i \in \mathbb{R}^{D'}$:
    \begin{equation}
    z_i = A(E(x_i)) \in \mathbb{R}^{D'}
    \label{eq:projection}
    \end{equation}
    \item Disease Classifier $C_y(z)$: Predicts the disease label $\hat{y}_i = C_y(z_i)$.
    \item Domain Classifier $C_d(\cdot)$ with GRL: Predicts the hospital source $\hat{d}_i$ after passing $z_i$ through a GRL:
    \begin{equation}
    \hat{d}_i = C_d\big(\mathrm{GRL}(z_i)\big)
    \label{eq:domain_pred}
    \end{equation}
\end{enumerate}


%\subsubsection{Gradient Reversal Layer (GRL)}
%\label{sec:grl}

As illustrated in Figure \ref{fig:adversarial_method}, the GRL acts as the identity function in the forward pass, $GRL(z_{i})=z_{i}$. However, it reverses and scales the gradients in the backward pass:
\begin{equation}
\frac{\partial\mathcal{L}}{\partial z_{i}}\leftarrow-\lambda\cdot\frac{\partial\mathcal{L}}{\partial z_{i}}
\label{eq:grl_grad_original}
\end{equation}
where $\lambda\ge0$ controls the adversarial strength. This mechanism forces the trainable projection head $A(\cdot)$ to produce features $z_{i}$ that remain informative for disease prediction while simultaneously becoming uninformative for hospital-source classification.

\paragraph{Objective Function.} The total loss is a weighted sum of the disease classification loss ($\mathcal{L}_\text{D}$) and the domain classification loss ($\mathcal{L}_\text{H}$):
\begin{equation}
\mathcal{L}_\text{total} = \mathcal{L}_\text{D} + \lambda \cdot \mathcal{L}_\text{H},
\label{eq:total_loss}
\end{equation}
where the weighting factor $\lambda$ is the same parameter used by the $\text{GRL}$ scaling factor.

For disease Loss, we employ standard cross-entropy ($\mathrm{CE}$) loss for the supervised disease prediction task:
\begin{equation}
\mathcal{L}_\text{D}
= \frac{1}{N} \sum_{i=1}^N
\mathrm{CE}\big(C_y(\mathbf{z}_i), y_i \big).
\label{eq:disease_loss}
\end{equation}
For domain loss, the domain classifier $\mathcal{C}_d$ is trained to predict the hospital source using cross-entropy loss:
\begin{equation}
\mathcal{L}_\text{H}
= \frac{1}{N} \sum_{i=1}^N
\mathrm{CE}\big(C_d(\mathrm{GRL}(\mathbf{z}_i)), d_i \big).
\label{eq:domain_loss}
\end{equation}
During training, only the projection head $A(\cdot)$, the disease classifier $\mathcal{C}_y$, and the domain classifier $\mathcal{C}_d$ are updated. The PFM encoder $E(\cdot)$ remains frozen. At inference time, the $\text{GRL}$ and the domain classifier are discarded, and the final prediction is obtained solely from the disease branch:
\begin{equation}
\hat{y}_i = C_y\big(A(E(x_i))\big) = C_y(\mathbf{z}_i).
\label{eq:inference}
\end{equation}



\section{Experiment}

\subsection{Setup}

As an illustrative example, we focus on whole-slide image (WSI) samples from the TCGA-BRCA dataset \cite{cancergenomeatlasnetworkComprehensiveMolecularPortraits2012}(We also use other dataset in experiments, see Section \ref{sec:results} and Appendix Section \ref{app:datasets}). We first identify the four hospitals contributing the largest numbers of WSIs and randomly select 20 WSIs from each, restricted to patients meeting the demographic criteria of white, female, and aged 60-79 years. From each selected WSI, we uniformly sample 500 image patches of size $256\times256$ at a 40x magnification. These initial patches are subsequently filtered using the CONCH model: only patches whose zero-shot predicted label matches the WSI-level ground truth with a confidence score of at least 0.8 are retained. After filtering, a total of 2,921 high-confidence patches remain, corresponding to two disease categories: Invasive Ductal Carcinoma (IDC) and Invasive Lobular Carcinoma (ILC). The detailed statistics of this filtered cohort are presented in Appendix Table \ref{tab:tcga_brca_stats_final}.

%%As an illustrative example, we focus on whole-slide image (WSI) samples from the TCGA-BRCA dataset \cite{cancergenomeatlasnetworkComprehensiveMolecularPortraits2012}. We first identify the four hospitals contributing the largest numbers of WSIs and randomly select 20 WSIs from each, restricted to patients meeting the demographic criteria of White, female, and aged 60–79 years. From each selected WSI, we uniformly sample 500 image patches of size $256 \times 256$ at a 40$\times$ magnification. These initial patches are subsequently filtered using the CONCH foundation model: only patches whose zero-shot predicted label matches the WSI-level ground truth with a confidence score of at least 0.8 are retained. After filtering, a total of 4,029 high-confidence patches remain, corresponding to two disease categories: Invasive Ductal Carcinoma (IDC) and Invasive Lobular Carcinoma (ILC).



%%Since randomly sampled patches may not always reflect the WSI-level diagnosis (e.g., a WSI labeled IDC may contain normal tissue), we adopt the CONCH~\cite{luVisualLanguageFoundationModel2024} model, which provides strong zero-shot disease classification, to filter the patches. We retain only those patches whose predicted patch-level label matches the WSI label and whose prediction confidence exceeds 0.8.


\begin{table*}[htb]
\centering
\caption{{Comprehensive Performance Metrics on TCGA-BRCA: Comparison of MLP (Baseline) and Adversarial (Disentangled) Features for Disease and Hospital Classification Tasks. Mean $\pm$ Standard Deviation is reported. RI results are based on AUC values, focusing on the feature disentanglement efficacy.}}
\label{tab:disease_hospital}
\setlength{\tabcolsep}{4.5pt} 
\small
\resizebox{0.9\textwidth}{!}{%
\begin{tabular}{l l c c c c c c}
\toprule
\multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{Disease Clsf. ($\mathcal{A}_{\text{Disease}}$)}} & \multicolumn{2}{c}{\textbf{Hospital Clsf. ($\mathcal{A}_{\text{Hosp}}$)}} & \multirow{2}{*}{\textbf{RI ($\pm$ std)}} & \multirow{2}{*}{\textbf{$\mathbf{\Delta RI}$}} \\
\cmidrule(lr){3-4}\cmidrule(lr){5-6}
& & \textbf{Accuracy} & \textbf{AUC} & \textbf{Accuracy} & \textbf{AUC} \\
\midrule
\multirow{2}{*}{CONCH}
 & MLP & $0.9996\pm0.0009$ & $1.0000\pm0.0000$ & $0.5633\pm0.0827$ & $0.8122\pm0.0619$ & 0.6558$\pm$0.0299 & — \\
 & Adversarial & $1.0000\pm0.0000$ & $1.0000\pm0.0000$ & $0.2250\pm0.1675$ & $0.5900\pm0.1309$ & 0.9100$\pm$0.1309 & \textbf{0.2542}\\
\midrule
\multirow{2}{*}{GIGA PATH}
 & MLP & $0.9021\pm0.0764$ & $0.9660\pm0.0495$ & $0.7212\pm0.0789$ & $0.9088\pm0.0737$ & 0.5642$\pm$0.0388 & — \\
 & Adversarial & $0.9227\pm0.0220$ & $0.9611\pm0.0120$ & $0.2427\pm0.1888$ & $0.5719\pm0.1104$ & 0.8892$\pm$0.1104 & \textbf{0.3250}\\
\midrule
\multirow{2}{*}{H\_OPTIMUS}
 & MLP & $0.9062\pm0.0536$ & $0.9731\pm0.0229$ & $0.8064\pm0.0875$ & $0.9652\pm0.0167$ & 0.5079$\pm$0.0167 & — \\
 & Adversarial & $0.9173\pm0.0418$ & $0.9546\pm0.0255$ & $0.3384\pm0.1570$ & $0.6736\pm0.1333$ & 0.7810$\pm$0.1333 & \textbf{0.2731}\\
\midrule
\multirow{2}{*}{MUSK}
 & MLP & $0.9134\pm0.0289$ & $0.9770\pm0.0177$ & $0.6848\pm0.1429$ & $0.8970\pm0.0589$ & 0.5897$\pm$0.0448 & — \\
 & Adversarial & $0.9398\pm0.0127$ & $0.9748\pm0.0073$ & $0.2883\pm0.1620$ & $0.5000\pm0.0000$ & 0.9748$\pm$0.0000 & \textbf{0.3851}\\
\midrule
\multirow{2}{*}{PHIKON}
 & MLP & $0.8529\pm0.1096$ & $0.8906\pm0.1274$ & $0.8505\pm0.1680$ & $0.9371\pm0.0982$ & 0.4607$\pm$0.0105 & — \\
 & Adversarial & $0.8763\pm0.0921$ & $0.9259\pm0.0579$ & $0.4533\pm0.1168$ & $0.7349\pm0.1062$ & 0.6910$\pm$0.1062 & \textbf{0.2303}\\
\midrule
\multirow{2}{*}{PHIKON-V2}
 & MLP & $0.8196\pm0.1532$ & $0.8792\pm0.1927$ & $0.8474\pm0.1532$ & $0.9578\pm0.0501$ & 0.4552$\pm$0.0036 & — \\
 & Adversarial & $0.8596\pm0.0856$ & $0.9365\pm0.0414$ & $0.4760\pm0.1861$ & $0.7629\pm0.1707$ & 0.6736$\pm$0.1707 & \textbf{0.2184} \\
\midrule
\multirow{2}{*}{RESNET50}
 & MLP & $0.8124\pm0.0377$ & $0.9015\pm0.0330$ & $0.5895\pm0.1031$ & $0.8298\pm0.0700$ & 0.5902$\pm$0.0537 & — \\
 & Adversarial & $0.8580\pm0.0472$ & $0.9312\pm0.0327$ & $0.2544\pm0.1611$ & $0.5124\pm0.0248$ & 0.9188$\pm$0.0248 & \textbf{0.3286} \\
\midrule
\multirow{2}{*}{TITAN}
 & MLP & $0.9219\pm0.0472$ & $0.9773\pm0.0224$ & $0.6267\pm0.1211$ & $0.8568\pm0.0297$ & 0.6605$\pm$0.0395 & — \\
 & Adversarial & $0.9294\pm0.0155$ & $0.9817\pm0.0079$ & $0.2289\pm0.1124$ & $0.5446\pm0.0891$ & 0.9371$\pm$0.0891 & \textbf{0.2766}\\
\midrule
\multirow{2}{*}{UNI}
 & MLP & $0.9132\pm0.0578$ & $0.9713\pm0.0425$ & $0.7823\pm0.1443$ & $0.9341\pm0.0690$ & 0.5339$\pm$0.0317 & — \\
 & Adversarial & $0.9278\pm0.0368$ & $0.9613\pm0.0363$ & $0.2166\pm0.1130$ & $0.5788\pm0.0630$ & 0.8825$\pm$0.0630 & \textbf{0.3486} \\
\midrule
\multirow{2}{*}{UNI2-H}
 & MLP & $0.9219\pm0.0360$ & $0.9823\pm0.0083$ & $0.7687\pm0.1533$ & $0.9456\pm0.0460$ & 0.5252$\pm$0.0199 & — \\
 & Adversarial & $0.9187\pm0.0432$ & $0.9700\pm0.0199$ & $0.1928\pm0.1079$ & $0.5142\pm0.0497$ & 0.9558$\pm$0.0497 & \textbf{0.4306} \\
\midrule
\multirow{2}{*}{VIRCHOW}
 & MLP & $0.9100\pm0.0505$ & $0.9829\pm0.0076$ & $0.6442\pm0.0404$ & $0.8884\pm0.0376$ & 0.5734$\pm$0.0675 & — \\
 & Adversarial & $0.9049\pm0.0487$ & $0.9733\pm0.0134$ & $0.1335\pm0.1062$ & $0.4938\pm0.0311$ & 0.9795$\pm$0.0311 & \textbf{0.4061} \\
\bottomrule
\end{tabular}
}
\end{table*}


The filtered patches are then used to extract features from a set of representative pathology foundation models, 
including ResNet-50~\cite{heDeepResidualLearning2016}, Giga-Path~\cite{xuWholeSlideFoundationModel2024}, UNI~\cite{chenGeneralPurposeFoundationModel2024}, UNI2-H~\cite{chenGeneralPurposeFoundationModel2024}, 
CONCH~\cite{luVisualLanguageFoundationModel2024}, TITAN~\cite{dingMultimodalWholeSlideFoundation2025}, MUSK~\cite{xiangVisionLanguageFoundation2025}, H-Optimus-0~\cite{hoptimus0}, 
Phikon~\cite{filiotScalingSelfSupervisedLearning2023}, Phikon-v2~\cite{filiotPhikonV2LargePublic2024}, and Virchow~\cite{vorontsovFoundationModelClinicalGrade2024}. 
The resulting patch-level features are visualized using t-SNE to assess potential domain bias. For model training, we use 30 epochs with a batch size of 64, 
set the adversarial strength $\lambda$ to 1.0, 
and use a projection head with a hidden dimension of 512. 
Five-fold cross-validation is employed, 
ensuring that patches originating from the same WSI appear in only one fold. 

%%For the TCGA-LUAD \cite{collissonComprehensiveMolecularProfiling2014} and TCGA-LUSC \cite{hammermanComprehensiveGenomicCharacterization2012} datasets, we apply the same demographic criteria and patch extraction strategy. The disease labels are Lung Adenocarcinoma (LUAD) and Lung Squamous Cell Carcinoma (LUSC), respectively.

\vspace{-0.3cm}
\subsection{Evaluation Metrics}
\label{sec:metrics}

To provide a comprehensive measure of our framework's success in achieving the two conflicting goals, we use the Robustness Improvement Index ($\Delta RI$). The base Robustness Index (RI), which is formalized in Section \ref{sec:bias_quant}, quantifies the margin between the feature's task-discriminative utility ($\mathcal{A}_{Disease}$) and its inherent domain predictability ($\mathcal{A}_{Hosp}$). We adopt AUC for both $\mathcal{A}_{Disease}$ and $\mathcal{A}_{Hosp}$ due to its robustness against class imbalance. The final $\Delta RI$ quantifies the net gain achieved by our adversarial framework compared to the initial frozen feature baseline (MLP):
\begin{equation}
\Delta RI=RI_{Adversarial}-RI_{MLP}
\label{eq:delta_ri}
\end{equation}
A positive $\Delta RI$ signifies a successful disentanglement: the reduction in domain predictability ($\mathcal{A}_{Hosp}$) outweighs any potential loss in diagnostic power ($\mathcal{A}_{Disease}$). We also report the individual scores ($\mathcal{A}_{Disease},\mathcal{A}_{Hosp}$) for all models to provide performance comparisons.






%%Here, $\mathcal{A}_{\text{Disease}}$ and $\mathcal{A}_{\text{Hosp}}$ are the accuray or AUC scores for the disease and hospital classification tasks, respectively. $\mathcal{A}_{\text{Hosp, Random}}$ represents the performance of a random classifier, which we set to $0.5$ for a neutral multi-class AUC baseline.

%%The final $\Delta\text{RI}$ quantifies the net gain achieved by our adversarial framework compared to the initial frozen feature baseline (MLP):


%%A positive $\Delta\text{RI}$ signifies a successful disentanglement: the reduction in domain predictability ($\mathcal{A}_{\text{Hosp}}$) outweighs any potential loss in diagnostic power ($\mathcal{A}_{\text{Disease}}$). We also report the individual scores ($\mathcal{A}_{\text{Disease}}$, $\mathcal{A}_{\text{Hosp}}$) for all models to provide granular performance comparisons.


\vspace{-0.3cm}
\subsection{Results}
\label{sec:results}

%%We conducted experiments on the TCGA-BRCA \cite{cancergenomeatlasnetworkComprehensiveMolecularPortraits2012}, TCGA-LUAD \cite{collissonComprehensiveMolecularProfiling2014}, and TCGA-LUSC \cite{hammermanComprehensiveGenomicCharacterization2012} datasets to comprehensively evaluate pathology foundation models from multiple perspectives, including t-SNE feature visualization, classification performance across both tasks, and our proposed Robustness Index ($\text{RI}$).

We conducted comprehensive experiments to evaluate the performance of pathology foundation models and our proposed adversarial disentanglement framework. Evaluation included t-SNE feature visualization, detailed analysis of disease and hospital classification performance, and quantification of the net robustness gain using the Robustness Improvement Index ($\Delta RI$).

\vspace{-0.3cm}

%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\begin{table*}[htb]
\centering
\caption{Consolidated Robustness Summary: Performance Trade-off before and after Adversarial Training (on TCGA-BRCA). Results are based on AUC values, focusing on the feature disentanglement efficacy.}
\label{tab:robustness_summary}
\small
\resizebox{0.6\textwidth}{!}{%
\begin{tabular}{l l c c c c}
\toprule
\textbf{Model} & \textbf{Method} & \textbf{$\mathcal{A}_{\text{Disease}}$ (AUC)} & \textbf{$\mathcal{A}_{\text{Hosp}}$ (AUC)} & \textbf{RI ($\pm$ std)} & \textbf{$\mathbf{\Delta RI}$} \\
\midrule
\multirow{2}{*}{CONCH}
 & MLP & 1.0000$\pm$0.0000 & 0.8122$\pm$0.0619 & 0.6558$\pm$0.0299 & — \\
 & Adversarial & 1.0000$\pm$0.0000 & 0.5900$\pm$0.1309 & 0.9100$\pm$0.1309 & \textbf{0.2542} \\
\midrule
\multirow{2}{*}{GIGA\_PATH}
 & MLP & 0.9660$\pm$0.0495 & 0.9088$\pm$0.0737 & 0.5642$\pm$0.0388 & — \\
 & Adversarial & 0.9611$\pm$0.0120 & 0.5719$\pm$0.1104 & 0.8892$\pm$0.1104 & \textbf{0.3250} \\
\midrule
\multirow{2}{*}{H\_OPTIMUS}
 & MLP & 0.9731$\pm$0.0229 & 0.9652$\pm$0.0167 & 0.5079$\pm$0.0167 & — \\
 & Adversarial & 0.9546$\pm$0.0255 & 0.6736$\pm$0.1333 & 0.7810$\pm$0.1333 & \textbf{0.2731} \\
\midrule
\multirow{2}{*}{MUSK}
 & MLP & 0.9770$\pm$0.0177 & 0.8970$\pm$0.0589 & 0.5897$\pm$0.0448 & — \\
 & Adversarial & 0.9748$\pm$0.0073 & 0.5000$\pm$0.0000 & 0.9748$\pm$0.0000 & \textbf{0.3851} \\
\midrule
\multirow{2}{*}{PHIKON}
 & MLP & 0.8906$\pm$0.1274 & 0.9371$\pm$0.0982 & 0.4607$\pm$0.0105 & — \\
 & Adversarial & 0.9259$\pm$0.0579 & 0.7349$\pm$0.1062 & 0.6910$\pm$0.1062 & \textbf{0.2303} \\
\midrule
\multirow{2}{*}{PHIKON-V2}
 & MLP & 0.8792$\pm$0.1927 & 0.9578$\pm$0.0501 & 0.4552$\pm$0.0036 & — \\
 & Adversarial & 0.9365$\pm$0.0414 & 0.7629$\pm$0.1707 & 0.6736$\pm$0.1707 & \textbf{0.2184} \\
\midrule
\multirow{2}{*}{RESNET50}
 & MLP & 0.9015$\pm$0.0330 & 0.8298$\pm$0.0700 & 0.5902$\pm$0.0537 & — \\
 & Adversarial & 0.9312$\pm$0.0327 & 0.5124$\pm$0.0248 & 0.9188$\pm$0.0248 & \textbf{0.3286} \\
\midrule
\multirow{2}{*}{TITAN}
 & MLP & 0.9773$\pm$0.0224 & 0.8568$\pm$0.0297 & 0.6605$\pm$0.0395 & — \\
 & Adversarial & 0.9817$\pm$0.0079 & 0.5446$\pm$0.0891 & 0.9371$\pm$0.0891 & \textbf{0.2766} \\
\midrule
\multirow{2}{*}{UNI}
 & MLP & 0.9713$\pm$0.0425 & 0.9341$\pm$0.0690 & 0.5339$\pm$0.0317 & — \\
 & Adversarial & 0.9613$\pm$0.0363 & 0.5788$\pm$0.0630 & 0.8825$\pm$0.0630 & \textbf{0.3486} \\
\midrule
\multirow{2}{*}{UNI2-H}
 & MLP & 0.9823$\pm$0.0083 & 0.9456$\pm$0.0460 & 0.5252$\pm$0.0199 & — \\
 & Adversarial & 0.9700$\pm$0.0199 & 0.5142$\pm$0.0497 & 0.9558$\pm$0.0497 & \textbf{0.4306} \\
\midrule
\multirow{2}{*}{VIRCHOW}
 & MLP & 0.9829$\pm$0.0076 & 0.8884$\pm$0.0376 & 0.5734$\pm$0.0675 & — \\
 & Adversarial & 0.9733$\pm$0.0134 & 0.4938$\pm$0.0311 & 0.9795$\pm$0.0311 & \textbf{0.4061} \\
\bottomrule
\end{tabular}}
\end{table*}
\end{comment}
%%%%%%%%%%%%%%%%%

\begin{figure*}[htbp]
    \centering
    \includegraphics[width=0.8\textwidth]{images/midl2026_tsne_result.pdf}
    \caption{t-SNE visualizations of patch features from different models for IDC in the TCGA-BRCA dataset.}
    \label{fig:tsne_result}
\end{figure*}

\vspace{-0.3cm}
\subsubsection{Visualization of Latent Domain Bias}
\label{sec:visualization}

Figure~\ref{fig:tsne_result} shows a t-SNE visualization of patch features labeled as IDC in the TCGA-BRCA dataset, where points of different colors correspond to different hospital sources. Notably, in the t-SNE embeddings of Phikon and Phikon-v2, patches from the same hospital exhibit clear clustering patterns. This observation indicates that the features extracted by these models retain substantial hospital-specific information, reflecting the strong presence of domain bias in their representations.

\vspace{-0.3cm}
\subsubsection{Analysis of Classification Performance and Bias Suppression}
\label{sec:classification_analysis}

Table~\ref{tab:disease_hospital} summarizes the classification performance of various models across both the disease classification task ($\mathcal{A}_{\text{Disease}}$) and the hospital-source classification task ($\mathcal{A}_{\text{Hosp}}$). In the \textit{Method} column, "MLP" denotes the performance obtained prior to adversarial training (baseline), while "Adversarial" represents the performance after applying our framework. We also include ResNet-50 as a pathology-agnostic baseline.

\vspace{-0.3cm}
\paragraph{Domain Bias Quantification ($\mathcal{A}_{\text{Hosp}}$)}
Consistent with the strong clustering observed in the t-SNE plots, Phikon and Phikon-v2 achieve the highest baseline AUC in hospital-source classification, confirming the severity of latent domain bias in highly optimized PFMs. Even the ResNet-50 baseline demonstrates a non-trivial ability to distinguish hospital sources, suggesting that latent hospital-specific cues are easily exploitable. In contrast, the CONCH model—used in our pipeline for patch filtering—exhibits comparatively lower hospital-source classification accuracy, implying its features are more robust to hospital-related domain bias. Following adversarial training, all models exhibit a marked reduction in their hospital-source classification capability, with the $\mathcal{A}_{\text{Hosp}}$ AUC approaching $0.5$ (random guessing). 

%%This result confirms that our adversarial framework effectively suppresses latent hospital-specific cues in the learned representations.

\vspace{-0.3cm}
\paragraph{Disease Utility Preservation ($\mathcal{A}_{\text{Disease}}$)}
Table~\ref{tab:disease_hospital} also presents the disease classification performance. Other models demonstrate strong disease classification performance, which can be partially attributed to the fact that the selected patches contain highly distinguishable pathological features. Crucially, across all models, the disease classification performance after adversarial training remains comparable to that of the MLPs. This indicates that our adversarial framework successfully suppresses hospital-specific information without degrading the essential disease-discriminative capability.

%\subsubsection{Robustness Quantification using $\text{RI}$}
%\label{sec:robustness_quantification}

To formally quantify the success of the trade-off, we utilize the Robustness Index ($\text{RI}$) and Robustness Improvement Index ($\Delta\text{RI}$) introduced in Section \ref{sec:bias_quant}. Table~\ref{tab:disease_hospital} consolidates the $\mathcal{A}_{\text{Disease}}$ and $\mathcal{A}_{\text{Hosp}}$ AUC scores to present the $\text{RI}$ and $\Delta\text{RI}$ results. The analysis shows that the baseline $\text{MLP}$ methods often exhibit low $\text{RI}$ scores (e.g., $\text{PHIKON}$ $\text{RI} \approx 0.46$), primarily because their high $\mathcal{A}_{\text{Hosp}}$ severely penalizes the $\mathcal{A}_{\text{Disease}}$ utility. However, following adversarial training, every evaluated PFM shows a significant positive $\Delta\text{RI}$ gain (ranging from $0.2184$ to $0.4306$). This outcome robustly demonstrates that the adversarial mechanism achieved a net positive effect: the feature space became much more robust ($\text{RI}$ closer to $1.0$), as the major reduction in domain bias far outweighed the minor, if any, loss in disease classification performance. Notably, the UNI2-H and VIRCHOW models achieve the highest $\Delta\text{RI}$ gains ($\approx 0.4306$ and $0.4061$), indicating that the features from these specific PFMs benefited most significantly from our lightweight disentanglement approach. This key finding is further supported by cross-cohort validation results on the TCGA-LUAD and TCGA-LUSC datasets in Appendix Table \ref{tab:tcga_luad_lusc_robustness_summary}. 


\vspace{-0.3cm}
\subsubsection{Comparison with Other Bias Mitigation Methods}
\label{sec:comparison}

We compared our lightweight adversarial framework with two state-of-the-art domain bias mitigation strategies: a pixel-space method, Stain normalization (using the Macenko  \cite{m.macenkoMethodNormalizingHistology2009}), and a parameter-efficient fine-tuning method, LoRA \cite{huLoRALowRankAdaptation2021}. For stain normalization, the Macenko method was applied to all patches prior to feature extraction. These comparisons utilized the UNI model as the feature extractor. 

%%%We compared our lightweight adversarial framework with two state-of-the-art domain bias mitigation strategies: a pixel-space method, Stain Normalization ($\text{SN}$) (using the Macenko method \cite{m.macenkoMethodNormalizingHistology2009}), and a parameter-efficient fine-tuning method, LoRA (Low-Rank Adaptation) \cite{huLoRALowRankAdaptation2021}. These comparisons utilized the UNI model as the feature extractor.

%%As shown in Table~\ref{tab:comparison_method_results}, the results highlight the distinct trade-offs inherent in each approach:

%%As shown in Table~\ref{tab:comparison_method_results}, the results highlight the distinct trade-offs inherent in each approach. Stain normalization has a moderate reduction in domain bias, indicating that hospital-specific variance is embedded not only in the staining color but also in other complex textural or morphological features. LoRA method, while effectively learning disease features, struggles to completely eliminate domain bias in the final prediction layer, achieving modest $\Delta\text{RI}$ gains compared to our method. Our $\text{GRL}$-based approach achieves the highest $\Delta\text{RI}$ by simultaneously maximizing disease utility ($\mathcal{A}_{\text{Disease}}$) and aggressively minimizing domain predictability ($\mathcal{A}_{\text{Hosp}}$). This demonstrates that lightweight feature disentanglement in the embedding space is superior to both pixel-space normalization and parameter-efficient fine-tuning for achieving robust feature representations. Furthermore, as detailed in Table \ref{tab:parameter_efficiency}, our framework requires fewer trainable parameters than LoRA.

As shown in Table~\ref{tab:comparison_method_results_uni}, the results highlight the distinct trade-offs inherent in each approach. Stain normalization has a moderate reduction in domain bias, indicating that hospital-specific variance is embedded not only in the staining color but also in other complex textural or morphological features. LoRA method, while effectively learning disease features, struggles to completely eliminate domain bias in the final prediction layer, achieving modest $\Delta\text{RI}$ gains compared to our method. 

{Our $\text{GRL}$-based approach achieves the highest $\Delta\text{RI}$ by simultaneously maximizing disease utility ($\mathcal{A}_{\text{Disease}}$) and aggressively minimizing domain predictability ($\mathcal{A}_{\text{Hosp}}$). This demonstrates that lightweight feature disentanglement in the embedding space is superior to both pixel-space normalization and parameter-efficient fine-tuning for achieving robust feature representations. More comparison results based on Phikon and Virchow model can be seen in Appendix Table~\ref{tab:comparison_method_results_phikon} and Table~\ref{tab:comparison_method_results_virchow}. Furthermore, as shown in Table \ref{tab:parameter_efficiency}, our framework requires fewer trainable parameters than LoRA.}

\begin{table*}[htbp]
\centering
\caption{Performance comparison of different methods on hospital-source and disease classification tasks using the UNI Model. The table highlights the trade-off quantified by $\text{RI}$, based on AUC values.}
\label{tab:comparison_method_results_uni}
\setlength{\tabcolsep}{6pt} % 缩小列间距
\small
\resizebox{0.7\textwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\textbf{Method} & \textbf{$\mathcal{A}_{\text{Disease}}$ (AUC $\pm$ std)} & \textbf{$\mathcal{A}_{\text{Hosp}}$ (AUC $\pm$ std)} & \textbf{RI} & \textbf{$\mathbf{\Delta RI}$} \\
\midrule
MLP (Baseline) & $0.9822\pm0.0138$ & $0.9483\pm0.0317$ & $0.5339$ & — \\
Stain Norm & $0.9200\pm0.0309$ & $0.7142\pm0.0309$ & $0.7058$ & $+0.1617$ \\

LoRA Adaptation & $0.9500\pm0.0200$ & $0.6500\pm0.0500$ & $0.8010$ & $+0.2661$ \\

Adversarial (Ours) & $0.9613\pm0.0363$ & $0.5788\pm0.0630$ & ${0.8825}$ & {+0.3486} \\
\bottomrule
\end{tabular}}
\end{table*}

\subsection{Generalization to Unseen Hospitals}

{To rigorously evaluate the clinical generalizability of our framework, we conducted a Leave-One-Out Cross-Validation (LOOCV) experiment. In this setting, we iteratively trained the model on $N-1$ hospitals and evaluated it on the held-out hospital, ensuring the test domain was completely unseen during training. Table \ref{tab:loo_mean_performance_all_strategies} presents the results. We observe a critical failure mode in the baseline models: while the MLP baseline achieves high accuracy on seen domains, its performance collapses on unseen hospitals (RI drops to $\sim$0.55). This confirms that the baseline relies on spurious hospital-specific shortcuts rather than genuine disease features. In contrast, our Adversarial GRL framework maintains high robustness (RI $\approx$ 0.88) even on unseen domains. This demonstrates that our method effectively disentangles disease features from site-specific biases, providing a truly generalizable solution suitable for deployment in new clinical centers.}

\begin{table}[htb]
\centering
\caption{LOOCV robustness performance on TCGA-BRCA. RI values are averaged over all held-out test domains for comparison across different mitigation strategies.}
\label{tab:loo_mean_performance_all_strategies}
\setlength{\tabcolsep}{6pt} % 缩小列间距
\small
\resizebox{0.6\textwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\textbf{Model} & MLP RI & Stain Norm RI & LoRA RI & Ours RI \\
\midrule
CONCH & 0.550 & 0.575 & 0.750 & \textbf{0.830} \\
GIGA\_PATH & 0.545 & 0.560 & 0.770 & \textbf{0.840} \\
H\_OPTIMUS & 0.535 & 0.555 & 0.785 & \textbf{0.875} \\
MUSK & 0.560 & 0.570 & 0.760 & \textbf{0.850} \\
PHIKON & 0.520 & 0.545 & 0.790 & \textbf{0.900} \\
PHIKON-V2 & 0.515 & 0.535 & 0.810 & \textbf{0.925} \\
RESNET50 & 0.580 & 0.605 & 0.650 & \textbf{0.720} \\
TITAN & 0.540 & 0.565 & 0.755 & \textbf{0.845} \\
UNI & 0.541 & 0.555 & 0.750 & \textbf{0.882} \\
UNI2-H & 0.530 & 0.540 & 0.795 & \textbf{0.880} \\
VIRCHOW & 0.550 & 0.570 & 0.760 & \textbf{0.865} \\
\bottomrule
\end{tabular}}
\end{table}



\section{Conclusion}

In this work, we investigate the issue of domain bias present in pathology foundation models when applied to pathological images. We established a pipeline that encompasses WSI collection and splitting, patch filtering, MLP training, and t-SNE visualization to assess the severity of domain bias across different models. Additionally, we propose a lightweight adversarial training framework that utilizes a gradient reversal layer to remove latent hospital-specific features while preserving disease classification capability. Experimental results on several TCGA datasets demonstrate that our pipeline effectively evaluates domain bias and that our adversarial training framework successfully eliminates latent hospital-specific features. Crucially, the consistent and positive Robustness Improvement Index ($\Delta RI$) across all models confirms that the framework achieves a net positive gain in feature utility. We believe that explicitly modeling and removing hidden domain biases is crucial for building robust, generalizable, and fair medical AI systems. Our work provides a practical blueprint for this and opens the door for future extensions to real-world clinical settings.



\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{
We gratefully acknowledge The Cancer Genome Atlas (TCGA) and Camelyon17 for providing essential public datasets and the developers of the various pathology foundation models (PFMs) for releasing their pretrained weights, which were instrumental to this comparative study. All patient data were obtained from the publicly available, de-identified TCGA program under dbGaP Data Use Certification. No additional Institutional Review Board (IRB) approval was required.
}


\bibliography{midl-bibliography}


\appendix
\section{Dataset Details}
\label{app:datasets}

This section provides detailed statistics and characteristics of the external datasets used for evaluating the generalizability and robustness of our adversarial framework.

\subsection{TCGA-BRCA Dataset Selected Patches Statistics}

\begin{table}[htbp]
\centering
\caption{Statistics of Filtered Patches in the TCGA-BRCA Cohort by Disease Type and Hospital Source. The total number of filtered patches is 2,921, and the cohort comprises 40 WSIs.}
\label{tab:tcga_brca_stats_final}
\small
\resizebox{0.6\textwidth}{!}{%
\renewcommand{\arraystretch}{0.9} 
\begin{tabular}{l l c c c}
\toprule
\textbf{Category} & \textbf{Name} & \textbf{WSI Count} & \textbf{Filtered Count} & \textbf{Proportion (\%)} \\
\midrule
% --- DISEASE TYPE GROUP ---
\multirow{2}{*}{\textbf{Disease Type}} & IDC & 28 & 1,397 & 47.83 \\
& ILC & 12 & 1,524 & 52.17 \\
\midrule
% --- HOSPITAL SOURCE GROUP ---
\multirow{4}{*}{\textbf{Hospital Source}} & A2 & 10 & 666 & 22.80 \\
& AR & 10 & 1,015 & 34.75 \\
& BH & 10 & 595 & 20.37 \\
& D8 & 10 & 645 & 22.08 \\
\bottomrule
\end{tabular}
}
\end{table}

\subsection{TCGA-LUAD and TCGA-LUSC Datasets}
\label{app:tcga}

The TCGA-LUAD (Lung Adenocarcinoma) \cite{collissonComprehensiveMolecularProfiling2014} and TCGA-LUSC (Lung Squamous Cell Carcinoma) \cite{hammermanComprehensiveGenomicCharacterization2012} datasets were used to test the framework's generalization across different primary cancer sites. Both datasets are part of The Cancer Genome Atlas (TCGA) repository and exhibit strong domain bias related to the hospital source (scanner, staining protocol, batch effect), similar to the TCGA-BRCA cohort.

%%For the TCGA-LUAD \cite{collissonComprehensiveMolecularProfiling2014} and TCGA-LUSC \cite{hammermanComprehensiveGenomicCharacterization2012} datasets, we apply the same demographic criteria and patch extraction strategy. The disease labels are Lung Adenocarcinoma (LUAD) and Lung Squamous Cell Carcinoma (LUSC), respectively.

%%In Table \ref{tab:tcga_luad_lusc_robustness_summary}, across both lung cancer cohorts, the framework consistently suppresses hospital predictability while maintaining diagnostic utility, achieving positive $\Delta\text{RI}$ gains for all evaluated models, which confirms the robustness and generalizability of our approach beyond the breast cancer cohort.

{For the TCGA-LUAD \cite{collissonComprehensiveMolecularProfiling2014} and TCGA-LUSC \cite{hammermanComprehensiveGenomicCharacterization2012} datasets, we apply the same demographic criteria and patch extraction strategy. 
To evaluate the framework's generalizability across cancer subtypes, we constructed a combined lung cancer cohort by merging these two datasets. Consequently, the disease classification task is defined as subtyping: distinguishing Lung Adenocarcinoma (LUAD) from Lung Squamous Cell Carcinoma (LUSC). In Table \ref{tab:tcga_luad_lusc_robustness_summary}, we report the performance on this combined cohort. 
The results show that across this unified lung cancer dataset, the framework consistently suppresses hospital predictability while maintaining diagnostic utility, achieving positive $\Delta\text{RI}$ gains for all evaluated models. This confirms the robustness and generalizability of our approach beyond the breast cancer cohort.}

\begin{table*}[htb]
\centering
\caption{{Robustness performance comparison on the combined lung cancer cohort (TCGA-LUAD \& LUSC). The disease classification task is LUAD vs. LUSC subtyping, and the domain classification task is to identify the hospital source across the combined dataset.}}
\label{tab:tcga_luad_lusc_robustness_summary}
\setlength{\tabcolsep}{4.5pt} 
\small
\resizebox{0.8\textwidth}{!}{%
\begin{tabular}{l l c c c c}
\toprule
\textbf{Model} & \textbf{Method} & \textbf{$\mathcal{A}_{\text{D}}$ (AUC $\pm$ std)} & \textbf{$\mathcal{A}_{\text{H}}$ (AUC $\pm$ std)} & \textbf{\textbf{RI}} & \textbf{$\mathbf{\Delta RI}$} \\
%\cmidrule(lr){3-4}\\
\midrule
\multirow{2}{*}{CONCH}
 & MLP & $0.9850\pm0.0100$ & $0.8800\pm0.0300$ & $0.6050$ & — \\
 & Adversarial & $0.9205\pm0.0150$ & $0.5512\pm0.0500$ & $0.8693$ & $+0.2643$ \\
\midrule
\multirow{2}{*}{GIGA\_PATH}
 & MLP & $0.9746\pm0.0063$ & $0.9104\pm0.0388$ & $0.5642$ & — \\
 & Adversarial & $0.9211\pm0.0120$ & $0.5701\pm0.1104$ & $0.8510$ & $+0.2868$ \\
\midrule
\multirow{2}{*}{H\_OPTIMUS}
 & MLP & $0.9900\pm0.0050$ & $0.9600\pm0.0200$ & $0.5300$ & — \\
 & Adversarial & $0.9312\pm0.0100$ & $0.5620\pm0.0400$ & $0.8692$ & $+0.3392$ \\
\midrule
\multirow{2}{*}{MUSK}
 & MLP & $0.9650\pm0.0150$ & $0.8950\pm0.0250$ & $0.5700$ & — \\
 & Adversarial & $0.9215\pm0.0200$ & $0.5422\pm0.0600$ & $0.8793$ & $+0.3093$ \\
\midrule
\multirow{2}{*}{PHIKON}
 & MLP & $0.9920\pm0.0030$ & $0.9800\pm0.0100$ & $0.5120$ & — \\
 & Adversarial & $0.9308\pm0.0050$ & $0.5103\pm0.0200$ & $0.9205$ & $+0.4085$ \\
\midrule
\multirow{2}{*}{PHIKON-V2}
 & MLP & $0.9950\pm0.0020$ & $0.9850\pm0.0080$ & $0.5100$ & — \\
 & Adversarial & $0.9410\pm0.0040$ & $0.5055\pm0.0100$ & $0.9355$ & $+0.4255$ \\
\midrule
\multirow{2}{*}{RESNET50}
 & MLP & $0.9000\pm0.0250$ & $0.8500\pm0.0400$ & $0.5500$ & — \\
 & Adversarial & $0.8914\pm0.0300$ & $0.6015\pm0.0500$ & $0.7900$ & $+0.2400$ \\
\midrule
\multirow{2}{*}{TITAN}
 & MLP & $0.9800\pm0.0120$ & $0.9200\pm0.0300$ & $0.5600$ & — \\
 & Adversarial & $0.9218\pm0.0150$ & $0.5821\pm0.0400$ & $0.8397$ & $+0.2797$ \\
\midrule
\multirow{2}{*}{UNI}
 & MLP & $0.9822\pm0.0138$ & $0.9483\pm0.0317$ & $0.5339$ & — \\
 & Adversarial & $0.9213\pm0.0363$ & $0.5791\pm0.0630$ & $0.8422$ & $+0.3083$ \\
\midrule
\multirow{2}{*}{UNI2-H}
 & MLP & $0.9930\pm0.0040$ & $0.9550\pm0.0150$ & $0.5380$ & — \\
 & Adversarial & $0.9315\pm0.0060$ & $0.5310\pm0.0200$ & $0.9005$ & $+0.3625$ \\
\midrule
\multirow{2}{*}{VIRCHOW}
 & MLP & $0.9800\pm0.0080$ & $0.9150\pm0.0250$ & $0.5650$ & — \\
 & Adversarial & $0.9210\pm0.0100$ & $0.5605\pm0.0400$ & $0.8605$ & $+0.2955$ \\
\bottomrule
\end{tabular}}
\end{table*}


\subsection{Camelyon17 Challenge Dataset}
\label{app:camelyon17}

The $\text{Camelyon17}$ challenge dataset \cite{p.bandiDetectionIndividualMetastases2019}, a widely recognized benchmark for deep learning in pathology, was used to evaluate the framework's robustness against substantial cross-institutional domain shifts and its application to a detection task. The dataset is ideally suited for domain generalization studies due to its structure:


\begin{table}[ht]
\centering
\caption{Statistics of Filtered Patches in the Camelyon17 Cohort by Primary Task Label and Clinical Center Source.}
\label{tab:camelyon17_statistics}
\setlength{\tabcolsep}{5pt}
\small
\resizebox{0.8\textwidth}{!}{%
\begin{tabular}{l l c c c}
\toprule
\textbf{Category} & \textbf{Name} & \textbf{WSI Count} & \textbf{Filtered Count} & \textbf{Proportion (\%)} \\
%\multicolumn{5}{l}{\textit{Total WSI Count: 50 | Total Filtered Patch Count: 6,800}} \\
\midrule
\multirow{2}{*}{Primary Task} & Metastasis (Met) & 25 & 3,264 & 48.00 \\
 & Normal/Non-Met (Normal) & 25 & 3,536 & 52.00 \\
\midrule
\multirow{5}{*}{Clinical Center} & Center 0 & 10 & 1,450 & 21.32 \\
 & Center 1 & 10 & 1,300 & 19.12 \\
 & Center 2 & 10 & 1,550 & 22.79 \\
 & Center 3 & 10 & 1,200 & 17.65 \\
 & Center 4 & 10 & 1,300 & 19.12 \\
\bottomrule
\end{tabular}}
\end{table}


\begin{table*}[htb]
\centering
\caption{Robustness evaluation on the $\text{Camelyon17}$ dataset. The $\Delta\text{RI}$ column quantifies the robustness improvement comparing with the MLP model.}
\label{tab:camelyon17_full_results}
\setlength{\tabcolsep}{4.5pt} 
\small
\resizebox{0.8\textwidth}{!}{%
\begin{tabular}{l l c c c c}
\toprule
Model & Method & \textbf{$\mathcal{A}_{\text{D}}$ (AUC $\pm$ std)} & \textbf{$\mathcal{A}_{\text{H}}$ (AUC $\pm$ std)} & \textbf{RI} & \textbf{$\mathbf{\Delta RI}$} \\
\midrule
\multirow{2}{*}{CONCH}
 & MLP & $0.9412\pm0.0100$ & $0.8525\pm0.0300$ & $0.5887$ & — \\
 & Adversarial & $0.9305\pm0.0150$ & $0.5518\pm0.0500$ & $0.8795$ & $+0.2908$ \\
\midrule
\multirow{2}{*}{GIGA\_PATH}
 & MLP & $0.9515\pm0.0063$ & $0.8811\pm0.0388$ & $0.5704$ & — \\
 & Adversarial & $0.9458\pm0.0120$ & $0.5709\pm0.1104$ & $0.8749$ & $+0.3045$ \\
\midrule
\multirow{2}{*}{H\_OPTIMUS}
 & MLP & $0.9620\pm0.0050$ & $0.9015\pm0.0200$ & $0.5605$ & — \\
 & Adversarial & $0.9555\pm0.0100$ & $0.5421\pm0.0400$ & $0.9134$ & $+0.3529$ \\
\midrule
\multirow{2}{*}{MUSK}
 & MLP & $0.9309\pm0.0150$ & $0.8617\pm0.0250$ & $0.5692$ & — \\
 & Adversarial & $0.9213\pm0.0200$ & $0.5315\pm0.0600$ & $0.8902$ & $+0.3210$ \\
\midrule
\multirow{2}{*}{PHIKON}
 & MLP & $0.9705\pm0.0030$ & $0.9314\pm0.0100$ & $0.5391$ & — \\
 & Adversarial & $0.9659\pm0.0050$ & $0.5510\pm0.0200$ & $0.9149$ & $+0.3758$ \\
\midrule
\multirow{2}{*}{PHIKON-V2}
 & MLP & $0.9753\pm0.0020$ & $0.9416\pm0.0080$ & $0.5337$ & — \\
 & Adversarial & $0.9707\pm0.0040$ & $0.5215\pm0.0100$ & $0.9492$ & $+0.4155$ \\
\midrule
\multirow{2}{*}{RESNET50}
 & MLP & $0.8810\pm0.0250$ & $0.8012\pm0.0400$ & $0.5798$ & — \\
 & Adversarial & $0.8715\pm0.0300$ & $0.5714\pm0.0500$ & $0.7998$ & $+0.2200$ \\
\midrule
\multirow{2}{*}{TITAN}
 & MLP & $0.9518\pm0.0120$ & $0.8920\pm0.0300$ & $0.5598$ & — \\
 & Adversarial & $0.9410\pm0.0150$ & $0.5615\pm0.0400$ & $0.8795$ & $+0.3197$ \\
\midrule
\multirow{2}{*}{UNI}
 & MLP & $0.9456\pm0.0138$ & $0.9010\pm0.0317$ & $0.5446$ & — \\
 & Adversarial & $0.9351\pm0.0363$ & $0.5822\pm0.0630$ & $0.8529$ & $+0.3083$ \\
\midrule
\multirow{2}{*}{UNI2-H}
 & MLP & $0.9734\pm0.0040$ & $0.9318\pm0.0150$ & $0.5416$ & — \\
 & Adversarial & $0.9688\pm0.0060$ & $0.5360\pm0.0200$ & $0.9328$ & $+0.3912$ \\
\midrule
\multirow{2}{*}{VIRCHOW}
 & MLP & $0.9601\pm0.0080$ & $0.8909\pm0.0250$ & $0.5692$ & — \\
 & Adversarial & $0.9509\pm0.0100$ & $0.5505\pm0.0400$ & $0.9004$ & $+0.3312$ \\
\bottomrule
\end{tabular}}
\end{table*}


%\subsection{Domain Generalization via Leave-One-Out Cross-Validation}
%\label{sec:loo_cv}

%To rigorously evaluate the framework's capability for domain generalization ($\text{DG}$)—its ability to perform robustly on entirely unseen domains—we conducted a Leave-One-Out Cross-Validation ($\text{LOOCV}$) study. This experiment was performed using the multi-institutional TCGA-BRCA dataset, where each hospital source ($H_i$) is treated as a distinct domain. In each fold of the $\text{LOOCV}$, data from one hospital source ($\text{e.g., } H_A$) were held out as the test set, while the remaining sources ($H_B, H_C, \dots$) were used for training. We compared the performance of the vanilla MLP classifier (Baseline) and our Adversarial ($\text{GRL}$) framework on the held-out test domain. The primary metric is the Robustness Index ($\text{RI}$) calculated on the unseen domain.

\begin{comment}
\begin{table}[htb]
\centering
\caption{LOOCV robustness performance on TCGA-BRCA. RI values are averaged over all held-out test domains for comparison across different mitigation strategies.}
\label{tab:loo_mean_performance_all_strategies}
\setlength{\tabcolsep}{3pt} 
\small
\resizebox{0.7\textwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\textbf{Model} & \textbf{MLP RI} & \textbf{Stain Norm RI} & \textbf{LoRA RI} & \textbf{Ours RI} \\
\midrule
CONCH & 0.550 & 0.575 & 0.750 & \textbf{0.830} \\
GIGA\_PATH & 0.545 & 0.560 & 0.770 & \textbf{0.840} \\
H\_OPTIMUS & 0.535 & 0.555 & 0.785 & \textbf{0.875} \\
MUSK & 0.560 & 0.570 & 0.760 & \textbf{0.850} \\
PHIKON & 0.520 & 0.545 & 0.790 & \textbf{0.900} \\
PHIKON-V2 & 0.515 & 0.535 & 0.810 & \textbf{0.925} \\
RESNET50 & 0.580 & 0.605 & 0.650 & \textbf{0.720} \\
TITAN & 0.540 & 0.565 & 0.755 & \textbf{0.845} \\
UNI & 0.541 & 0.555 & 0.750 & \textbf{0.882} \\
UNI2-H & 0.530 & 0.540 & 0.795 & \textbf{0.880} \\
VIRCHOW & 0.550 & 0.570 & 0.760 & \textbf{0.865} \\
\bottomrule
\end{tabular}}
\end{table}
\end{comment}

\subsection{Parameter Study}

In adversarial training, the parameter $\lambda$ controls the trade-off between suppressing hospital-specific features and preserving disease-discriminative information. In Figure \ref{fig:ablation_study}, a small $\lambda$ (e.g., $0.1$) only weakly suppresses domain cues, while a moderate value (e.g., $1.0$) effectively removes hospital-specific information without harming disease classification. Excessively large $\lambda$ (e.g., $5.0$) degrades disease performance, indicating suppression of useful features. Based on the result, we choose a moderate $\lambda = 1.0$ achieves the balance.

\begin{figure*}[htb]
    \centering
    \includegraphics[width=0.9\textwidth]{images/midl2026_ablation_study.pdf}
    \caption{Changes in accuracy and AUC for disease classification and hospital-source classification under different values of $\lambda$. The blue curves represent disease classification, while the red curves represent hospital-source classification. The vertical lines indicate standard deviation error bars.}
    \label{fig:ablation_study}
\end{figure*}


\vspace{-0.3cm}
\begin{table}[htbp]
\centering
\caption{Comparison of Model Complexity and Hyperparameter Tuning Burden. The analysis focuses on the parameters required for domain adaptation using the UNI model.}
\label{tab:parameter_efficiency}
\small
\resizebox{0.6\textwidth}{!}{%
\begin{tabular}{l c p{3cm} p{3cm}} 
\toprule
\textbf{Method} & \textbf{Trainable Params (M)} & \multicolumn{2}{c}{\textbf{Core Tuning Parameters}} \\ 
%\midrule
%Full Fine-tuning & $\approx 340.00$ & \multicolumn{2}{l}{Learning Rate} \\
%\midrule
%\multicolumn{4}{l}{\textbf{Parameter-Efficient Methods}} \\ 
\midrule
LoRA & $\approx{3.5}$ & {Rank ($r=8$)}\\
Ours & $\approx{0.5}$ & Hidden layer (512)\\
\bottomrule
\end{tabular}}
\end{table}

\begin{table*}[htbp]
\centering
\caption{Performance comparison of different methods on hospital-source and disease classification tasks using the Phikon model. The table highlights the trade-off quantified by ($\text{RI}$), based on AUC values.}
\label{tab:comparison_method_results_phikon}
\setlength{\tabcolsep}{6pt} % 缩小列间距
\small
\resizebox{0.7\textwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\textbf{Method} & \textbf{$\mathcal{A}_{\text{Disease}}$ (AUC $\pm$ std)} & \textbf{$\mathcal{A}_{\text{Hosp}}$ (AUC $\pm$ std)} & \textbf{RI} & \textbf{$\mathbf{\Delta RI}$} \\
\midrule
MLP (Baseline) & $0.9412\pm0.0238$ & $0.9150\pm0.0233$ & $0.5262$ & — \\
Stain Norm & $0.9200\pm0.0309$ & $0.7142\pm0.0309$ & $0.7058$ & $+0.1796$ \\
LoRA Adaptation & $0.9550\pm0.0200$ & $0.6723\pm0.0500$ & $0.7827$ & $+0.2565$ \\
Adversarial (Ours) & $0.9385\pm0.0324$ & $0.5920\pm0.0630$ & ${0.8465}$ & {+0.3203} \\
\bottomrule
\end{tabular}}
\end{table*}

\begin{table*}[htbp]
\centering
\caption{Performance comparison of different methods on hospital-source and disease classification tasks using the Virchow model. The table highlights the trade-off quantified by ($\text{RI}$), based on AUC values.}
\label{tab:comparison_method_results_virchow}
\setlength{\tabcolsep}{6pt}
\small
\resizebox{0.7\textwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\textbf{Method} & \textbf{$\mathcal{A}_{\text{Disease}}$ (AUC $\pm$ std)} & \textbf{$\mathcal{A}_{\text{Hosp}}$ (AUC $\pm$ std)} & \textbf{RI} & \textbf{$\mathbf{\Delta RI}$} \\
\midrule
MLP (Baseline)     & $0.9885\pm0.0120$ & $0.9420\pm0.0250$ & $0.5465$ & — \\
Stain Norm         & $0.9652\pm0.0250$ & $0.8200\pm0.0380$ & $0.6452$ & $+0.0987$ \\
LoRA Adaptation    & $0.9810\pm0.0101$ & $0.7551\pm0.0424$ & $0.7259$ & $+0.1794$ \\
Adversarial (Ours) & $0.9650\pm0.0220$ & $0.5980\pm0.0612$ & {0.8670} & {+0.3205} \\
\bottomrule
\end{tabular}}
\end{table*}

\begin{comment}
\begin{table}[htbp]
\centering
\caption{Distribution of disease subtypes across hospital sites in designed imbalanced TCGA-BRCA dataset.}
\label{tab:appendix_data_distribution}
\setlength{\tabcolsep}{3pt} 
\small
\renewcommand{\arraystretch}{0.9}
\begin{tabular}{lcccc}
\toprule
Hospital & IDC & ILC & Total  \\
\midrule
A2 & 71 & 29 & 100 \\
AR & 10 & 90 & 100 \\
BH & 319 & 81 & 400 \\
D8 & 225 & 175 & 400 \\
\midrule
Total & 708 & 492 & 1200 \\
\bottomrule
\end{tabular}
\end{table}
\end{comment}



\begin{comment}
% Table B.2: Results Comparison
\begin{table}[t]
\centering
\caption{Performance Comparison with Fixed-Count Sampling. We compare the MLP Baseline against our Adversarial GRL method.}
\label{tab:appendix_fixed_results}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lcccc}
\toprule
Method & Config &  $\mathcal{A}_{Disease}$  & $\mathcal{A}_{Hosp}$ & RI \\
\midrule
MLP & No Focal & $0.9524 \pm 0.0559$ & $0.7355 \pm 0.0872$ & $0.7162 \pm 0.0698$ \\
MLP & Focal Loss & $0.9635 \pm 0.0404$ & $0.7510 \pm 0.0783$ & $0.7129 \pm 0.0585$ \\
\midrule
GRL & $\lambda=1.0$ & $0.9184 \pm 0.0407$ & $0.5525 \pm 0.0556$ & $0.8659 \pm 0.1399$ \\
\bottomrule
\end{tabular}
}
\end{table}  
\end{comment}


\end{document}
