\documentclass{midl}

\usepackage{mwe} % to get dummy images
\usepackage{tikz}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{url}
\usepackage{orcidlink}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{color, colortbl}
\usepackage{pifont}
\usepackage{xcolor}
\usepackage{wrapfig}
\usepackage{makecell}

% Custom colors. LightCyan, LightRed and LightPurple are the cell backgrounds of
% the column types a, d and b defined right after this block. The lowercase
% `rgb' model takes components in [0,1]; the uppercase `RGB' model takes
% integer components in 0..255.
\definecolor{LightCyan}{rgb}{0.88,1,1}
\definecolor{LightRed}{rgb}{1,0.88,0.88}
\definecolor{Gray}{rgb}{0.8,0.8,0.8}
\definecolor{LightGray}{rgb}{0.92,0.92,0.92}
\definecolor{LightPurple}{RGB}{226, 225, 254}

% Centered tabular column types with a tinted cell background (\columncolor):
%   a = LightCyan, b = LightPurple, d = LightRed.
\newcolumntype{a}{>{\columncolor{LightCyan}}c}
\newcolumntype{b}{>{\columncolor{LightPurple}}c}
\newcolumntype{d}{>{\columncolor{LightRed}}c}

% Define the counter
\newcounter{myprop}
\newcounter{mydefcounter}
\newcounter{mylemcounter}

% Typeset a run-in, numbered "Proposition" paragraph.
% Used as \begin{myprop}{<text>}\end{myprop}: \begin invokes \myprop, which
% takes the braced text as #1.
% \refstepcounter (instead of \stepcounter) also records the new number as the
% current \label target, so a \label placed inside #1 resolves to this
% proposition rather than to the enclosing section.
\newcommand{\myprop}[1]{%
  \refstepcounter{myprop}%
  \noindent\textbf{Proposition \themyprop. }#1\par
}

% Typeset a run-in, numbered "Definition" paragraph.
% Used as \begin{mydef}{<text>}\end{mydef}: \begin invokes \mydef, which takes
% the braced text as #1.
% \refstepcounter (instead of \stepcounter) also records the new number as the
% current \label target, so a \label placed inside #1 (e.g. \label{def:1})
% resolves to this definition rather than to the enclosing section.
\newcommand{\mydef}[1]{%
  \refstepcounter{mydefcounter}%
  \noindent\textbf{Definition \themydefcounter. }#1\par
}

% Typeset a run-in, numbered "Lemma" paragraph.
% Used as \begin{mylem}{<text>}\end{mylem}: \begin invokes \mylem, which takes
% the braced text as #1.
% \refstepcounter (instead of \stepcounter) also records the new number as the
% current \label target, so that \ref{lemma:weight_bound} prints the lemma
% number rather than the number of the enclosing section.
\newcommand{\mylem}[1]{%
  \refstepcounter{mylemcounter}%
  \noindent\textbf{Lemma \themylemcounter. }#1\par
}

% Check-mark and cross symbols, taken from the pifont dingbat font
% (glyphs 51 and 55).
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\renewcommand{\theadfont}{\bfseries}

\jmlrvolume{-- Accepted}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\jmlrvolume{-- 357}
\editors{Accepted at MIDL 2026}

\title[Exchangeability vs I.I.D.]{Is Exchangeability better than I.I.D.\ to handle Data Distribution Shifts while Pooling Data for Data-scarce Medical image segmentation?}


\midlauthor{
\Name{Ayush Roy\nametag{$^{1}$}} \Email{aroy25@buffalo.edu}\\
\Name{Samin Enam\nametag{$^{1}$}} \Email{saminena@buffalo.edu}\\
\Name{Jun Xia\nametag{$^{1}$}} \Email{junxia@buffalo.edu}\\
\Name{Won Hwa Kim\nametag{$^{2}$}} \Email{wonhwa@postech.ac.kr}\\
\Name{Vishnu Suresh Lokhande\nametag{$^{1}$}} \Email{vishnulo@buffalo.edu}\\
\addr $^{1}$ University at Buffalo (SUNY) \\
\addr $^{2}$ Pohang University of Science and Technology (POSTECH) 
}

\begin{document}

\maketitle

\begin{abstract}
Data scarcity is a major challenge in medical imaging, particularly for deep learning models. While data pooling (combining datasets from multiple sources) and data addition (adding more data from a new dataset) have been shown to enhance model performance, they are not without complications. Specifically, increasing the size of the training dataset through pooling or addition can induce distributional shifts, negatively affecting downstream model performance, a phenomenon known as the “Data Addition Dilemma”. While the traditional i.i.d. assumption may not hold in multi-source contexts, assuming exchangeability across datasets provides a more practical framework for data pooling. In this work, we investigate medical image segmentation under these conditions, drawing insights from causal frameworks to propose a method for controlling foreground-background feature discrepancies across all layers of deep networks. This approach improves feature representations, which are crucial in data-addition scenarios. Our method achieves state-of-the-art segmentation performance on histopathology and ultrasound images across five datasets, including a novel ultrasound dataset that we have curated and contributed. Qualitative results demonstrate more refined and accurate segmentation maps compared to prominent baselines across three model architectures. The code is available on \href{https://github.com/AyushRoy2001/Exchangeable-feature-disentanglement}{Github}.
\end{abstract}

\begin{keywords}
Medical image segmentation, data pooling, exchangeability, distribution shift, causal mediation, feature discrepancy loss.
\end{keywords}

\section{Introduction}
\label{sec:intro}

\begin{figure*}%[ht]
    \centering
    \includegraphics[width=\linewidth, keepaspectratio]{images/INTRO.png}
    \vspace{-2.0em}
    \caption{\footnotesize (a) \textbf{Strong correlation between Dice and \(\mathcal{L}_{\textbf{fd}}\) (foreground-background feature discrepancy loss).} Strong correlation in both NucleiSegNet decoder and CMUNet encoder layers for ultrasound and histopathology images. (b) \textbf{Impact of Data Distribution Shift on Model Performance.} Adding S2 (similar distribution) to S1 training improves S1 test Dice, as expected with more data. However, adding S3 (distribution shift) degrades performance, consistent with \cite{shen2024data}. (c) \textbf{Proposed \(\mathcal{L}_{\textbf{fd}}\) applied to all U-Net layers.} Encoder (green), Decoder (grey), and Bottleneck (orange) features represent mediator \(Z\), optimized by \(\mathcal{L}_{\textbf{fd}}\). Each layer uses \(\mathcal{L}_{\textbf{fd}}\) with a unique learnable parameter \(\alpha\).} 
    \vspace{-2.0em}
    \label{fig:intro}
\end{figure*}

\begin{figure}[!b]
    \centering
    \includegraphics[width=0.9\linewidth, keepaspectratio]{images/Ablation.png}
    \vspace{-1.0em}
    \caption{\footnotesize (a) $\alpha$ (layer-wise weights) vs $\mathcal{L}_\textbf{fd}$ (feature discrepancy loss) for NucleiSegNet layers (TNBC) shows a similar trend across all models and datasets. (b) Right and left shifts in the test sample distribution for Dice scores and $\mathcal{L}_\textbf{fd}$ after applying $\mathcal{L}_\textbf{fd}$ (orange curve) for CMUNet (UDIAT), with a similar trend across datasets. Refined activation maps justify this improvement in Dice scores (see Figure \ref{fig:Heatmap}, Dec 4 and Bot) after penalizing foreground-background discrepancy with $\mathcal{L}_\textbf{fd}$. (c) Causal graph linking input X, mediator Z, label Y, and unobserved confounders U.} 
    \label{fig:ablation}
\end{figure}

Medical imaging datasets often suffer from limited sample sizes due to budget constraints and strict study criteria, including genetic risk factors. This scarcity is further compounded by the lack of diagnostic labels, posing challenges for deep learning models that rely on supervised learning. Small datasets amplify the risk of models learning spurious correlations \cite{thompson2014enigma, lokhande2022equivariance}, while distributional disparities hinder generalization to real-world clinical settings. In addition, a smaller number of training samples leads to data memorization, data interpolation, and high-variance models \cite{nakkiran2021deep,power2022grokking,lin2023over,ying2019overview}, all of which cause poor generalization. Though deep learning advancements show promise, issues of data quality and distribution mismatches remain significant barriers \cite{moyer2018invariant}. Semi-supervised learning and data augmentation provide partial solutions with varying effectiveness \cite{chapelle2006semi}. Pooling data from multiple sites, combined with techniques like covariate matching and meta-analysis, has improved model robustness and generalization \cite{lokhande2022equivariance}.

\textbf{Limitations of Data Augmentation in Medical Imaging.} Data augmentation techniques like rotations, flips, and crops generate synthetic training samples to improve model robustness \cite{carmon2019unlabeled}, but in medical imaging they often introduce clinically unrealistic artifacts. Flipping or cropping brain images can disrupt inherent anatomical asymmetries critical for diagnosis \cite{akash2021learning,mehta2023efficient}, while such methods also fail to preserve realistic object boundaries (e.g., tumor margins) and spatial relationships, limiting their utility in semantic segmentation \cite{oliver2018realistic,goceri2023medical}. \textbf{Alternatives: Data Pooling and Data Addition.} Given augmentation's limitations, two alternatives emerge: data pooling and data addition. Pooling aggregates multi-institutional datasets to enhance statistical power and diversity, but faces distributional shifts (scanner variations, population differences) and non-i.i.d. data structures (Fig. \ref{fig:intro}b), requiring harmonization algorithms \cite{moyer2020scanner,lokhande2022equivariance,roy2026beyond}. Data addition incrementally integrates new data into pre-trained models, requiring adaptation strategies to reconcile distribution mismatches. However, the ``data addition dilemma'' \cite{shen2024data} reveals that expanding training data can paradoxically degrade performance due to unresolved distribution shifts, underscoring the need for robust incremental learning strategies. \textbf{Causality-Driven Approaches.} To address distribution shifts and annotation biases in medical imaging (Sec. 1–2), causality-driven frameworks \cite{castro2020causality} are ideal for segmentation tasks in breast cancer and Alzheimer's Disease (AD). 
Breast cancer (most prevalent in women) causes over 14,000 annual deaths in Algeria alone \cite{lagree2021review,aps2020cancer}, with detection relying on imaging to identify lesions \cite{evain2021breast,touami2021microcalcification}, while AD diagnosis hinges on quantifying tau protein aggregates. Current quantification methods are labor-intensive and error-prone, necessitating automation.

Traditional segmentation models often fail to generalize due to unobserved confounders (scanner artifacts, anatomical variability) that corrupt the causal relationship between images $X$ and annotations $Y$. Causal inference provides a principled framework \cite{scholkopf2012causal,pearl2009causality,bareinboim2016causal}. As shown in Fig.~\ref{fig:ablation}(c), confounders $U$ (imaging protocols, demographics) influence both $X$ and $Y$, introducing spurious correlations. We adopt frontdoor adjustment \cite{pearl2009causality} using mediator $Z$ (foreground-background feature discrepancy) to disentangle causal effects. Our Feature Discrepancy Loss $\mathcal{L}_\textbf{fd}$ operationalizes this by enhancing $Z$'s robustness to $U$, minimizing distributional shifts across datasets, ensuring $Y$ depends causally on $X$ rather than confounders. This addresses the ``data addition dilemma'' (Sec. 2) by stabilizing features during incremental learning. Selection bias from noise in $X$ that distorts $X \rightarrow Y$ \cite{castro2020causality} is mitigated, while label noise in $Y$ remains a separate challenge handled via label correction methods, enabling generalization in real-world clinical settings with unobserved biases.\footnote{A comprehensive review of related works is provided in Supplementary Sec.~\ref{sec:related_works}.}

\noindent
{\bf Contributions.} This paper focuses on the segmentation task in medical imaging, a field that \textit{still} presents significant challenges in accurately delineating complex anatomical structures and pathologies \cite{malhotra2022retracted}. Moreover, we leverage smaller models like UNet for our analysis. They remain sufficient in medical imaging due to their ability to accurately segment with limited data and minimal reliance on prompts or extensive fine-tuning \citep{yousef2023u,ahmadi2023comparative}. Our contributions stem from the observation that the Dice Score, a commonly used metric for evaluating segmentation quality, correlates with the discrepancy between foreground and background features in the intermediate representations generated by neural networks. This observation holds true across ultrasound and histopathology images (Fig~\ref{fig:intro} (a)), prompting the question: \textit{Can controlling for foreground-background feature discrepancy improve the quality of these representations and, consequently, the Dice Score?} We show that it does. Specifically, we propose: {\bf(a)} feature discrepancy loss to enhance feature distinction, reducing over- and under-segmentation in homogeneous pixel distributions; {\bf(b)} theoretical bound showing that the negative logarithm of the Dice coefficient serves as a lower bound for the feature discrepancy loss, ensuring improved Dice scores when optimizing for this loss; {\bf(c)} theoretical proof demonstrating the fact that the proposed feature discrepancy loss constrains the magnitude of the UNet layer weights, preventing the formation of a high-variance model prone to data memorization, a common issue in medical imaging datasets prone to limited samples; {\bf(d)} introduction of a new ultrasound breast cancer dataset focused on triple-negative breast cancer (TNBC); and {\bf (e)} a causal approach to address dataset distribution shift when integrating data from multiple sources. 
We achieve better segmentation performance across five datasets and significantly improve the segmentation performance of three prominent architectures.


\vspace{-1em}
\section{Method}
\label{sec:method}

Causal diagrams formalize assumptions about data generation, improving model robustness and generalization to clinical data, which enhances diagnostic tools. Causal reasoning helps address data scarcity by analyzing cause-effect relationships. In our study of medical images ($X$) and their corresponding segmentation ground truth ($Y$), we explore the causal relationship between them. The relationship may be causal ($X \rightarrow Y$), indicating $Y$ depends on $X$, or anticausal ($Y \rightarrow X$), predicting the cause from the effect. The task is to estimate $P(Y\mid X)$. Manual segmentation is influenced by image content, resolution, contrast, and annotator understanding, thus suggesting a causal model, $X \rightarrow Y$.

\vspace{-5pt}
\begin{axiom}
\label{ax:mod}
    {\bf (Modularity for $\mathbf{X\rightarrow Y}$): } In the causal graph where X causes Y, intervening on X changes only the mechanism determining X, while the mechanism determining Y given X remains invariant.    
\end{axiom}
\vspace{-0.5em}

Axiom~\ref{ax:mod} indicates that $P(X)$ offers minimal information compared to $P(Y\mid X)$, implying that data augmentation and semi-supervised learning techniques are theoretically inadequate for resolving the data scarcity issue. A model trained on image-derived annotations will mainly reproduce the manual annotation process instead of predicting a pre-imaging ground truth, like the `true' anatomy. While efforts to enhance data augmentation techniques for segmentation tasks continue \cite{yellapragada2024pathldm}, our approach emphasizes utilizing existing data to improve segmentation outcomes.

\subsection{Handling Data Scarcity through Causal Mediation}

In data-scarce scenarios, the effective utilization of available samples is crucial. One approach focuses on improving the performance of underperforming samples, aligning with the Rawlsian principle \cite{lundgard2020measuring} of prioritizing the worst-off samples. While techniques like upweighting show promise, they are impractical here due to the challenge of estimating reliable probability distributions in medical imaging datasets. To address this, we introduce causal mediation by incorporating mediator $Z$, as depicted in the causal graph of Fig.~\ref{fig:ablation}(c). Derived from the image $X$, $Z$ serves as a differentiable proxy for $Y$, mediating the relationship to enhance performance.\footnote{We do not explicitly include the site/scanner variable $S$ because the figure highlights only the variables directly used by the model ($X$, $Z$, $Y$, and the unobserved $U$). In practice, $S$ is an observed nuisance factor that influences $X$ (i.e., $S \!\rightarrow\! X$), but it does not enter the forward pass or the loss explicitly. $\mathcal{L}_\textbf{fd}$ and $\mathcal{L}^\textbf{exch}_\textbf{fd}$ therefore address site-driven variation indirectly (reducing the portion of appearance variability in $Z$ that is attributable to differences in $S$) without requiring site labels.}

\begin{myprop}
    {\textbf{(Mediation in Causal Prediction Model):} Given a causal diagram $X \rightarrow Y$, introducing a mediator $Z$ to create the structure $X \rightarrow Z \rightarrow Y$, and assuming a strong correlation between $Y$ and $Z$, this results in: i)  Conditional Independence: $(X \perp Y) \mid Z$ ii)  Preserved Modularity: $P(X) \perp P(Y\mid X)$ iii)  Functional Relationship: $P(Y\mid X) = \int P(Y \mid Z)\, P(Z \mid X)\, dZ$ \cite{nazarovs2021graph}}
\end{myprop}

The relationship shown in Proposition 1 indicates that $P(Y \mid X)$ depends on $P(Z \mid X)$, as $Z$ mediates $X \to Y$. This indicates that an accurate determination of $P(Z \mid X)$ allows for precise estimation of $P(Y \mid X)$.

\begin{example}
Consider \( X \sim \mathcal{N}(0,1) \), where \( \mathcal{N} \) denotes the normal distribution. Define \( Z = a X + \epsilon_1 \) and \( Y = b Z + \epsilon_2 \), where \( \epsilon_1 \sim \mathcal{N}(0,1) \) and \( \epsilon_2 \sim \mathcal{N}(0,1) \), and \( a \) and \( b \) are constants. Under these definitions, we have the following conditional distributions: \( Z \mid X \sim \mathcal{N}(aX, 1) \), \( Y \mid Z \sim \mathcal{N}(bZ, 1) \), and consequently \( Y \mid X \sim \mathcal{N}(abX, 1 + b^2) \). 
\end{example}

The example demonstrates that $P(Y \mid X)$ is a function of $P(Z \mid X)$, as the mean of $Y \mid X$ (represented as $abX$) depends on the mean of $Z \mid X$ (which is $aX$). Moreover, conditional independence is preserved, as knowing $X$ provides no further information about $Y$ given $Z$ \cite{nazarovs2021graph}.

\vspace{-1em}
\subsection{Mediator as a Feature Discrepancy Measure}

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.7\linewidth, keepaspectratio]{images/Data_Addition_Dilemma.png}
    \vspace{-20pt}
    \caption{\footnotesize \textbf{Data Addition Dilemma.} An ablation showing the performance of various loss functions for histopathology and ultrasound datasets under the ``Data Addition Dilemma'' \cite{shen2024data}. Data from $D_{novel}$ is added to $D_{base}$ to observe how losses handle distribution shifts. $\mathcal{L}^\textbf{exch}_\textbf{fd}$ (\textcolor{orange}{orange}) outperforms others in mitigating distribution shift when pooling data from multiple sources. The US-TNBC dataset has fewer samples, so we only added UDIAT dataset samples until their number matched (see Table~\ref{dataset_summary}).}
    \vspace{-2em}
    \label{fig:Data_Dilemma}
\end{figure}

We now focus on identifying the appropriate mediator variable $Z$. From the previous section, we know that a good mediator $Z$ should exhibit a strong correlation with $Y$ and be derived from $X$. $Z$ corresponds to the intermediate representations of the U-Net architecture \cite{ronneberger2015u}, satisfying $X \to Z$. To ensure a strong correlation between $Y$ and $Z$, the foreground features of $Z$ must be highly discriminative from the background. In the U-Net architecture, the feature map $F$ is represented by height, width, and channel. Ground-truth masks or clustering methods during training help identify indicators $\tilde{y}$ that distinguish foreground from background features \cite{sims2023seg}. To enforce this discriminative property, we penalize the following loss.
% The mediator variable $Z$ is defined as a quantitative measure of features extracted from a neural network that characterize the image features (i.e., objects of interest) $F$. This definition is pertinent to our segmentation task. Let $F$ denote these features, which can be derived from any representation learning architecture, including the UNET architecture \cite{ronneberger2015u}. In the UNET architecture, as in many other architectures, $F$ comprises three dimensions: height, width, and channel. Access to ground truth masks during training or clustering methods aids in identifying indicators $\tilde{y}$ that differentiate between foreground and background features \cite{sims2023seg}. We show that increasing the distance between features representing the foreground and background enhances the estimation of the mediator $Z$. The distance is measured through a penalty function. The foreground and background feature distance penalty loss is formally defined as follows:

\begin{mydef}
{\textbf{(Feature Discrepancy Loss):} Let $F$ denote the features extracted from any network architecture and $\tilde{y}$ represent the indicator variables identifying foreground features (the ground truth segmentation mask). We define the channel-averaged foreground features as $F_{g} = \sum_{k} \left( \sum_{i,j} F[i,j,k] \otimes \tilde{y}[i,j,k] \right)$ and the channel-averaged background features as $B_{g} = \sum_{k} \left( \sum_{i,j} F[i,j,k] \otimes (1 - \tilde{y}[i,j,k]) \right)$, where $\otimes$ denotes element-wise multiplication. The feature discrepancy loss is then given by:
\vspace{-5pt}
    \begin{align}
        \mathcal{L}_\textbf{fd} = -\log\Big(\|F_{g}-B_{g}\|^2\Big)
    \end{align}
    \vspace{-15pt}
    \label{def:1}}
\end{mydef}
\vspace{-10pt}

In the previous discussion, \( F_{g} - B_{g} \) represents the feature distance (FD) between foreground and background. Penalizing this feature difference helps the model distinguish between foreground and background better, reducing over- and under-segmentation. In Lemma 1, we prove that the negative logarithm of the Dice score lower bounds the feature-distance loss, indicating that minimizing the feature-distance loss can improve Dice scores (Supplementary Sec \ref{suppl-sec-B1} for proof). 

\begin{mylem}
{Relationship between feature discrepancy loss $\mathcal{L}_\textbf{fd}$, segmentation Dice score, and constant $k$ for feature vector $F$ derived from image $X$:
\vspace{-5pt}
\begin{align}
-\log\bigl(\mathrm{Dice} \times (k + 1)\bigr) \leq \mathcal{L}_\textbf{fd}
\end{align}}
\end{mylem}

% \begin{proof}
% \small
% Let \(\otimes\) denote element-wise multiplication. Then we get the relation between Dice score, the predicted segmentation mask $y$, and the ground truth segmentation mask $\tilde{y}$ as:

% \begin{align}
% \sum_{i,j,k} \tilde{y}_{ijk} = \frac{Dice}{2} \times \frac{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \tilde{y}_{ijk}}{\sum_{i,j,k} y_{ijk}} 
% \quad \nonumber \\
% \text{(since \(Dice = \frac{2 \sum_{i,j,k} y_{ijk} \cdot \tilde{y}_{ijk} + \epsilon}{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \tilde{y}_{ijk} + \epsilon}\))}
% \end{align}

% Now, simplifying FD (feature discrepancy), we get:

% \begin{align}
% FD = \frac{\| \sum_{k} \left( \sum_{i,j} F_{i,j,k} \otimes \tilde{y}_{i,j,k} - \sum_{i,j} F_{i,j,k} \otimes (1 - \tilde{y}_{i,j,k}) \right) \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}} 
% \quad \nonumber \\
% \text{(Using \ref{def:1}).}
% \end{align}

% \begin{align}
% FD \leq \frac{\| 2 \sum_{i,j,k} F_{i,j,k} \otimes \tilde{y}_{i,j,k} \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}} + \frac{\| \sum_{i,j,k} F_{i,j,k} \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}}
% \quad \nonumber \\
% \text{(using the triangle inequality).}
% \end{align}

% \begin{align}
% FD - 1 \leq \frac{\| \sum_{i,j,k} F_{i,j,k} \otimes Dice \times \frac{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \hat{y}_{ijk}}{\sum_{i,j,k} y_{ijk}} \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}}
% \quad \nonumber \\
% \text{(Substituting \(\sum_{i,j,k} \tilde{y}_{ijk}\)).}
% \end{align}

% Since \(\sum_{i,j,k} \tilde{y}_{ijk}\) and \(\sum_{i,j,k} y_{ijk}\) are constants during testing, let \(\frac{\sum_{i,j,k} \tilde{y}_{ijk}}{\sum_{i,j,k} y_{ijk}} = k'\):  

% \begin{align}
% -log(FD) \geq -log(Dice \times (k + 1)) 
% \quad \nonumber \\
% \text{(Taking \(-\log\) on both sides).}
% \end{align}

% \begin{align}
% \mathcal{L}_\textbf{fd} \geq -log(Dice \times (k + 1))
% \quad \nonumber \\
% \text{(This completes the proof).}
% \end{align}
% \end{proof}

An increase in the Dice score results in a decrease of the lower bound, which allows for a decrease in $\mathcal{L}_\textbf{fd}$. As shown in Figure \ref{fig:intro} (a), this relationship justifies the observed correlation between $\mathcal{L}_\textbf{fd}$ and the Dice score for all models\footnote{Although Lemma 1's bound may not be tight, experiments (Figure \ref{fig:ablation} (b) and Table \ref{ablation}) show a strict upper-lower bound relationship, indicating that minimizing $\mathcal{L}_\textbf{fd}$ directly improves the Dice score.}.
\vspace{-0.5em}
\begin{axiom} \textbf{(SGD Suboptimality for Convex Lipschitz Functions):}
\label{axiom:sgd_convergence}
Let $f: \mathbb{R}^d \to \mathbb{R}$ be a convex function with Lipschitz constant $L$. For step size $\eta_t = \frac{1}{L\sqrt{T}}$ and $T$ iterations, SGD achieves:
$\mathbb{E}\left[f(\theta_T) - f(\theta^*)\right] \leq \frac{C L}{\sqrt{T}}$,
where $C > 0$ is a constant, $\theta_T$ is the parameter at iteration $T$, and $\theta^*$ is the optimal parameter.  
\end{axiom}

Axiom~\ref{axiom:sgd_convergence} recalls the well-established fact that the suboptimality of Stochastic Gradient Descent (SGD) is bounded by the Lipschitz constant ($L$) \cite{hardt2016train}. Details of the proof of Axiom~\ref{axiom:sgd_convergence} can be found in \cite{shalev2014understanding} (page 187). In Lemma~\ref{lemma:weight_bound}, we prove that minimizing $\mathcal{L}_\textbf{fd}$ constrains the weight matrix \( W \) in each UNet layer by damping gradient updates (Supplementary Sec \ref{suppl-sec-B2} for proof). This prevents \( W \) from growing excessively to maximize feature discrepancy, which is crucial for medical imaging datasets that are significantly smaller than natural scene datasets. A large \( W \) risks data interpolation and memorization, which contribute to high variance and increase the risk of overfitting. By enforcing feature separation \textit{without} relying on a large \( \|W\|_2 \), $\mathcal{L}_{\text{fd}}$ acts as an implicit regularizer, effectively bounding the Lipschitz constant \( L \) and improving generalization (see Axiom~\ref{axiom:sgd_convergence}), as empirically validated by higher test Dice scores (see Table~\ref{ablation}). The trade-off between feature separation and weight norm is controlled by $\gamma$, the minimum foreground-background separation reached during optimization. $\mathcal{L}_{\text{fd}}$ thus balances overfitting and discrimination. %The trade-off between feature separation and weight norm is controlled by \( \gamma \), implicitly determined by the optimization trajectory. $\mathcal{L}_{\text{fd}}$ achieves an optimal balance, reducing overfitting while preserving discriminative power.

%\vspace{-0.5em}
\begin{mylem} {\textbf{(Weight Norm Bound via Feature Discrepancy Loss):}
\label{lemma:weight_bound}
Let $W \in \mathbb{R}^{d \times d}$ denote the weight matrix of a UNet layer producing features $F = W \otimes x$, where $x \in \mathbb{R}^{d \times d}$ is the input to that layer. With $x_g$ and $x_b$ denoting foreground and background features, 
\vspace{-10pt}
\begin{align}
\mathcal{L}_{\text{fd}} = -\log\left( ||W \otimes (x_g - x_b)||_2^2 \right)
\end{align}
%\vspace{-5pt}
Minimizing $\mathcal{L}_{\text{fd}}$ bounds $||W||_2$, reducing the layer’s Lipschitz constant.}
\end{mylem}

% \begin{proof}
% \small
% Let $\Delta x = x_g - x_b$ denote the inherent foreground-background separation in the input space. The loss $\mathcal{L}_{\text{fd}}$ incentivizes maximizing $||W \otimes \Delta x||_2^2$ where $\otimes$ is the hadamard product. Now we can frame the hadamard product in a different way to represent $W \otimes \Delta x$ as $W_{exp} \times \Delta x_{exp}$ where $\times$ is matrix multiplication, $W_{exp} \in \mathbb{R}^{d^2 \times d^2}$ is a diagonalized form of $W$ and $x_{exp} \in \mathbb{R}^{d^2 \times 1}$ is a reshaped form of $x$. The Lipschitz constant $L$ of the layer $F = W_{exp} \times x$ is the spectral norm of $W_{exp}$:

% \begin{align}
% L = ||W_{exp}||_2 = \sup_{||x_{exp}||_2 = 1} ||W_{exp} \times x_{exp}||_2
% \end{align}

% This measures the maximum amplification of the input by $W_{exp}$. To minimize $\mathcal{L}_{\text{fd}}$, the optimization ensures $||W_{exp} \times \Delta x_{exp}||_2^2 \geq \gamma$ for some $\gamma > 0$. By Cauchy-Schwarz:

% \begin{align}
% ||W_{exp} \times \Delta x_{exp}||_2 \leq ||W_{exp}||_2 ||\Delta x_{exp}||_2.
% \end{align}

% Squaring both sides:

% \begin{align}
% \gamma \leq ||W_{exp} \times \Delta x_{exp}||_2^2 \leq ||W_{exp}||_2^2 ||\Delta x_{exp}||_2^2 \\ \implies ||W_{exp}||_2 \geq \frac{\sqrt{\gamma}}{||\Delta x_{exp}||_2}.
% \end{align}

% Thus, $\gamma$ defines the \textit{minimum required separation} between foreground and background features.

% The gradient of $\mathcal{L}_{\text{fd}}$ with respect to $W$ is:

% \begin{align}
% \nabla_W \mathcal{L}_{\text{fd}} = -\frac{2}{||W \otimes \Delta x||_2^2} (W \circ \Delta x)(\Delta x)^T.
% \end{align}

% The term $\frac{1}{||W \otimes \Delta x||_2^2}$ acts as an \textit{adaptive damping factor}: as $||W \otimes \Delta x||_2^2$ increases (better separation), the gradient magnitude decreases. This prevents $W$ from growing excessively to inflate separation artificially, thereby bounding $||W||_2$ and thus $||W_{exp}||_2$ (since $W_{exp}$ is a diagonalized form of $W$).

% The network achieves $||W_{exp} \times \Delta x_{exp}||_2^2 \geq \gamma$ with the smallest possible $||W||_2$ (due to gradient damping) ensuring \textit{lower variance model} (reduced sensitivity to input perturbations) and preventing overfitting. Furthermore, the Lipschitz constant $L$, is also reduced indicating a \textit{tighter generalization bounds} (the suboptimal error bound, $\mathcal{E}_{\text{gen}} \propto L$).
% \end{proof}

\vspace{-1em}
\subsubsection{Implementation details}
\label{sec:main_method}

{\bf Segmentation Loss $\mathcal{L}_\textbf{seg}$.} To penalize spatial prediction, $\mathcal{L}_\textbf{seg}$ integrates Dice loss \cite{soomro2018strided} and Binary Cross Entropy (BCE) loss \cite{jadon2020survey}, both essential for image segmentation. These losses evaluate model performance by comparing expected and actual masks. $\mathcal{L}_\textbf{seg}$ is a linear combination of Dice and BCE loss, as given in \cite{roy2024eu} (Supplementary Sec \ref{suppl-sec-A} for more details). \\
{\bf Layer-wise Feature Discrepancy Loss $\mathcal{L}_\textbf{fd}$ and hyper-parameter $\alpha$ regulation.} The U-Net architecture employs an encoder-decoder structure with skip connections, enabling the extraction of multi-scale features at varying spatial resolutions. We introduce $\mathcal{L}_{\text{fd}}$ at each feature layer to enhance segmentation accuracy. It is to be observed that the feature dimension for each layer of UNet is different. For applying $\mathcal{L}_\textbf{fd}$ to each layer, we max pool $\tilde{y}$ to match the feature dimension of that particular layer to extract $F_g$ and $B_g$. This approach strengthens the model's discriminative power by encouraging distinct feature learning across layers, as illustrated in Fig.~\ref{fig:intro}(c). A trainable hyperparameter \( \alpha \) is introduced to control the importance of each layer in the feature discrepancy loss \( \mathcal{L}_\textbf{fd} \), with unique \( \alpha \) values assigned per layer. This balances segmentation accuracy \( \mathcal{L}_\textbf{seg} \) and feature discrepancy at each level. Ablation (Section~\ref{sec:ablation}) reports the final \( \alpha \) values, showing each layer's contribution to improved segmentation performance. \\
{\bf Warm-Starting \( \alpha \).}  In the initial model updates, \(\alpha\) = 0, optimizing exclusively for \(\mathcal{L}_\textbf{seg}\) without factoring in the penalty function \(\mathcal{L}_\textbf{fd}\). This method enables \(\alpha\) to progressively rise from zero to infinity, consistent with the literature \cite{bertsekas1997nonlinear}. This approach enables a seamless shift from a constrained to an unconstrained problem, allowing for a thorough exploration of the solution space. Starting with a small penalty helps to mitigate potential ill-conditioning associated with large penalties at the outset. We start with \(\alpha\) set to 0, permitting the algorithm to iterate multiple times before activating \(\alpha\) for training.

\vspace{-1em}
\section{Experiments}
\label{sec:res}
\label{sec:setup}

We begin by detailing the experimental setup, including datasets, architectures, and a novel triple-negative breast cancer segmentation dataset. Secs.~\ref{sec:quantitative} and~\ref{sec:ablation} show how the Feature Discrepancy Loss improves segmentation across datasets. In Sec.~\ref{data_dilemma}, we demonstrate its effectiveness in preserving performance despite distributional shifts.

\noindent
\textbf{Setup.} Experimental setup and dataset details can be seen in Supplementary Sec \ref{sec:setup}. We assess three prominent U-Net variants: AttentionUNet \cite{jimenez2022visual}, which employs gated attention mechanisms for improved segmentation; NucleiSegNet \cite{lal2021nucleisegnet}, designed for overlapping boundaries and varying nuclei sizes; and CMUNet \cite{10230609}, which combines multi-scale attention gates and a ConvMixer module to capture both global and local features (details in Supplementary Sec \ref{suppl-sec-H}).

\vspace{-0.5em}
\subsection{Quantitative Results on individual datasets}
\label{sec:quantitative}

The effects of $\mathcal{L}_\textbf{fd}$ are detailed in Table \ref{ablation}, which presents results for all samples, as well as for the Worst-off and Best-off samples based on Dice scores. Table~\ref{dataset_summary} presents the numbers of the best-off and worst-off samples utilized in our experiments. The worst-off samples are the samples with lower Dice scores than the other samples (see Fig. \ref{fig:intro} (a)), as observed without the application of $\mathcal{L}_\textbf{fd}$. The best-off samples are the direct opposite of the worst-off samples, and their count is equal to that of the worst-off samples. The threshold in Table \ref{dataset_summary} is an estimate of the approximate value of Dice scores below which the selected worst-off samples lie. In the case of CMUNet on the US-TNBC dataset, a slight decrease in the Dice score ($-0.23$) for Best-off samples is offset by improvements in Worst-off samples as $\mathcal{L}_\textbf{fd}$ increases the overall average performance while emphasizing the worst-off samples. On the new US-TNBC dataset, $\mathcal{L}_\textbf{fd}$ results in higher overall Dice scores. The improvements corroborate the theoretical findings in Lemma 1.  (\textit{Takeaway:} Penalizing $\mathcal{L}_\textbf{fd}$ enhances segmentation performance across models and datasets.)

Furthermore, we also demonstrate that the proposed $\mathcal{L}_{fd}$ is \emph{not limited to binary segmentation} and naturally extends to multi-class segmentation settings (see Supplementary Sec \ref{sec:multiclass}).

\begin{table*}[!t]
    \centering
    \tiny
    \setlength{\tabcolsep}{1.0pt}
    %\renewcommand{\arraystretch}{0.7}
    \resizebox{0.9\textwidth}{!}
    {\begin{tabular}{ccdacbcacbcacbc}
        \toprule
        \textbf{Model} & \textbf{Dataset} & \textbf{$\mathcal{L}_\textbf{fd}$} & \multicolumn{4}{c}{\textbf{Worst Off Samples}} & \multicolumn{4}{c}{\textbf{Best Off Samples}} & \multicolumn{4}{c}{\textbf{All Samples}} \\
        \cmidrule(lr){4-7} \cmidrule(lr){8-11} \cmidrule(lr){12-15}
        & & & \textbf{Dice} & \textbf{$\Delta$ Dice} & \textbf{IoU} & \textbf{$\Delta$ IoU} & \textbf{Dice} & \textbf{$\Delta$ Dice} & \textbf{IoU} & \textbf{$\Delta$ IoU} & \textbf{Dice} & \textbf{$\Delta$ Dice} & \textbf{IoU} & \textbf{$\Delta$ IoU} \\
        \midrule
        \multirow{7}{2.5cm}{\centering AttnUNet \cite{jimenez2022visual}} 
        & \multirow{2}{*}{UDIAT} & \xmark & 22.42 & \multirow{2}{*}{+0.9} & 29.47 & \multirow{2}{*}{+0.8} & 75.86 & \multirow{2}{*}{+1.4} & 68.46 & \multirow{2}{*}{+1.0} & 67.21 & \multirow{2}{*}{+1.7} & 35.61 & \multirow{2}{*}{+2.8} \\
        & & \cmark & \textbf{23.28} & & \textbf{30.31} & & \textbf{77.29} & & \textbf{69.50} & & \textbf{68.96} & & \textbf{38.43} & \\
        \cmidrule(lr){2-15}
        & \multirow{3}{*}{TNBC} & \xmark & 77.88 & \multirow{3}{*}{0.0} & 68.64 & \multirow{3}{*}{+0.0} & 85.82 & \multirow{3}{*}{+0.4} & 74.38 & \multirow{3}{*}{+3.2} & 80.61 & \multirow{3}{*}{+0.5} & 67.79 & \multirow{3}{*}{+1.4} \\
        & & \cmark & \textbf{77.86} & & \textbf{68.66} & & \textbf{86.25} & & \textbf{77.57} & & \textbf{81.16} & & \textbf{69.19} & \\
        \cmidrule(lr){2-15}
        & \multirow{2}{*}{MoNuSeg} & \xmark & 66.03 & \multirow{2}{*}{+2.5} & 52.38 & \multirow{2}{*}{+0.7} & 82.57 & \multirow{2}{*}{+1.0} & 73.48 & \multirow{2}{*}{+1.0} & 75.92 & \multirow{2}{*}{+2.0} & 61.28 & \multirow{2}{*}{+1.6} \\
        & & \cmark & \textbf{68.61} & & \textbf{53.06} & & \textbf{83.62} & & \textbf{74.50} & & \textbf{77.97} & & \textbf{62.87} & \\
        \cmidrule(lr){2-15}
        & \multirow{2}{*}{AD $256$} & \xmark & 56.35 & \multirow{2}{*}{+1.3} & 31.92 & \multirow{2}{*}{+1.2} & 81.34 & \multirow{2}{*}{+4.3} & 70.88 & \multirow{2}{*}{+2.0} & 61.14 & \multirow{2}{*}{+3.5} & 43.87 & \multirow{2}{*}{+2.8} \\
        & & \cmark & \textbf{57.67} & & \textbf{33.10} & & \textbf{85.64} & & \textbf{72.93} & & \textbf{64.69} & & \textbf{46.67} & \\
        \midrule
        \multirow{4}{2.5cm}{\centering CMUNet \cite{10230609}} 
        & \multirow{2}{*}{UDIAT} & \xmark & 31.56 & \multirow{2}{*}{+1.6} & 26.58 & \multirow{2}{*}{+1.6} & 90.88 & \multirow{2}{*}{+4.4} & 88.25 & \multirow{2}{*}{+1.8} & 81.85 & \multirow{2}{*}{+2.4} & 69.87 & \multirow{2}{*}{+3.1} \\
        & & \cmark & \textbf{33.19} & & \textbf{28.17} & & \textbf{95.32} & & \textbf{90.01} & & \textbf{84.22} & & \textbf{73.02} & \\
        \cmidrule(lr){2-15}
        & \multirow{2}{*}{US-TNBC} & \xmark & 25.08 & \multirow{2}{*}{+1.9} & 21.44 & \multirow{2}{*}{+0.9} & 86.27 & \multirow{2}{*}{-0.2} & 68.09 & \multirow{2}{*}{+1.3} & 49.59 & \multirow{2}{*}{+0.6} & 34.53 & \multirow{2}{*}{+2.0} \\
        & & \cmark & \textbf{26.94} & & \textbf{22.35} & & \textbf{86.04} & & \textbf{69.35} & & \textbf{50.22} & & \textbf{36.52} & \\
        \midrule
        \multirow{5}{2.5cm}{\centering NuSegNet \cite{lal2021nucleisegnet}} 
        & \multirow{3}{*}{TNBC} & \xmark & 77.29 & \multirow{3}{*}{+2.1} & 68.00 & \multirow{3}{*}{+0.4} & 86.49 & \multirow{3}{*}{+0.3} & 71.29 & \multirow{3}{*}{+1.3} & 81.69 & \multirow{3}{*}{+1.0} & 69.22 & \multirow{3}{*}{+1.4} \\
        & & \cmark & \textbf{79.40} & & \textbf{68.42} & & \textbf{88.82} & & \textbf{72.58} & & \textbf{82.65} & & \textbf{70.58} & \\
        \cmidrule(lr){2-15}
        & \multirow{2}{*}{MoNuSeg} & \xmark & 63.95 & \multirow{2}{*}{+0.7} & 50.05 & \multirow{2}{*}{+2.1} & 84.61 & \multirow{2}{*}{+0.3} & 70.40 & \multirow{2}{*}{+1.2} & 80.95 & \multirow{2}{*}{+0.7} & 67.91 & \multirow{2}{*}{+0.7} \\
        & & \cmark & \textbf{64.61} & & \textbf{52.11} & & \textbf{84.96} & & \textbf{71.65} & & \textbf{81.69} & & \textbf{68.65} & \\
        \cmidrule(lr){2-15}
        & \multirow{2}{*}{AD $256$} & \xmark & 32.55 & \multirow{2}{*}{+3.2} & 23.19 & \multirow{2}{*}{+2.3} & 64.75 & \multirow{2}{*}{+6.4} & 46.28 & \multirow{2}{*}{+5.1} & 51.15 & \multirow{2}{*}{+5.4} & 36.17 & \multirow{2}{*}{+4.4} \\
        & & \cmark & \textbf{35.78} & & \textbf{25.46} & & \textbf{71.15} & & \textbf{51.35} & & \textbf{56.57} & & \textbf{40.61} & \\
        \bottomrule
    \end{tabular}}
    \caption{\footnotesize \textbf{Ablation study on the application of $\mathcal{L}_\textbf{fd}$.} The improvement for low Dice (Worst Off), high Dice (Best Off), and all test samples (All Samples) is evident after applying $\mathcal{L}_\textbf{fd}$. NucleiSegNet \cite{lal2021nucleisegnet} (histopathology) is not applicable to UDIAT and US-TNBC, while CMUNet \cite{10230609} (ultrasound) does not apply to TNBC. Attention UNet \cite{jimenez2022visual} performs poorly on US-TNBC (Dice: 12.96). Changes in Dice ($\Delta$ Dice) and IoU ($\Delta$ IoU) are shown across all test settings.}
    \vspace{-4em}
    \label{ablation}
\end{table*}

\vspace{-0.5em}
\subsection{Qualitative Results on individual datasets}
\label{sec:qualitative}

Qualitative results for the TNBC, MoNuSeg, AD, US-TNBC, and UDIAT datasets are presented in Figure~\ref{fig:Qualitative}. The red-highlighted areas in the predicted masks without $\mathcal{L}_\textbf{fd}$ indicate segmentation errors, while the green-highlighted regions reflect corrections made by applying $\mathcal{L}_\textbf{fd}$. These experiments illustrate how $\mathcal{L}_\textbf{fd}$ enhances segmentation by refining boundaries and reducing segmentation errors. The resulting masks display sharper, more accurate contours of key structures, preserving fine details and ensuring better anatomical representation. From Fig. \ref{fig:Heatmap}, we can see that the introduction of $\mathcal{L}_\textbf{fd}$ significantly reduces the unnecessary activations and streamlines the focus of the model to the region of interest, thus enhancing the segmentation performance.  (\textit{Takeaway:} Penalizing for $\mathcal{L}_\textbf{fd}$ results in sharper boundaries, improved detail preservation, and increased consistency.)

\vspace{-0.5em}
\subsection{Ablation Studies on Feature Discrepancy Loss and Dataset Performance}
\label{sec:ablation}

{\bf Impact of the $\alpha$ Parameter on Feature Discrepancy Loss.} 
As discussed in Section~\ref{sec:main_method}, $\alpha$ is a trainable parameter that initially starts at zero and regulates the penalty of feature discrepancy loss, $\mathcal{L}_\textbf{fd}$, for each layer of the neural network; the final values of $\alpha$ indicate that the layer with the highest value had the most influence on improving the overall dice scores (see Figure~\ref{fig:ablation} (a)). Furthermore, applying $\mathcal{L}_\textbf{fd}$ across all layers yielded consistently better Dice scores (+1.3–1.8\% across datasets) compared to selective layers (Enc 1, Dec 4, Bot), indicating that refined features from earlier layers enhance the discriminative quality of the final segmentation output.

\noindent
{\bf Comparison with State-of-the-Art Models.} For the TNBC \cite{naylor2018segmentation}, UDIAT \cite{yap2017automated}, and MoNuSeg \cite{kumar2019multi} datasets, our method achieves Dice score improvements of $+0.96$ (TNBC), $+0.74$ (MoNuSeg), and $+0.75$ (UDIAT) compared to CMUNet \cite{10230609} and NucleiSegNet \cite{lal2021nucleisegnet}, demonstrating the effectiveness of penalizing feature discrepancy in modalities with high foreground-background similarity. For AD, larger patches ($256\times256$ pixels) capture broader context, including background and neighboring pixels, while smaller patches ($128\times128$ pixels) focus primarily on plaque regions with limited context. Using the same experimental setup as \cite{jimenez2022visual}, we observe performance improvements in AttnUNet \cite{jimenez2022visual} and NucleiSegNet \cite{lal2021nucleisegnet} with  $\mathcal{L}_\textbf{fd}$, as shown in Fig. \ref{fig:sota_plots}.

\noindent
{\bf Changes in \(\mathcal{L}_{\textbf{fd}}\) and Dice scores at the sample level.} In Figure~\ref{fig:ablation} (a), a trend between \(\mathcal{L}_{\textbf{fd}}\) and Dice is noted, with some samples exhibiting poor scores in both metrics. Figure~\ref{fig:ablation} (b) presents a frequency plot for \(\mathcal{L}_{\textbf{fd}}\) (orange) and Dice (blue). A shift in \(\mathcal{L}_{\textbf{fd}}\) to lower values and Dice scores to higher values is observed, indicating a significant improvement in Dice scores at the sample level. Supplementary Sec.~\ref{suppl-sec-C} likewise shows that the samples move towards a lower $\mathcal{L}_\textbf{fd}$ and a higher Dice score region in the $\mathcal{L}_\textbf{fd}$ vs Dice plot. We also validate that the experiments are statistically significant in Supplementary Sec \ref{suppl-sec-F}. 

\noindent
{\bf Performance comparison of various loss functions under noisy data.} Supplementary Sec \ref{suppl-sec-I} highlights the robustness of $\mathcal{L}_\textbf{fd}$ over other loss functions due to the disentanglement between the foreground-background features, ensuring robust discriminative features under noisy data conditions.

\vspace{-1em}
\section{Mitigating Data Distribution Shifts Under Assumed Exchangeability}
\label{data_dilemma}
\vspace{-5pt}

Recent work emphasizes expanding medical imaging datasets by pooling data from multiple sources \cite{chytas2024pooling,lokhande2022equivariance,roy2026beyond}. While early efforts apply invariant representation learning to handle covariate shifts, they often address only limited factors~\cite{akash2021learning}. The \textbf{Data Addition Dilemma} \cite{shen2024data} underscores a critical issue: increasing training data size across sources can induce distributional shifts that degrade model performance. Traditional methods based on independent and identically distributed (i.i.d.) assumptions fail in cross-dataset scenarios. \textit{Why is i.i.d.\ not realistic?} While the i.i.d. assumption is standard in most machine learning pipelines and often effective, it becomes overly restrictive in data addition scenarios. Exchangeability, being a weaker and more realistic assumption, better reflects the practical data generation process. For instance, in curating datasets like US-TNBC, new samples often depend on previously collected batches, violating i.i.d. but remaining consistent with exchangeability. We formalize exchangeability as a foundational assumption (Axiom 3, Supplementary Sec.~\ref{axiom:exchangeability}) ensuring that the joint distribution of multi-source data remains invariant under permutation. This weaker assumption provides a theoretically sound basis for pooling without introducing bias from distributional shifts, and is consistent with the causal mediation framework in Section 2.1.

This challenge arises when combining a novel dataset, $\mathcal{D}_\textbf{novel}$, with a base dataset, $\mathcal{D}_\textbf{base}$, as their joint use violates i.i.d. assumptions. To address this, we leverage \textbf{exchangeability}, which extends beyond i.i.d. by ensuring that the joint distribution remains invariant under index permutations (Definition 2). By treating $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}$ as exchangeable, we design a modified penalty loss function spanning both datasets. This ensures that discrepancies between foreground and background features across datasets remain comparable to within-dataset discrepancies, mitigating distributional shifts effectively. Algorithm 1 (Supplementary Sec \ref{suppl-sec-D}) outlines the training process using $ \mathcal{L}^\textbf{exch}_\textbf{fd}$.

%\vspace{-5pt}
\begin{mydef}
{\textbf{(Feature Discrepancy Loss under assumed exchangeability):}  
$F_{g}(\mathcal{D})$/ $B_{g}(\mathcal{D})$ represents foreground/background features from a randomly sampled dataset $\mathcal{D}$, which can be either $\mathcal{D}_\textbf{novel}$ or $\mathcal{D}_\textbf{base}$ dataset.
\vspace{-5pt}
\begin{align}
    \mathcal{L}^\textbf{exch}_\textbf{fd} =  
    -\log\Big( &\|F_{g}(\mathcal{D}_\textbf{base}) - B_{g}(\mathcal{D}_\textbf{novel})\|^2
    + \|F_{g}(\mathcal{D}_\textbf{novel}) - B_{g}(\mathcal{D}_\textbf{base})\|^2 \Big)
\end{align}}
\end{mydef}

\vspace{-1em}
\subsection{Experiments on Data Addition Dilemma}
\label{sec:data_addition_dilemma}
We selected TNBC as our base dataset, denoted as \(\mathcal{D}_\textbf{base}\), using the MoNuSeg dataset as our novel dataset, labeled \(\mathcal{D}_\textbf{novel}\). We added samples from MoNuSeg sequentially to \(\mathcal{D}_\textbf{base}\) (for example, in the first setup we use $\mathcal{D}_{\textbf{base}} + \frac{\mathcal{D}_{\textbf{novel}}}{16}$ while testing on $\mathcal{D}_{\textbf{base}}$, in the next setup we use $\mathcal{D}_{\textbf{base}} + \frac{\mathcal{D}_{\textbf{novel}}}{8}$ while testing on $\mathcal{D}_{\textbf{base}}$, and so on). All evaluations were performed on \(\mathcal{D}_\textbf{base}\). Similarly, for the ultrasound datasets, we designated US-TNBC as \(\mathcal{D}_\textbf{base}\) and UDIAT as \(\mathcal{D}_\textbf{novel}\), with samples from UDIAT added in batches of $15$ images. We compared three methods: a naive method without penalties, a method penalizing for \(\mathcal{L}_\textbf{fd}\), and a method penalizing for \(\mathcal{L}_\textbf{fd} + \mathcal{L}^\textbf{exch}_\textbf{fd}\). We compare these three losses with the existing losses that deal with disentanglement ($\mathcal{L}_{\textbf{con}}$\cite{chaitanya2020contrastive}) and layer-wise supervision ($\mathcal{L}_{\textbf{deeps}}$\cite{dou20163d}). Notably, the naive method, $\mathcal{L}_{\textbf{con}}$\cite{chaitanya2020contrastive} and $\mathcal{L}_{\textbf{deeps}}$\cite{dou20163d} exhibited a decrease in test set accuracy on \(\mathcal{D}_\textbf{base}\) as more samples from \(\mathcal{D}_\textbf{novel}\) were incorporated, consistent with the findings of \cite{shen2024data}. \(\mathcal{L}_\textbf{fd} + \mathcal{L}^\textbf{exch}_\textbf{fd}\) resulted in an overall performance improvement, as illustrated in Fig.~\ref{fig:Data_Dilemma}. While $\mathcal{L}^\textbf{exch}_\textbf{fd}$ shares conceptual similarity with local/pixel-wise contrastive losses \cite{chaitanya2020contrastive}, Fig.~\ref{fig:Data_Dilemma} shows contrastive loss ($\mathcal{L}_{\textbf{con}}$) suffers significant performance drops (7--19\%). This aligns with prior findings (e.g., Sec.~2 of \cite{akash2021learning}) showing contrastive losses require complex modifications to handle data addition and distribution shifts due to strong i.i.d. assumptions, making the weaker exchangeability assumption more realistic.

To rigorously explain the non-monotonic trends observed in Fig. \ref{fig:Data_Dilemma}, we quantify the distributional mismatch introduced at the data-addition stages. We compute the Kullback--Leibler (KL) divergence and Jensen--Shannon (JS) distance between $\mathcal{D}_\textbf{base}$ and the incrementally added subset of $\mathcal{D}_\textbf{novel}$. Images are converted to grayscale using the luminosity method, and pixel intensity distributions are estimated via normalized histograms. Metrics are computed separately for:
(i) foreground pixels (mask = 1),
(ii) background pixels (mask = 0), and
(iii) the overall image distribution.
A small $\epsilon$ is added to avoid numerical instability in KL computation.

\begin{table}[h]
\centering
\scriptsize
\caption{Distributional divergence between TNBC ($\mathcal{D}_\textbf{base}$) and MoNuSeg ($\mathcal{D}_\textbf{novel}$). FG and BG represent Foreground and Background respectively.}
\begin{tabular}{l|cc|cc|cc}
\toprule
\textbf{Addition Step} &
\textbf{FG KL} & \textbf{FG JS} &
\textbf{BG KL} & \textbf{BG JS} &
\textbf{Overall KL} & \textbf{Overall JS} \\
\midrule
$\mathcal{D}_\textbf{base}$ + $\mathcal{D}_\textbf{novel}/4$ &
0.6534 & 0.3390 &
0.7653 & 0.4034 &
0.7249 & 0.3844 \\
$\mathcal{D}_\textbf{base}$ + $\mathcal{D}_\textbf{novel}/2$ &
0.2125 & 0.1948 &
0.3301 & 0.2516 &
0.2995 & 0.2471 \\
$\mathcal{D}_\textbf{base}$ + $\mathcal{D}_\textbf{novel}$ &
0.2227 & 0.2173 &
0.3501 & 0.2792 &
0.3160 & 0.2622 \\
\bottomrule
\end{tabular}
\vspace{-1.5em}
\label{tab:data_addition_1}
\end{table}

\noindent
\textit{i) Table \ref{tab:data_addition_1} and Fig. \ref{fig:Data_Dilemma}:} As we can see, the distributional difference between $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}/4$ is high, which corresponds to the lower performance seen in all baselines. As more data is added, the distributional difference between $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}/2$ decreases, leading to a peak in performance. Furthermore, the distributional difference between $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}$ increases, which correlates with an overall dip in the performance of the baselines in that region. This explains the overall trend seen in Fig. \ref{fig:Data_Dilemma}.

\begin{table}[h]
\centering
\scriptsize
\caption{Distributional divergence between US-TNBC ($\mathcal{D}_\textbf{base}$) and UDIAT ($\mathcal{D}_\textbf{novel}$). FG and BG represent Foreground and Background respectively.}
\begin{tabular}{l|cc|cc|cc}
\toprule
\textbf{Addition Step} &
\textbf{FG KL} & \textbf{FG JS} &
\textbf{BG KL} & \textbf{BG JS} &
\textbf{Overall KL} & \textbf{Overall JS} \\
\midrule
$\mathcal{D}_\textbf{base}$ + $\mathcal{D}_\textbf{novel}/16$ &
0.0502 & 0.1083 &
0.0225 & 0.0612 &
0.0247 & 0.0662 \\
$\mathcal{D}_\textbf{base}$ + $\mathcal{D}_\textbf{novel}/8$ &
0.0502 & 0.1083 &
0.0225 & 0.0612 &
0.0247 & 0.0662 \\
$\mathcal{D}_\textbf{base}$ + $3\mathcal{D}_\textbf{novel}/16$ &
0.0110 & 0.0509 &
0.0019 & 0.0218 &
0.0022 & 0.0234 \\
$\mathcal{D}_\textbf{base}$ + $\mathcal{D}_\textbf{novel}/4$ &
0.0166 & 0.0628 &
0.0041 & 0.0310 &
0.0043 & 0.0318 \\
\bottomrule
\end{tabular}
\vspace{-1.5em}
\label{tab:data_addition_2}
\end{table}

\noindent
\textit{ii) Table \ref{tab:data_addition_2} and Fig. \ref{fig:Data_Dilemma}:} Similar to Table \ref{tab:data_addition_1}, the distributional difference between $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}/16$ is high, which corresponds to the lower performance seen in all baselines. The distributional difference remains the same for $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}/8$, while the slight increase in performance seen in some baselines is an indication of the greater availability of data. The distributional difference between $\mathcal{D}_\textbf{base}$ and $3\mathcal{D}_\textbf{novel}/16$, and between $\mathcal{D}_\textbf{base}$ and $\mathcal{D}_\textbf{novel}/4$, is significantly smaller than in the previous data addition regions, showing improved performance of the baselines (and thus the peaks). $\mathcal{L}_{con}$ is the worst-performing baseline for both data addition setups, as contrastive objectives inherently rely on data for performance and are unable to show trends or maintain comparable performance with respect to other baselines due to the data scarcity in medical imaging.

\vspace{-1em}
\section{Conclusion}
\label{sec:con}
Data scarcity is a major challenge in medical imaging. To address this, our research introduces a novel feature discrepancy penalty function ($\mathcal{L}_{fd}$) that enhances segmentation performance across modalities like histopathology and ultrasound. Our method outperforms existing models and baselines, showing improved Dice scores for both the worst-off and best-off samples. To tackle the lack of datasets for triple-negative breast cancer (TNBC), we introduced a new ultrasound dataset focused on TNBC. $\mathcal{L}_{fd}$ reduces erroneous activation maps, enabling models to focus on relevant spatial regions more effectively. This is particularly impactful in the ``Data Addition Dilemma'' scenario, where pooling data from multiple sources introduces distribution shifts that degrade model performance. A modified version, $\mathcal{L}^{exch}_{fd}$, incorporates feature exchangeability to mitigate these shifts.

%\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{Prof. Lokhande acknowledges support from University at Buffalo startup funds, an Adobe Research Gift, an NVIDIA Academic Grant, and the National Center for Advancing Translational Sciences of the NIH (award UM1TR005296 to the University at Buffalo). Prof. Kim acknowledges support from RS-2019-II191906 (Graduate School of AI at POSTECH).}

\bibliography{midl26_357}

\appendix

\section{Related Works}
\label{sec:related_works}
Recent advances in deep learning have surpassed traditional segmentation methods like watersheds \cite{beucher2018morphological} and super-pixels \cite{li2015superpixel}, demonstrating high efficacy in medical image segmentation \cite{long2015fully,ronneberger2015u,chen2014semantic,singha2023alexsegnet,keaton2023celltranspose,kanadath2023multilevel}. While MCFNet \cite{Feng_2021_ICCV} captures spatial information, it struggles with complex staining patterns. Multimodal approaches \cite{dwivedi2022multi,10635449,chen2021multimodal,tomar2022tganet,zhao2024dtan,roy2024gru} integrate spatial and textual data but face challenges with homogeneous pixel distributions in medical images. 

Loss-based approaches like contrastive loss \cite{chaitanya2020contrastive,xu2021neutral}, deep supervision \cite{dou20163d}, and entropy minimization \cite{fleuret2021uncertainty} improve U-Net \cite{ronneberger2015u} segmentation by refining foreground and background representations. Entropy minimization and contrastive losses separate pixel-level class representations. Our method also uses foreground-background feature discrepancy but penalizes all U-Net layers. Deep supervision applies binary cross-entropy loss to decoder outputs without contrastive losses. Approaches like \cite{he2021consistent,gong2024segmentation} enhance class consistency and feature re-ranking, but none penalize feature discrepancy across all layers. Our work is the first to show that layer-wise foreground-background discrepancy improves U-Net \cite{ronneberger2015u} representations, addressing the ``data addition dilemma'' \cite{shen2024data}. In addition, all the aforementioned losses are based on the strong i.i.d. assumption, which does not always hold, as discussed in Sections 1 and 5. \textit{Furthermore, segmentation tasks involve assigning a label to each pixel (or region) in an image, resulting in a structured output with strong spatial dependencies among pixels or regions, unlike classification that assigns a single label to the entire image (treating each sample as an independent entity).} In segmentation, the data points (pixels or regions) within an image are not independent; their labels are often correlated due to spatial continuity and object boundaries. Therefore, the i.i.d. assumption is often violated, making exchangeability, a weaker and more realistic assumption, more appropriate.

SAM \cite{kirillov2023segment}, though powerful, is unsuitable for medical images due to its reliance on prompts and inability to handle numerous objects of interest without guidance \cite{mazurowski2023segment}. Large models like Transformers require extensive data, which is scarce in medical imaging \cite{he2023computer}, and fine-tuning pre-trained models can introduce modality biases \cite{barnett2024fine,kumar2022fine} and unobserved task-irrelevant confounders (U in Fig. \ref{fig:intro})\cite{zhang2024rethinking}. Furthermore, Transformer architectures like TransUNet \cite{chen2021transunet} and SwinUNet \cite{cao2022swin} suffer from oversegmentation and are unable to determine the foreground structure accurately compared to their UNet counterparts \cite{zhao2024comparison} (see Supplementary Sec \ref{suppl-sec-E} for a comparison of the transformer architectures with our method). These challenges explain why U-Net variants remain the most widely adopted segmentation models in medical imaging \cite{yousef2023u,ahmadi2023comparative}.\footnote{We clarify that approaches like SIFA \cite{chen2019synergistic} were not included in the current study because \textit{they address a different problem setting.} Methods like SIFA explicitly assume a source-target paradigm with known domain labels and rely on adversarial feature alignment to match global distributions. In contrast, our work focuses on data-scarce pooling scenarios where datasets are incrementally added, domain boundaries may be ambiguous, and no source-target distinction or domain labels are assumed. \textit{Forcing such methods into our setting would require introducing artificial source-target splits and domain labels, fundamentally altering the problem formulation and leading to an unfair comparison.}}

\section{The segmentation loss}
\label{suppl-sec-A}

The Dice loss \cite{soomro2018strided} and Binary Cross Entropy (BCE) loss \cite{jadon2020survey} are crucial for image segmentation tasks, evaluating model performance by comparing predicted and actual masks. The Dice loss ($L_{dice}$) and the BCE loss ($L_{bce}$) are defined in Eqs.~\ref{eq:dice} and~\ref{eq:bce}, respectively, where $y_{ijk}$ represents the ground truth label for pixel $(i, j, k)$, $\tilde{y}_{ijk}$ represents the predicted probability for pixel $(i, j, k)$, $\epsilon$ is a small constant added for numerical stability to avoid division by zero or taking the log of zero, and $N$ is the total number of pixels.
\begin{equation}
\label{eq:dice}
    L_{dice} = 1 - \frac{2 \sum_{i,j,k} y_{ijk} \cdot \tilde{y}_{ijk} + \epsilon}{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \tilde{y}_{ijk} + \epsilon}
\end{equation} 

\begin{equation}
\label{eq:bce}
\begin{aligned}
    L_{bce} = - \frac{1}{N} \sum_{i,j,k} \Big( &y_{ijk} \cdot \log(\tilde{y}_{ijk} + \epsilon) \\
    &+ (1 - y_{ijk}) \cdot \log(1 - \tilde{y}_{ijk} + \epsilon) \Big)
\end{aligned}
\end{equation}

We use a linear combination of $L_{dice}$ and $L_{bce}$ as $L_{seg}$ \cite{roy2024eu}, as shown in Eq.~\ref{eq:seg}.

\begin{equation}
\label{eq:seg}
    L_{seg} = L_{dice}+L_{bce}
\end{equation}

\section{Proofs}

\subsection{Lemma 1}
\label{suppl-sec-B1}

\textbf{Lemma 1.} Relationship between feature discrepancy loss $\mathcal{L}_\textbf{fd}$, segmentation Dice score, and constant $k$ for feature vector $F$ derived from image $X$:
\begin{align}
-\log(Dice \times (k + 1)) \leq \mathcal{L}_\textbf{fd}
\end{align}

\begin{proof}
% \small
Let \(\otimes\) denote element-wise multiplication. Then we get the relation between Dice score, the ground truth segmentation mask $y$, and the predicted segmentation mask $\tilde{y}$ as:

\begin{align}
\label{eq:y_pred}
\sum_{i,j,k} \tilde{y}_{ijk} = \frac{Dice}{2} \times \frac{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \tilde{y}_{ijk}}{\sum_{i,j,k} y_{ijk}} 
\quad \nonumber \\
\text{(since \(Dice = \frac{2 \sum_{i,j,k} y_{ijk} \cdot \tilde{y}_{ijk} + \epsilon}{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \tilde{y}_{ijk} + \epsilon}\))}
\end{align}

Now, simplifying FD (feature discrepancy) using Definition 1, we get:

\begin{align}
FD = \frac{\| \sum_{k} \left( \sum_{i,j} F_{i,j,k} \otimes \tilde{y}_{i,j,k} - \sum_{i,j} F_{i,j,k} \otimes (1 - \tilde{y}_{i,j,k}) \right) \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}}
\end{align}

\begin{align}
FD \leq \frac{\| 2 \sum_{i,j,k} F_{i,j,k} \otimes \tilde{y}_{i,j,k} \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}} + \frac{\| \sum_{i,j,k} F_{i,j,k} \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}}
\quad \nonumber \\
\text{(using the triangle inequality).}
\end{align}

Now, using Eq. \ref{eq:y_pred} we get:

\begin{align}
FD - 1 \leq \frac{\| \sum_{i,j,k} F_{i,j,k} \otimes Dice \times \frac{\sum_{i,j,k} y_{ijk} + \sum_{i,j,k} \tilde{y}_{ijk}}{\sum_{i,j,k} y_{ijk}} \|_2}{\| \sum_{i,j,k} F_{ijk} \|_{2}}
\end{align}

Since \(\sum_{i,j,k} \tilde{y}_{ijk}\) and \(\sum_{i,j,k} y_{ijk}\) are constants during testing, let \(\frac{\sum_{i,j,k} \tilde{y}_{ijk}}{\sum_{i,j,k} y_{ijk}} = k\):  

\begin{align}
-\log(FD) \geq -\log(Dice \times (k + 1)) 
\quad \nonumber \\
\text{(Taking \(-\log\) on both sides).}
\end{align}

\begin{align}
\mathcal{L}_\textbf{fd} \geq -\log(Dice \times (k + 1))
\end{align}

This completes the proof.
\end{proof}

\subsection{Lemma 2}
\label{suppl-sec-B2}

\textbf{Lemma 2. (Weight Norm Bound via Feature Discrepancy Loss):}
Let $W \in \mathbb{R}^{d \times d}$ denote the weight matrix of a UNet layer producing features $F = W \otimes x$, where $x \in \mathbb{R}^{d \times d}$ is the input to that layer. The relationship between $\mathcal{L}_\textbf{fd}$ and $W$ is given by:

\begin{align}
\mathcal{L}_{\text{fd}} = -\log\left( ||W \otimes (x_g - x_b)||_2^2 \right)
\end{align}

where $x_g$ and $x_b$ are foreground and background features of $x$, respectively. Minimizing $\mathcal{L}_{\text{fd}}$ implicitly enforces an upper bound on the spectral norm $||W||_2$, reducing the layer’s Lipschitz constant and improving generalization.

\begin{proof}
% \small
Let $\Delta x = x_g - x_b$ denote the inherent foreground-background separation in the input space. The loss $\mathcal{L}_{\text{fd}}$ incentivizes maximizing $||W \otimes \Delta x||_2^2$ where $\otimes$ is the Hadamard product. Now we can frame the Hadamard product in a different way to represent $W \otimes \Delta x$ as $W_{exp} \times \Delta x_{exp}$ where $\times$ is matrix multiplication, $W_{exp} \in \mathbb{R}^{d^2 \times d^2}$ is a diagonalized form of $W$ and $x_{exp} \in \mathbb{R}^{d^2 \times 1}$ is a reshaped form of $x$. The Lipschitz constant $L$ of the layer $F = W_{exp} \times x_{exp}$ is the spectral norm of $W_{exp}$:

\begin{align}
L = ||W_{exp}||_2 = \sup_{||x_{exp}||_2 = 1} ||W_{exp} \times x_{exp}||_2
\end{align}

This measures the maximum amplification of the input by $W_{exp}$. To minimize $\mathcal{L}_{\text{fd}}$, the optimization ensures $||W_{exp} \times \Delta x_{exp}||_2^2 \geq \gamma$ for some $\gamma > 0$. By Cauchy-Schwarz:

\begin{align}
||W_{exp} \times \Delta x_{exp}||_2 \leq ||W_{exp}||_2 ||\Delta x_{exp}||_2.
\end{align}

Squaring both sides:

\begin{align}
\gamma \leq ||W_{exp} \times \Delta x_{exp}||_2^2 \leq ||W_{exp}||_2^2 ||\Delta x_{exp}||_2^2 \\ \implies ||W_{exp}||_2 \geq \frac{\sqrt{\gamma}}{||\Delta x_{exp}||_2}.
\end{align}

Thus, $\gamma$ defines the \textit{minimum required separation} between foreground and background features.

The gradient of $\mathcal{L}_{\text{fd}}$ with respect to $W$ is:

\begin{align}
\nabla_W \mathcal{L}_{\text{fd}} = -\frac{2}{||W \otimes \Delta x||_2^2} (W \otimes \Delta x)(\Delta x)^T.
\end{align}

The term $\frac{1}{||W \otimes \Delta x||_2^2}$ acts as an \textit{adaptive damping factor}: as $||W \otimes \Delta x||_2^2$ increases (better separation), the gradient magnitude decreases. This prevents $W$ from growing excessively to inflate separation artificially, thereby bounding $||W||_2$ and thus $||W_{exp}||_2$ (since $W_{exp}$ is a diagonalized form of $W$).

The network achieves $||W_{exp} \times \Delta x_{exp}||_2^2 \geq \gamma$ with the smallest possible $||W||_2$ (due to gradient damping), ensuring a \textit{lower-variance model} (reduced sensitivity to input perturbations) and preventing overfitting. Furthermore, the Lipschitz constant $L$ is also reduced, indicating \textit{tighter generalization bounds} (the suboptimal error bound, $\mathcal{E}_{\text{gen}} \propto L$ as seen in Axiom 2).
\end{proof}

\subsection{Axiom 3}
\label{axiom:exchangeability}

\begin{axiom}[Exchangeability of Pooled Data]
Let $\{ (X_i, Y_i, S_i) \}_{i=1}^n$ denote a collection of data points from multiple sources, where $S_i$ indicates the source (e.g., scanner or site). The sequence is \emph{exchangeable} if for any permutation $\pi$ of the indices $\{1,\dots,n\}$, the joint distribution satisfies:
\begin{align}
P((X_1, Y_1, S_1), \dots, (X_n, Y_n, S_n)) = P((X_{\pi(1)}, Y_{\pi(1)}, S_{\pi(1)}), \dots, (X_{\pi(n)}, Y_{\pi(n)}, S_{\pi(n)})).
\end{align}
This implies that the order of samples carries no information about their joint distribution, even when $S_i$ induces dependence.
\end{axiom}

\paragraph{Proof Sketch.}
Exchangeability is a standard assumption in Bayesian and causal inference (e.g., de Finetti's theorem). In our setting, it is justified by the causal graph in Fig.~2c: the confounders $U$ influence both $X$ and $Y$, but are conditionally independent of the source $S$ given the mediator $Z$. Under the front-door adjustment via $Z$, the causal effect $X \rightarrow Y$ is identifiable and invariant across sources. Consequently, permuting the source labels does not change the joint distribution of $(X, Y, Z)$, satisfying exchangeability. This provides the theoretical foundation for the exchangeable feature discrepancy loss $\mathcal{L}_{\text{fd}}^{\text{exch}}$ (Definition 2), which treats foreground/background features from $\mathcal{D}_{\text{base}}$ and $\mathcal{D}_{\text{novel}}$ as interchangeable, thereby mitigating distributional shifts.

\section{Qualitative analysis of Dice vs Feature discrepancy loss}
\label{suppl-sec-C}

Figure \ref{fig:movement} shows the improvement in the Dice scores of the samples with a decrease in $\mathcal{L}_\textbf{fd}$. As we can see in the Dice vs $\mathcal{L}_\textbf{fd}$ plot, as Dice score improves, $\mathcal{L}_\textbf{fd}$ decreases, and the points move to the top left corner of the plot. The green arrows signify the movement of each sample (each test image) after the use of $\mathcal{L}_\textbf{fd}$. The red arrow indicates an overall movement of the samples highlighting the flow towards the top left corner (increase in Dice and decrease in $\mathcal{L}_\textbf{fd}$). This signifies the importance of foreground and background feature disentanglement to ensure robust medical image segmentation under a data-scarce setting with complex backgrounds.

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.8\linewidth, keepaspectratio]{images/Movement.png}
    \vspace{-20pt}
    \caption{\textbf{The change in Dice scores with change in $\mathcal{L}_\textbf{fd}$.} Dice score is plotted against $\mathcal{L}_\textbf{fd}$ for samples of TNBC \cite{naylor2018segmentation} and US-TNBC, using the Bottleneck (Bot) layer of NucleiSegNet \cite{lal2021nucleisegnet} and CMUNet \cite{10230609}, respectively. The green arrows indicate the movement of each point after the use of $\mathcal{L}_\textbf{fd}$. The red arrow indicates the overall movement of the majority of the samples.}
    \vspace{-10pt}
    \label{fig:movement}
\end{figure}

\section{Algorithmic explanation of exchangeable Feature discrepancy loss}
\label{suppl-sec-D}

The algorithmic explanation of $\mathcal{L}^\textbf{exch}_\textbf{fd}$ for each iteration of training can be seen in Algorithm~\ref{algo}. Specifically, the foreground feature of image $i$ ($F_{g,i}$) pushes the background feature of image $j$ ($B_{g,j}$) in $\mathcal{L}^\textbf{exch}_\textbf{fd}$, while $F_{g,j}$ simultaneously pushes $B_{g,j}$ in $\mathcal{L}_\text{fd}$. This draws $F_{g,i}$ and $F_{g,j}$ closer, minimizing the distributional shift caused by differences in batch data sources.

% \begin{algorithm}[!h]
%     \small
%     \caption{$\mathcal{L}^\textbf{exch}_\textbf{fd}$ explained in Section 5}
%     \label{algo}
%     \begin{algorithmic}[1]
%         \State \textbf{Input:} Foreground features $F_{g}$ and background features $B_{g}$ for each image $i$ in a batch of size $n$
%         \For{each training iteration}
%             \For{$i \gets 1$ to $n$}
%                 \State $\mathcal{L}_\text{fd} = -\log(\| F_{g,i} - B_{g,i} \|_2)$ 
%                 \Comment{Penalizing feature distance of foreground and background for the same image}
                
%                 \State $\mathcal{L}^\textbf{exch}_\textbf{fd} = -\log(\| F_{g,i} - B_{g,i+k} \|_2)$ 
%                 \Comment{Where $k$ is arbitrary and introduced after shuffling $F_{g}$ and $B_{g}$ to ensure $F_{g,i}$ and $F_{g,j}$ are closer by repelling $B_{g,j}$}
                
%                 \State $L_i = \mathcal{L}_\text{fd} + \mathcal{L}^\textbf{exch}_\textbf{fd}$
%             \EndFor
%             \State $\text{loss} \gets \frac{1}{n} \sum_{i=1}^{n} \alpha \times L_i$
%         \EndFor
%         \State \textbf{Return:} $\text{loss}$
%     \end{algorithmic}
% \end{algorithm}
\begin{algorithm}[!h]
\small
\caption{$\mathcal{L}^{\textbf{exch}}_{\textbf{fd}}$ explained in Sec \ref{sec:data_addition_dilemma}}
\label{algo}

\KwIn{Foreground features $F_g$ and background features $B_g$ for each image in a batch of size $n$}

\For{each training iteration}{
    \For{$i = 1$ \KwTo $n$}{
        $\mathcal{L}_\text{fd} = -\log(\| F_{g,i} - B_{g,i} \|_2)$\;
        
        $\mathcal{L}^{\textbf{exch}}_\textbf{fd} = -\log(\| F_{g,i} - B_{g,i+k} \|_2)$\;
        
        $L_i = \mathcal{L}_\text{fd} + \mathcal{L}^{\textbf{exch}}_\textbf{fd}$\;
    }
    $\text{loss} \gets \frac{1}{n} \sum_{i=1}^{n} \alpha L_i$\;
}

\Return{$\text{loss}$}
\end{algorithm}

\section{Comparison with Transformer architectures}
\label{suppl-sec-E}

\begin{table}[htb]
    \centering
    \caption{Comparison with Transformer architectures. Scores are in \%.}
    \begin{tabular}{lllll}
        \hline
        \multirow{2}{*}{\textbf{Model}} & \multicolumn{2}{c}{\textbf{TNBC}} & \multicolumn{2}{c}{\textbf{MoNuSeg}} \\
        \cline{2-5}
        & \textbf{Dice} & \textbf{IoU} & \textbf{Dice} & \textbf{IoU} \\
        \hline
        TransUNet \cite{chen2021transunet} & 76.50 & 72.04 & 77.46 & 63.85 \\
        SwinUNet \cite{cao2022swin} & 60.08 & 50.14 & 76.38 & 62.54 \\
        Ours & 82.65 & 70.58 & 81.69 & 68.65 \\
        \hline
    \end{tabular}
    \label{tab:transformer}
\end{table}

As seen in Table \ref{tab:transformer}, Transformer architectures are data-hungry models and are not well suited to data-scarce medical image segmentation tasks. We see that TransUNet and SwinUNet have lower Dice and IoU scores even with respect to the base UNet models (see Table 2). This shows that data-hungry Transformer architectures are not an ideal choice for medical domains with data scarcity.

\section{Significance testing} 
\label{suppl-sec-F}

\begin{table}[t]
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\renewcommand\theadfont{\bfseries}
\begin{tabular}{@{}lcc@{\hspace{1.5em}}lcc@{}}
\toprule
\multicolumn{3}{c}{\textbf{MoNuSeg}} & \multicolumn{3}{c}{\textbf{US-TNBC}} \\
\cmidrule(r){1-3} \cmidrule(l){4-6}
\textbf{Model} & \textbf{Dice} & \textbf{IoU} & \textbf{Model} & \textbf{Dice} & \textbf{IoU} \\
\midrule
NuSegNet & 0.0015 & 0.0032 & CMUNet & 0.0045 & 0.0051 \\
AttnUNet & 0.0019 & 0.0007 & & & \\
\bottomrule
\end{tabular}
\caption{Statistical significance ($p$-values).}
\label{tab:pvalues}
\end{table}

To rigorously evaluate the impact of $\mathcal{L}_{\text{fd}}$, we conducted a statistical significance test. The baseline model (for example, CMUNet without $\mathcal{L}_{\text{fd}}$) and five runs of the baseline model with $\mathcal{L}_{\text{fd}}$ were used for a one-sample t-test. It can be seen from Table \ref{tab:pvalues} that all $p$-values are $< 0.01$, indicating that the improvements are statistically significant.

\section{Details about the US-TNBC dataset} 
\label{suppl-sec-G}

\begin{figure}%[ht]
    \centering
    \includegraphics[width=\linewidth, keepaspectratio]{images/Data_collection.png}
    \caption{The steps involved in the creation of the US-TNBC dataset.} 
    \label{fig:data_collection}
\end{figure}

The US-TNBC dataset focuses on Triple-Negative Breast Cancer tissues. The images are $721 \times 570$ pixels in size on average. It consists of 30 images: 15 ultrasound images and 15 corresponding ground truth images. The data collected at baseline includes breast ultrasound images of women aged between 42 and 76 years old. This data was collected between 2022 and 2023, and the images are in PNG format. %The data was collected in collaboration with Roswell Park Comprehensive Cancer Center, following approved protocols of both Roswell Park Comprehensive Cancer Center and the University at Buffalo. 
To make the acquired data useful, some refinement tasks were performed. Firstly, the DICOM images were loaded into a DICOM reader, and the tumor images without marking or annotation were selected. Next, the DICOM files were converted into PNG format. The patient information was also eliminated using image cropping software. The images were cropped to retain maximum anatomical information while removing unnecessary boundaries and markers. The ground truth images were generated using Fiji, an open-source image processing program based on ImageJ2. The ground truth masks were produced and then inverted to match the UDIAT dataset mask convention, where the tumor masks are white and the background is black. This dataset is designed to evaluate algorithms for cancer detection, grading, and classification. The steps involved in the collection of the US-TNBC dataset are shown in Fig. \ref{fig:data_collection}. The dataset request link will be made accessible along with the code.

\section{Experimental Setup} 
\label{sec:setup}
We conduct experiments using four datasets: the TNBC dataset \cite{naylor2018segmentation} with histopathology images featuring dense glandular tissues and indistinct boundaries; the MoNuSeg dataset \cite{kumar2019multi}, which includes Hematoxylin and Eosin-stained histopathology images; a novel US-TNBC dataset comprising 15 ultrasound images of TNBC tissues collected in 2022--23, with ground truth masks generated using Fiji (more details in Supplementary Sec.~\ref{suppl-sec-G}); and the UDIAT dataset \cite{yap2017automated}, which includes breast ultrasound images characterized by irregular tumor morphology and indistinct boundaries. Additionally, we evaluate an Alzheimer's histopathology dataset \cite{jimenez2022visual} for tau protein segmentation, with AD $256\times256$ and AD $128\times128$ versions, where the former contains a more complex background. Causal mediation and control of $\mathcal{L}_\textbf{fd}$ are independent of the neural network architecture.

\begin{table}[!t]
    \centering
    \scriptsize
    %\setlength{\tabcolsep}{6pt}  % Adjust column spacing
    %\renewcommand{\arraystretch}{1.1}  % Adjust row spacing
    \begin{tabular}{cccba} 
        \toprule
        \textbf{Dataset} & \textbf{Data Type} & \textbf{All Samples} & \textbf{Worst Off} & \textbf{Threshold} \\
        \midrule
        TNBC \cite{naylor2018segmentation} & Histopathology & 50 & 10 & 75.0 \\
        MoNuSeg \cite{kumar2019multi} & Histopathology & 44 & 25 & 70.0 \\
        UDIAT \cite{yap2017automated} & Ultrasound & 163 & 35 & 25.5 \\
        US-TNBC & Ultrasound & 15 & 10 & 65.5 \\
        AD \cite{jimenez2022visual} & Histopathology & 10k & 500 & 40.0 \\
        \bottomrule
    \end{tabular}
    \caption{\footnotesize \textbf{Summary of datasets.} ``All Samples'' denotes all the test samples of the dataset, whereas ``Worst Off'' denotes the test samples with lower Dice scores. Threshold is an approximate estimate of the Dice score below which the ``Worst Off'' samples lie.}
    \vspace{-20pt}
    \label{dataset_summary}
\end{table}

\section{Implementation details}
\label{suppl-sec-H}

We developed our segmentation model using Python and implemented it with the TensorFlow and Keras libraries. For data processing, we utilized numpy, OpenCV, and scikit-learn, enabling efficient data handling. We have used the high-performance NVIDIA TESLA P100 GPU to accelerate training and leverage hardware acceleration. The model has been trained for 100 epochs in the initial phase ($\alpha=0$) and 75 epochs in the second phase with $\mathcal{L}_\text{fd}$ ($\alpha \neq 0$). It has been seen that this specific initialization of $\alpha$ produces the best results as compared to other initialization values. This can be intuitively explained as the model learns to produce decent segmentation masks using the traditional $L_{seg}$ and a regularization by $\mathcal{L}_\text{fd}$ utilizes the prior knowledge gained by the model in the 75 epochs to refine the feature maps. \footnote{\textit{We set $\alpha = 75$ to allow the network to first learn stable and semantically meaningful foreground–background priors from the base data before introducing $\mathcal{L}_{fd}$.} Empirically, $\alpha>75$ does not yield further performance gains, indicating that these priors and the corresponding feature representations have already converged, and that delaying the regularizer further provides no additional benefit. In contrast, applying $\mathcal{L}_{fd}$ too early ($\alpha < 75$) leads to suboptimal performance, as foreground–background priors are not yet sufficiently formed; enforcing discrepancy at this stage acts on unstable features and introduces improper constraints. This behavior highlights the importance of activating $\mathcal{L}_{fd}$ only after reliable foreground–background priors have emerged.} This effectively achieves a better Dice score (+1.5-1.9\% across all datasets) than training the model with $\mathcal{L}_\text{fd}$ from the beginning. A train-test-validation split of 70-20-10\% has been applied. 
Callbacks were used to save the best-performing model during both training phases. To address non-uniform image sizes, all images have been resized to uniform $512\times512$ pixels for TNBC \cite{naylor2018segmentation}, the newly collected US-TNBC, and $256\times256$ for UDIAT \cite{yap2017automated} and AD \cite{jimenez2022visual} (both $256\times256$ and $128\times128$). We have applied data augmentation (horizontal and vertical flipping, rotations to the left and right by $90^{\circ}$) on the training set to train the models and on the test set to increase the number of data points for the plots. Evaluation of the models has been done on the test set without augmentation. Further details will be available with the code.

\section{Performance comparison} 
\label{suppl-sec-I}

\begin{figure*}%[ht]
    \centering
    \includegraphics[width=0.95\linewidth, keepaspectratio]{images/Qualitative.png}
    \vspace{-15pt}
    \caption{\footnotesize \textbf{Qualitative analysis of histopathology and ultrasound segmentation.} Results for (a) TNBC, (b) MoNuSeg, (c) AD 128, and (d) AD 256 datasets of NucleiSegNet, and for (e) US-TNBC and (f) UDIAT datasets of CMUNet, with (\textcolor{green}{green}) and without $\mathcal{L}_\textbf{fd}$. \textcolor{red}{Red} boxes highlight faulty segmentation without $\mathcal{L}_\textbf{fd}$, while \textcolor{green}{green} boxes show improvements with $\mathcal{L}_\textbf{fd}$.} 
    \vspace{-20pt}
    \label{fig:Qualitative}
\end{figure*}

\begin{figure*}[!t]
    \centering
    \includegraphics[width=\linewidth, keepaspectratio]{images/SOTA.png}
    \vspace{-15pt}
    \caption{\footnotesize \textbf{Comparison of the proposed method with existing models for various datasets.} We compare our model with MedT~\cite{valanarasu2021medical}, HistoSeg~\cite{wazir2022histoseg}, SPPNet~\cite{xu2023sppnet}, D-Net~\cite{islam2023densely}, MMPSO-S~\cite{kanadath2023multilevel}, TSCA-Net~\cite{fu2024tsca}, GRUNet \cite{roy2024gru} and AWGUNet \cite{10635449} for MoNuSeg \cite{kumar2019multi}, AWGUNet \cite{10635449}, GRUNet \cite{roy2024gru}, MCFNet \cite{Feng_2021_ICCV},  Deep-Fuzz \cite{das2023deep}, CellTrp \cite{keaton2023celltranspose}, ASNet \cite{singha2023alexsegnet}, MMPSO-S \cite{kanadath2023multilevel} for TNBC\cite{naylor2018segmentation}, and UNet \cite{ronneberger2015u}, SCAN \cite{zhang2020attentionbased}, STAN \cite{shareef2020stan}, RRC-Net \cite{chen2023rrcnet}, $EU^2Net$ \cite{roy2024eu}, CE-Net \cite{gu2019net}, and DAUNet \cite{pramanik2024dau} for UDIAT\cite{yap2017automated}. For AD 128 and 256, we compare our proposed method with UNet \cite{jimenez2022visual}, AttnUNet \cite{jimenez2022visual}, and NuSegNet \cite{tomar2022transresu}. Our proposed loss improves the Dice score of the best-performing architecture for AD 128 (NuSegNet \cite{tomar2022transresu}) and AD 256 (AttnUNet \cite{jimenez2022visual}) by 1.74 and 3.55, respectively.}
    \vspace{-15pt}
    \label{fig:sota_plots}
\end{figure*}

\begin{figure}[!t]
    \centering
    %\vspace{-8pt}
    \includegraphics[width=\linewidth, keepaspectratio]{images/Heatmaps.png}
    \vspace{-20pt}
    \caption{\footnotesize \textbf{Heatmaps with and without the use of $\mathcal{L}_{fd}$.} Heatmaps of the bottleneck layer (Bot) and the last decoder layer (Dec) of NucleiSegNet and CMUNet for AD and US-TNBC respectively. With $\mathcal{L}_{fd}$ (\textcolor{green}{green}), the erroneous activation maps are reduced, leading to better prediction.}
    \vspace{-10pt}
    \label{fig:Heatmap}
\end{figure}

Fig. \ref{fig:sota_plots} illustrates the performance of the state-of-the-art methods compared to ours. It is evident that the application of $\mathcal{L}_{fd}$ provides the necessary boost to surpass the performance of the current methods. This also translates to the qualitative analysis as seen in Fig. \ref{fig:Qualitative}. Heatmaps seen in Fig. \ref{fig:Heatmap} indicate that the refinement in the focus of the model's convolution layers after the application of $\mathcal{L}_\text{fd}$ leads to a finer predicted segmentation mask with fewer non-overlapping regions with respect to the ground truth.

To evaluate model robustness under noisy conditions, 
Gaussian noise was systematically added to the images. For each image $I \in \mathbb{R}^{H \times W \times C}$ in the dataset, 
zero-mean Gaussian noise $\epsilon$ with standard deviation $\sigma$ was sampled: 
\begin{equation}
\epsilon \sim \mathcal{N}(0,\, \sigma^2)
\end{equation}
The noisy image $\tilde{I}$ was then computed as:
\begin{equation}
\tilde{I} = \mathrm{clip}(I + \epsilon,\, 0,\, 1)
\end{equation}
where $\sigma$ was varied across experiments ($\sigma \in \{0.05, 0.10, 0.15, 0.20\}$) 
and $\mathrm{clip}(\cdot)$ ensured valid pixel intensities. This process maintained original data dimensions while simulating realistic sensor noise artifacts. In this experiment, we see that the proposed $\mathcal{L}_\textbf{fd}$ shows a smaller dip in performance compared to $L_{con}$, $L_{deeps}$ and $L_{seg}$ (without $\mathcal{L}_\textbf{fd}$, i.e., a combination of Dice loss and BCE loss). This indicates that foreground-background feature disentanglement ensures robust feature extraction even for noisy/poor-quality images.

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.8\linewidth, keepaspectratio]{images/Noise_Analysis.png}
    \caption{Analysis of the loss functions in the presence of noise added to input images while training. Dice score dips are less for $\mathcal{L}_\textbf{fd}$ as compared to other losses as the strength of the noise increases.} 
    \label{fig:noise_analysis}
\end{figure}

\section{Extension to Multi-class settings}
\label{sec:multiclass}

To demonstrate the effectiveness of $\mathcal{L}_{fd}$, we evaluate our method on the CoNSeP dataset \cite{graham2019hover}, a standard benchmark for \emph{multi-class nuclei instance segmentation}, which contains multiple semantic nucleus categories annotated at the instance level. Following common practice in nuclei segmentation, the multi-class problem is decomposed into multiple binary foreground–background segmentation tasks, one per nucleus class (e.g., inflammatory, epithelial, spindle-shaped, etc.). Each binary task predicts a class-specific foreground mask against a shared background, and the final multi-class segmentation is obtained by aggregating class-wise predictions. This strategy is widely adopted in prior work, as it allows class-specific feature learning while preserving a consistent background representation. $\mathcal{L}_{fd}$ operates at the level of \emph{foreground vs.\ background feature separation} and does not assume a single foreground category. In the multi-class setting, $\mathcal{L}_{fd}$ is applied independently to each class-specific binary segmentation head, encouraging robust and disentangled foreground–background representations for every nucleus type. As a result, the method scales linearly with the number of classes and does not require any modification to the loss formulation. All experiments use a standard U-Net architecture following the nnU-Net training configuration, with no architectural changes. We report class-wise and overall performance on the CoNSeP \cite{graham2019hover} validation/test split using standard nuclei segmentation metrics. Class-wise metrics are computed independently for each nucleus category, and overall performance is obtained by averaging across classes. 
Table \ref{tab:consep_multiclass} confirms that $\mathcal{L}_{fd}$ is not restricted to binary segmentation, but applies directly to multi-class tasks through standard binary decomposition, while retaining its benefits in disentangling foreground–background representations under distribution shift. We will include these results in the supplementary of the camera-ready version.

\begin{table}[t]
\centering
\scriptsize
\caption{Multi-class nuclei segmentation results (Dice score) on CoNSeP.}
\begin{tabular}{lccccc}
\toprule
\textbf{Method} &
\textbf{Background} &
\textbf{Other} &
\textbf{Inflammatory} &
\textbf{Healthy Epithelial} &
\textbf{Malignant Epithelial} \\
\midrule
UNet & 91.94 & 0.87 & 37.03 & 16.37 & 58.67 \\
UNet+$\mathcal{L}_{fd}$ & 91.77 & 1.82 & 34.59 & 32.93 & 59.53 \\
\bottomrule
\end{tabular}

\vspace{3pt} % small gap between parts

\begin{tabular}{lccc}
\toprule
\textbf{Method} &
\textbf{Fibroblast} &
\textbf{Muscle} &
\textbf{Endothelial} \\
\midrule
UNet & 18.73 & 25.41 & 0.00 \\
UNet+$\mathcal{L}_{fd}$ & 28.18 & 36.48 & 0.00 \\
\bottomrule
\end{tabular}
\label{tab:consep_multiclass}
\end{table}

\end{document}
